Re: [PATCH v3 1/3] pidfd: allow pidfd_open() on non-thread-group leaders

From: Oleg Nesterov
Date: Thu Jan 25 2024 - 09:10:04 EST


Add Eric.

On 01/23, Tycho Andersen wrote:
>
> On Tue, Jan 23, 2024 at 08:56:08PM +0100, Oleg Nesterov wrote:
>
> > So. When do we want to do do_notify_pidfd() ? When the task (leader or not)
> > becomes a zombie (passes exit_notify) or when it is reaped by release_task?
>
> It seems like we'd want it when exit_notify() is called in principle,
> since that's when the pid actually dies.

No, the pid "dies" after this task is reaped; until then its nr is still
in use and pid_task(PIDTYPE_PID) returns the exiting/exited task.

> When it is reaped is "mostly unrelated".

Then why can't pidfd_poll() simply check !task || task->exit_state ?

Never mind. So, currently pidfd_poll() succeeds when the leader can be
reaped, iow the whole thread group has exited. But even if you are the
parent, you can't expect that wait(WNOHANG) must succeed, the leader
can be traced. I guess it is too late to change this behaviour.

What if we add the new PIDFD_THREAD flag? With this flag

- sys_pidfd_open() doesn't require the task to be a group leader

- pidfd_poll() succeeds when the task passes exit_notify() and
becomes a zombie, even if it is a leader and has other threads.


Please see the incomplete/untested patch below.

- The change in exit_notify() is sub-optimal, we can do better
to avoid 2 do_notify_pidfd() calls from exit_notify(). But
so far this is only for discussion, let's keep it simple.

- __pidfd_prepare() needs some minor cleanups regardless of
this change, I'll send the patch...

What do you think?

And why is thread_group_exited() exported?

Oleg.

diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index 5406fbc13074..2e6461459877 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -7,6 +7,7 @@
#include <linux/fcntl.h>

/* Flags for pidfd_open(). */
-#define PIDFD_NONBLOCK O_NONBLOCK
+#define PIDFD_NONBLOCK O_NONBLOCK
+#define PIDFD_THREAD O_EXCL // or anything else not used by anon_inode's

#endif /* _UAPI_LINUX_PIDFD_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index dfb963d2f862..9f8526b7d717 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -752,6 +752,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
autoreap = true;
}

+ /* unnecessary if do_notify_parent() was already called,
+ we can do better */
+ do_notify_pidfd(tsk);
+
if (autoreap) {
tsk->exit_state = EXIT_DEAD;
list_add(&tsk->ptrace_entry, &dead);
diff --git a/kernel/fork.c b/kernel/fork.c
index c981fa6171c1..38f2c7423fb4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -101,6 +101,7 @@
#include <linux/user_events.h>
#include <linux/iommu.h>
#include <linux/rseq.h>
+#include <uapi/linux/pidfd.h>

#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -2068,12 +2069,27 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
}
#endif

+static bool xxx_exited(struct pid *pid, int excl)
+{
+ struct task_struct *task;
+ bool exited;
+
+ rcu_read_lock();
+ task = pid_task(pid, PIDTYPE_PID);
+ exited = !task ||
+ (READ_ONCE(task->exit_state) && (excl || thread_group_empty(task)));
+ rcu_read_unlock();
+
+ return exited;
+}
+
/*
* Poll support for process exit notification.
*/
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
struct pid *pid = file->private_data;
+ int excl = file->f_flags & PIDFD_THREAD;
__poll_t poll_flags = 0;

poll_wait(file, &pid->wait_pidfd, pts);
@@ -2083,7 +2099,7 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
* If the thread group leader exits before all other threads in the
* group, then poll(2) should block, similar to the wait(2) family.
*/
- if (thread_group_exited(pid))
+ if (xxx_exited(pid, excl))
poll_flags = EPOLLIN | EPOLLRDNORM;

return poll_flags;
@@ -2129,7 +2145,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
{
int pidfd;
struct file *pidfd_file;
+ unsigned excl = flags & PIDFD_THREAD;

+ flags &= ~PIDFD_THREAD;
if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
return -EINVAL;

@@ -2144,6 +2162,7 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
return PTR_ERR(pidfd_file);
}
get_pid(pid); /* held by pidfd_file now */
+ pidfd_file->f_flags |= excl;
*ret = pidfd_file;
return pidfd;
}
@@ -2176,7 +2195,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
*/
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
- if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+ unsigned excl = flags & PIDFD_THREAD;
+
+ if (!pid || !pid_has_task(pid, excl ? PIDTYPE_PID : PIDTYPE_TGID))
return -EINVAL;

return __pidfd_prepare(pid, flags, ret);
diff --git a/kernel/pid.c b/kernel/pid.c
index b52b10865454..5257197f9493 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -629,7 +629,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
int fd;
struct pid *p;

- if (flags & ~PIDFD_NONBLOCK)
+ if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
return -EINVAL;

if (pid <= 0)