Re: [PATCH v2] pidfd: implement PIDFD_THREAD flag for pidfd_open()

From: Oleg Nesterov
Date: Tue Jan 30 2024 - 06:35:54 EST


Damn. Self-NACK.

I forgot (we all ;) about mt-exec, and there are 2 problems.

1. The "if (!thread_group_leader(tsk))" block in de_thread() needs
do_notify_pidfd() too, the execing non-leader thread looses its
old pid, pidfd_poll(PIDFD_THREAD, pid-of-execing-sub-thread)
should succeed. Must be fixed, I think.

2. pidfd_poll(PIDFD_THREAD, pid-of-group-leader) should not succeed
when its sub-thread execs, the execing thread inherits the leader's
pid. Perhaps pidfd_task_exited() can check sig->group_exec_task,
but:

OTOH. do we really care? Group leader can exit and become a
zombie. Then another thread can exec. Perhaps we can simply
emphasize that if you use pidfd_open(leader-pid, PIDFD_THREAD)
then you shouldn't expect that after poll() this pid can't be
"vivified" again? I don't think this can be "fixed".

I'll try to think more. We don't want to take tasklist in poll...

Oleg.

On 01/30, Oleg Nesterov wrote:
>
> With this flag:
>
> - pidfd_open() doesn't require that the target task must be
> a thread-group leader
>
> - pidfd_poll() succeeds when the task exits and becomes a
> zombie (iow, passes exit_notify()), even if it is a leader
> and thread-group is not empty.
>
> thread_group_exited() is no longer used, perhaps it can die.
>
> Co-developed-by: Tycho Andersen <tycho@tycho.pizza>
> Signed-off-by: Oleg Nesterov <oleg@xxxxxxxxxx>
> ---
> include/uapi/linux/pidfd.h | 3 ++-
> kernel/exit.c | 7 +++++++
> kernel/fork.c | 38 +++++++++++++++++++++++++++++++-------
> kernel/pid.c | 14 +++-----------
> kernel/signal.c | 4 +++-
> 5 files changed, 46 insertions(+), 20 deletions(-)
>
> diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
> index 5406fbc13074..2e6461459877 100644
> --- a/include/uapi/linux/pidfd.h
> +++ b/include/uapi/linux/pidfd.h
> @@ -7,6 +7,7 @@
> #include <linux/fcntl.h>
>
> /* Flags for pidfd_open(). */
> -#define PIDFD_NONBLOCK O_NONBLOCK
> +#define PIDFD_NONBLOCK O_NONBLOCK
> +#define PIDFD_THREAD O_EXCL
>
> #endif /* _UAPI_LINUX_PIDFD_H */
> diff --git a/kernel/exit.c b/kernel/exit.c
> index dfb963d2f862..493647fd7c07 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
> kill_orphaned_pgrp(tsk->group_leader, NULL);
>
> tsk->exit_state = EXIT_ZOMBIE;
> + /*
> + * sub-thread or delay_group_leader(), wake up the
> + * PIDFD_THREAD waiters.
> + */
> + if (!thread_group_empty(tsk))
> + do_notify_pidfd(tsk);
> +
> if (unlikely(tsk->ptrace)) {
> int sig = thread_group_leader(tsk) &&
> thread_group_empty(tsk) &&
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 347641398f9d..bea32071fff2 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -101,6 +101,7 @@
> #include <linux/user_events.h>
> #include <linux/iommu.h>
> #include <linux/rseq.h>
> +#include <uapi/linux/pidfd.h>
>
> #include <asm/pgalloc.h>
> #include <linux/uaccess.h>
> @@ -2050,6 +2051,8 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
>
> seq_put_decimal_ll(m, "Pid:\t", nr);
>
> + /* TODO: report PIDFD_THREAD */
> +
> #ifdef CONFIG_PID_NS
> seq_put_decimal_ll(m, "\nNSpid:\t", nr);
> if (nr > 0) {
> @@ -2068,22 +2071,35 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
> }
> #endif
>
> +static bool pidfd_task_exited(struct pid *pid, bool thread)
> +{
> + struct task_struct *task;
> + bool exited;
> +
> + rcu_read_lock();
> + task = pid_task(pid, PIDTYPE_PID);
> + exited = !task ||
> + (READ_ONCE(task->exit_state) && (thread || thread_group_empty(task)));
> + rcu_read_unlock();
> +
> + return exited;
> +}
> +
> /*
> * Poll support for process exit notification.
> */
> static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
> {
> struct pid *pid = file->private_data;
> + bool thread = file->f_flags & PIDFD_THREAD;
> __poll_t poll_flags = 0;
>
> poll_wait(file, &pid->wait_pidfd, pts);
> -
> /*
> - * Inform pollers only when the whole thread group exits.
> - * If the thread group leader exits before all other threads in the
> - * group, then poll(2) should block, similar to the wait(2) family.
> + * Depending on PIDFD_THREAD, inform pollers when the thread
> + * or the whole thread-group exits.
> */
> - if (thread_group_exited(pid))
> + if (pidfd_task_exited(pid, thread))
> poll_flags = EPOLLIN | EPOLLRDNORM;
>
> return poll_flags;
> @@ -2141,6 +2157,11 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
> return PTR_ERR(pidfd_file);
> }
> get_pid(pid); /* held by pidfd_file now */
> + /*
> + * anon_inode_getfile() ignores everything outside of the
> + * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
> + */
> + pidfd_file->f_flags |= (flags & PIDFD_THREAD);
> *ret = pidfd_file;
> return pidfd;
> }
> @@ -2154,7 +2175,8 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
> * Allocate a new file that stashes @pid and reserve a new pidfd number in the
> * caller's file descriptor table. The pidfd is reserved but not installed yet.
> *
> - * The helper verifies that @pid is used as a thread group leader.
> + * The helper verifies that @pid is still in use, without PIDFD_THREAD the
> + * task identified by @pid must be a thread-group leader.
> *
> * If this function returns successfully the caller is responsible to either
> * call fd_install() passing the returned pidfd and pidfd file as arguments in
> @@ -2173,7 +2195,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
> */
> int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
> {
> - if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
> + bool thread = flags & PIDFD_THREAD;
> +
> + if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID));
> return -EINVAL;
>
> return __pidfd_prepare(pid, flags, ret);
> diff --git a/kernel/pid.c b/kernel/pid.c
> index c7a3e359f8f5..e11144466828 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -552,11 +552,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
> * Return the task associated with @pidfd. The function takes a reference on
> * the returned task. The caller is responsible for releasing that reference.
> *
> - * Currently, the process identified by @pidfd is always a thread-group leader.
> - * This restriction currently exists for all aspects of pidfds including pidfd
> - * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
> - * (only supports thread group leaders).
> - *
> * Return: On success, the task_struct associated with the pidfd.
> * On error, a negative errno number will be returned.
> */
> @@ -615,11 +610,8 @@ static int pidfd_create(struct pid *pid, unsigned int flags)
> * @flags: flags to pass
> *
> * This creates a new pid file descriptor with the O_CLOEXEC flag set for
> - * the process identified by @pid. Currently, the process identified by
> - * @pid must be a thread-group leader. This restriction currently exists
> - * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
> - * be used with CLONE_THREAD) and pidfd polling (only supports thread group
> - * leaders).
> + * the task identified by @pid. Without PIDFD_THREAD flag the target task
> + * must be a thread-group leader.
> *
> * Return: On success, a cloexec pidfd is returned.
> * On error, a negative errno number will be returned.
> @@ -629,7 +621,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> int fd;
> struct pid *p;
>
> - if (flags & ~PIDFD_NONBLOCK)
> + if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
> return -EINVAL;
>
> if (pid <= 0)
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 9561a3962ca6..5f48d2c4b409 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -2051,7 +2051,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
> WARN_ON_ONCE(!tsk->ptrace &&
> (tsk->group_leader != tsk || !thread_group_empty(tsk)));
> /*
> - * tsk is a group leader and has no threads, wake up the pidfd waiters.
> + * tsk is a group leader and has no threads, wake up the
> + * non-PIDFD_THREAD waiters.
> */
> if (thread_group_empty(tsk))
> do_notify_pidfd(tsk);
> @@ -3926,6 +3927,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
> prepare_kill_siginfo(sig, &kinfo);
> }
>
> + /* TODO: respect PIDFD_THREAD */
> ret = kill_pid_info(sig, &kinfo, pid);
>
> err:
> --
> 2.25.1.362.g51ebf55
>