Re: [PATCHv8] exec: Fix dead-lock in de_thread with ptrace_attach

From: Bernd Edlinger
Date: Fri Jun 11 2021 - 03:54:56 EST


On 6/10/21 9:31 AM, Bernd Edlinger wrote:
> This introduces signal->unsafe_execve_in_progress,
> which is used to fix the case when at least one of the
> sibling threads is traced, and therefore the trace
> process may dead-lock in ptrace_attach, but de_thread
> will need to wait for the tracer to continue execution.
>
> The solution is to detect this situation and allow
> ptrace_attach to continue, while de_thread() is still
> waiting for traced zombies to be eventually released.
> When the current thread changed the ptrace status from
> non-traced to traced, we can simply abort the whole
> execve and restart it by returning -ERESTARTSYS.
> This needs to be done before changing the thread leader,
> because the PTRACE_EVENT_EXEC needs to know the old
> thread pid.
>
> Although it is technically after the point of no return,
> we just have to reset bprm->point_of_no_return here,
> since at this time only the other threads have received
> a fatal signal, not the current thread.
>
> From the user's point of view the whole execve was
> simply delayed until after the ptrace_attach.
>
> Other threads die quickly since the cred_guard_mutex
> is released, but a deadly signal is already pending.
> In case the mutex_lock_killable misses the signal,
> ->unsafe_execve_in_progress makes sure they release
> the mutex immediately and return with -ERESTARTNOINTR.
>
> This means there is no API change, unlike the previous
> version of this patch which was discussed here:
>
> https://lore.kernel.org/lkml/b6537ae6-31b1-5c50-f32b-8b8332ace882@xxxxxxxxxx/
>
> See tools/testing/selftests/ptrace/vmaccess.c
> for a test case that gets fixed by this change.
>
> Note that since the test case was originally designed to
> test the ptrace_attach returning an error in this situation,
> the test expectation needed to be adjusted, to allow the
> API to succeed at the first attempt.
>
> Signed-off-by: Bernd Edlinger <bernd.edlinger@xxxxxxxxxx>
> ---
> fs/exec.c | 45 ++++++++++++++++++++++++++-----
> fs/proc/base.c | 6 +++++
> include/linux/sched/signal.h | 13 +++++++++
> kernel/ptrace.c | 9 +++++++
> kernel/seccomp.c | 12 ++++++---
> tools/testing/selftests/ptrace/vmaccess.c | 25 +++++++++++------
> 6 files changed, 92 insertions(+), 18 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 8344fba..ac3fec1 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1040,6 +1040,8 @@ static int de_thread(struct task_struct *tsk)
> struct signal_struct *sig = tsk->signal;
> struct sighand_struct *oldsighand = tsk->sighand;
> spinlock_t *lock = &oldsighand->siglock;
> + unsigned int prev_ptrace = tsk->ptrace;
> + struct task_struct *t = tsk;
>
> if (thread_group_empty(tsk))
> goto no_thread_group;
> @@ -1057,20 +1059,40 @@ static int de_thread(struct task_struct *tsk)
> return -EAGAIN;
> }
>
> + while_each_thread(tsk, t) {
> + if (unlikely(t->ptrace) && t != tsk->group_leader)
> + sig->unsafe_execve_in_progress = true;
> + }
> +
> sig->group_exit_task = tsk;
> sig->notify_count = zap_other_threads(tsk);
> if (!thread_group_leader(tsk))
> sig->notify_count--;
> + spin_unlock_irq(lock);
>
> - while (sig->notify_count) {
> - __set_current_state(TASK_KILLABLE);
> - spin_unlock_irq(lock);
> + if (unlikely(sig->unsafe_execve_in_progress))
> + mutex_unlock(&sig->cred_guard_mutex);
> +
> + for (;;) {
> + set_current_state(TASK_KILLABLE);
> + if (!sig->notify_count)
> + break;
> schedule();
> if (__fatal_signal_pending(tsk))
> goto killed;
> - spin_lock_irq(lock);
> }
> - spin_unlock_irq(lock);
> + __set_current_state(TASK_RUNNING);

Oh, sorry, I think I'll need to keep this spin_lock here,
because otherwise the assignment sig->group_exit_task = NULL
below will race with

kernel/exit.c (__exit_signal):
if (sig->notify_count > 0 && !--sig->notify_count)
wake_up_process(sig->group_exit_task);

which runs under spin_lock(&sighand->siglock) and tasklist_lock write-locked.

Will send an updaten the patch later today.


Bernd.

> +
> + if (unlikely(sig->unsafe_execve_in_progress)) {
> + if (mutex_lock_killable(&sig->cred_guard_mutex))
> + goto killed;
> + sig->unsafe_execve_in_progress = false;
> + if (!prev_ptrace && tsk->ptrace) {
> + sig->group_exit_task = NULL;
> + sig->notify_count = 0;
> + return -ERESTARTSYS;
> + }
> + }
>
> /*
> * At this point all other threads have exited, all we have to
> @@ -1255,8 +1277,11 @@ int begin_new_exec(struct linux_binprm * bprm)
> * Make this the only thread in the thread group.
> */
> retval = de_thread(me);
> - if (retval)
> + if (retval) {
> + if (retval == -ERESTARTSYS)
> + bprm->point_of_no_return = false;
> goto out;
> + }
>
> /*
> * Cancel any io_uring activity across execve
> @@ -1466,6 +1491,11 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
> if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
> return -ERESTARTNOINTR;
>
> + if (unlikely(current->signal->unsafe_execve_in_progress)) {
> + mutex_unlock(&current->signal->cred_guard_mutex);
> + return -ERESTARTNOINTR;
> + }
> +
> bprm->cred = prepare_exec_creds();
> if (likely(bprm->cred))
> return 0;
> @@ -1482,7 +1512,8 @@ static void free_bprm(struct linux_binprm *bprm)
> }
> free_arg_pages(bprm);
> if (bprm->cred) {
> - mutex_unlock(&current->signal->cred_guard_mutex);
> + if (!current->signal->unsafe_execve_in_progress)
> + mutex_unlock(&current->signal->cred_guard_mutex);
> abort_creds(bprm->cred);
> }
> if (bprm->file) {
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 3851bfc..3b2a55c 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2739,6 +2739,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
> if (rv < 0)
> goto out_free;
>
> + if (unlikely(current->signal->unsafe_execve_in_progress)) {
> + mutex_unlock(&current->signal->cred_guard_mutex);
> + rv = -ERESTARTNOINTR;
> + goto out_free;
> + }
> +
> rv = security_setprocattr(PROC_I(inode)->op.lsm,
> file->f_path.dentry->d_name.name, page,
> count);
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index 3f6a0fc..220a083 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -214,6 +214,17 @@ struct signal_struct {
> #endif
>
> /*
> + * Set while execve is executing but is *not* holding
> + * cred_guard_mutex to avoid possible dead-locks.
> + * The cred_guard_mutex is released *after* de_thread() has
> + * called zap_other_threads(), therefore a fatal signal is
> + * guaranteed to be already pending in the unlikely event, that
> + * current->signal->unsafe_execve_in_progress happens to be
> + * true after the cred_guard_mutex was acquired.
> + */
> + bool unsafe_execve_in_progress;
> +
> + /*
> * Thread is the potential origin of an oom condition; kill first on
> * oom
> */
> @@ -227,6 +238,8 @@ struct signal_struct {
> struct mutex cred_guard_mutex; /* guard against foreign influences on
> * credential calculations
> * (notably. ptrace)
> + * Held while execve runs, except when
> + * a sibling thread is being traced.
> * Deprecated do not use in new code.
> * Use exec_update_lock instead.
> */
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index 61db50f..0cbc1eb 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -468,6 +468,14 @@ static int ptrace_traceme(void)
> {
> int ret = -EPERM;
>
> + if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
> + return -ERESTARTNOINTR;
> +
> + if (unlikely(current->signal->unsafe_execve_in_progress)) {
> + mutex_unlock(&current->signal->cred_guard_mutex);
> + return -ERESTARTNOINTR;
> + }
> +
> write_lock_irq(&tasklist_lock);
> /* Are we already being traced? */
> if (!current->ptrace) {
> @@ -483,6 +491,7 @@ static int ptrace_traceme(void)
> }
> }
> write_unlock_irq(&tasklist_lock);
> + mutex_unlock(&current->signal->cred_guard_mutex);
>
> return ret;
> }
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 1d60fc2..b1389ee 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -1824,9 +1824,15 @@ static long seccomp_set_mode_filter(unsigned int flags,
> * Make sure we cannot change seccomp or nnp state via TSYNC
> * while another thread is in the middle of calling exec.
> */
> - if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
> - mutex_lock_killable(&current->signal->cred_guard_mutex))
> - goto out_put_fd;
> + if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
> + if (mutex_lock_killable(&current->signal->cred_guard_mutex))
> + goto out_put_fd;
> +
> + if (unlikely(current->signal->unsafe_execve_in_progress)) {
> + mutex_unlock(&current->signal->cred_guard_mutex);
> + goto out_put_fd;
> + }
> + }
>
> spin_lock_irq(&current->sighand->siglock);
>
> diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c
> index 4db327b..c7c2242 100644
> --- a/tools/testing/selftests/ptrace/vmaccess.c
> +++ b/tools/testing/selftests/ptrace/vmaccess.c
> @@ -39,8 +39,15 @@ static void *thread(void *arg)
> f = open(mm, O_RDONLY);
> ASSERT_GE(f, 0);
> close(f);
> - f = kill(pid, SIGCONT);
> - ASSERT_EQ(f, 0);
> + f = waitpid(-1, NULL, 0);
> + ASSERT_NE(f, -1);
> + ASSERT_NE(f, 0);
> + ASSERT_NE(f, pid);
> + f = waitpid(-1, NULL, 0);
> + ASSERT_EQ(f, pid);
> + f = waitpid(-1, NULL, 0);
> + ASSERT_EQ(f, -1);
> + ASSERT_EQ(errno, ECHILD);
> }
>
> TEST(attach)
> @@ -57,22 +64,24 @@ static void *thread(void *arg)
>
> sleep(1);
> k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
> - ASSERT_EQ(errno, EAGAIN);
> - ASSERT_EQ(k, -1);
> + ASSERT_EQ(k, 0);
> k = waitpid(-1, &s, WNOHANG);
> ASSERT_NE(k, -1);
> ASSERT_NE(k, 0);
> ASSERT_NE(k, pid);
> ASSERT_EQ(WIFEXITED(s), 1);
> ASSERT_EQ(WEXITSTATUS(s), 0);
> - sleep(1);
> - k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
> - ASSERT_EQ(k, 0);
> k = waitpid(-1, &s, 0);
> ASSERT_EQ(k, pid);
> ASSERT_EQ(WIFSTOPPED(s), 1);
> ASSERT_EQ(WSTOPSIG(s), SIGSTOP);
> - k = ptrace(PTRACE_DETACH, pid, 0L, 0L);
> + k = ptrace(PTRACE_CONT, pid, 0L, 0L);
> + ASSERT_EQ(k, 0);
> + k = waitpid(-1, &s, 0);
> + ASSERT_EQ(k, pid);
> + ASSERT_EQ(WIFSTOPPED(s), 1);
> + ASSERT_EQ(WSTOPSIG(s), SIGTRAP);
> + k = ptrace(PTRACE_CONT, pid, 0L, 0L);
> ASSERT_EQ(k, 0);
> k = waitpid(-1, &s, 0);
> ASSERT_EQ(k, pid);
>