Re: [PATCH 2/3] fs/proc: do_task_stat: use sig->stats_lock to gather the threads/children stats

From: Dylan Hatch
Date: Tue Jan 23 2024 - 18:50:49 EST


On Tue, Jan 23, 2024 at 7:35 AM Oleg Nesterov <oleg@xxxxxxxxxx> wrote:
>
> lock_task_sighand() can trigger a hard lockup. If NR_CPUS threads call
> do_task_stat() at the same time and the process has NR_THREADS, it will
> spin with irqs disabled O(NR_CPUS * NR_THREADS) time.
>
> Change do_task_stat() to use sig->stats_lock to gather the statistics
> outside of ->siglock protected section, in the likely case this code
> will run lockless.
>
> Signed-off-by: Oleg Nesterov <oleg@xxxxxxxxxx>
> ---
> fs/proc/array.c | 58 +++++++++++++++++++++++++++----------------------
> 1 file changed, 32 insertions(+), 26 deletions(-)
>
> diff --git a/fs/proc/array.c b/fs/proc/array.c
> index 45ba91863808..34a47fb0c57f 100644
> --- a/fs/proc/array.c
> +++ b/fs/proc/array.c
> @@ -477,13 +477,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
> int permitted;
> struct mm_struct *mm;
> unsigned long long start_time;
> - unsigned long cmin_flt = 0, cmaj_flt = 0;
> - unsigned long min_flt = 0, maj_flt = 0;
> - u64 cutime, cstime, utime, stime;
> - u64 cgtime, gtime;
> + unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt;
> + u64 cutime, cstime, cgtime, utime, stime, gtime;
> unsigned long rsslim = 0;
> unsigned long flags;
> int exit_code = task->exit_code;
> + struct signal_struct *sig = task->signal;
> + unsigned int seq = 1;
>
> state = *get_task_state(task);
> vsize = eip = esp = 0;
> @@ -511,12 +511,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
>
> sigemptyset(&sigign);
> sigemptyset(&sigcatch);
> - cutime = cstime = 0;
> - cgtime = gtime = 0;
>
> if (lock_task_sighand(task, &flags)) {
> - struct signal_struct *sig = task->signal;
> -
> if (sig->tty) {
> struct pid *pgrp = tty_get_pgrp(sig->tty);
> tty_pgrp = pid_nr_ns(pgrp, ns);
> @@ -527,27 +523,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
> num_threads = get_nr_threads(task);
> collect_sigign_sigcatch(task, &sigign, &sigcatch);
>
> - cmin_flt = sig->cmin_flt;
> - cmaj_flt = sig->cmaj_flt;
> - cutime = sig->cutime;
> - cstime = sig->cstime;
> - cgtime = sig->cgtime;
> rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
>
> - /* add up live thread stats at the group level */
> if (whole) {
> - struct task_struct *t;
> -
> - __for_each_thread(sig, t) {
> - min_flt += t->min_flt;
> - maj_flt += t->maj_flt;
> - gtime += task_gtime(t);
> - }
> -
> - min_flt += sig->min_flt;
> - maj_flt += sig->maj_flt;
> - gtime += sig->gtime;
> -
> if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
> exit_code = sig->group_exit_code;
> }
> @@ -562,6 +540,34 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
> if (permitted && (!whole || num_threads < 2))
> wchan = !task_is_running(task);
>
> + do {
> + seq++; /* 2 on the 1st/lockless path, otherwise odd */
> + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
> +
> + cmin_flt = sig->cmin_flt;
> + cmaj_flt = sig->cmaj_flt;
> + cutime = sig->cutime;
> + cstime = sig->cstime;
> + cgtime = sig->cgtime;
> +
> + if (whole) {
> + struct task_struct *t;
> +
> + min_flt = sig->min_flt;
> + maj_flt = sig->maj_flt;
> + gtime = sig->gtime;
> +
> + rcu_read_lock();
> + __for_each_thread(sig, t) {
> + min_flt += t->min_flt;
> + maj_flt += t->maj_flt;
> + gtime += task_gtime(t);
> + }
> + rcu_read_unlock();
> + }
> + } while (need_seqretry(&sig->stats_lock, seq));
> + done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
> +
> if (whole) {
> thread_group_cputime_adjusted(task, &utime, &stime);
> } else {
> --
> 2.25.1.362.g51ebf55
>

Signed-off-by: Dylan Hatch <dylanbhatch@xxxxxxxxxx>