[PATCH v2 3/3] prlimit: do not grab the tasklist_lock

From: Barret Rhoden
Date: Wed Jan 05 2022 - 16:31:48 EST


Unnecessarily grabbing the tasklist_lock can be a scalability bottleneck
for workloads that also must grab the tasklist_lock for waiting,
killing, and cloning.

The tasklist_lock was grabbed to protect tsk->sighand from disappearing
(becoming NULL). tsk->signal was already protected by holding a
reference to tsk.

update_rlimit_cpu() assumed tsk->sighand != NULL. With this commit, it
attempts to lock_task_sighand(). However, this means that
update_rlimit_cpu() can fail. This only happens when a task is exiting.
Note that during exec, sighand may *change*, but it will not be NULL.

Prior to this commit, the do_prlimit() ensured that update_rlimit_cpu()
would not fail by read locking the tasklist_lock and checking tsk->sighand
!= NULL.

If update_rlimit_cpu() fails, there may be other tasks that are not
exiting that share tsk->signal. We need to run update_rlimit_cpu() on
one of them. We can't "back out" the new rlim - once we unlocked
task_lock(group_leader), the rlim is essentially changed.

The only other caller of update_rlimit_cpu() is
selinux_bprm_committing_creds(). It has tsk == current, so
update_rlimit_cpu() cannot fail (current->sighand cannot disappear
until current exits).

This change resulted in a 14% speedup on a microbenchmark where parents
kill and wait on their children, and children getpriority, setpriority,
and getrlimit.

Signed-off-by: Barret Rhoden <brho@xxxxxxxxxx>
---
include/linux/posix-timers.h | 2 +-
kernel/sys.c | 32 +++++++++++++++++++++-----------
kernel/time/posix-cpu-timers.c | 12 +++++++++---
3 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 5bbcd280bfd2..9cf126c3b27f 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -253,7 +253,7 @@ void posix_cpu_timers_exit_group(struct task_struct *task);
void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
u64 *newval, u64 *oldval);

-void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new);
+int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new);

void posixtimer_rearm(struct kernel_siginfo *info);
#endif
diff --git a/kernel/sys.c b/kernel/sys.c
index fb2a5e7c0589..073ae9db192f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1432,13 +1432,7 @@ static int do_prlimit(struct task_struct *tsk, unsigned int resource,
return -EPERM;
}

- /* protect tsk->signal and tsk->sighand from disappearing */
- read_lock(&tasklist_lock);
- if (!tsk->sighand) {
- retval = -ESRCH;
- goto out;
- }
-
+ /* Holding a refcount on tsk protects tsk->signal from disappearing. */
rlim = tsk->signal->rlim + resource;
task_lock(tsk->group_leader);
if (new_rlim) {
@@ -1467,10 +1461,26 @@ static int do_prlimit(struct task_struct *tsk, unsigned int resource,
*/
if (!retval && new_rlim && resource == RLIMIT_CPU &&
new_rlim->rlim_cur != RLIM_INFINITY &&
- IS_ENABLED(CONFIG_POSIX_TIMERS))
- update_rlimit_cpu(tsk, new_rlim->rlim_cur);
-out:
- read_unlock(&tasklist_lock);
+ IS_ENABLED(CONFIG_POSIX_TIMERS)) {
+ if (update_rlimit_cpu(tsk, new_rlim->rlim_cur)) {
+ /*
+ * update_rlimit_cpu can fail if the task is exiting.
+ * We already set the task group's rlim, so we need to
+ * update_rlimit_cpu for some other task in the process.
+ * If all of the tasks are exiting, then we don't need
+ * to update_rlimit_cpu.
+ */
+ struct task_struct *t_i;
+
+ rcu_read_lock();
+ for_each_thread(tsk, t_i) {
+ if (!update_rlimit_cpu(t_i, new_rlim->rlim_cur))
+ break;
+ }
+ rcu_read_unlock();
+ }
+ }
+
return retval;
}

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 96b4e7810426..e13e628509fb 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -34,14 +34,20 @@ void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
* tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if
* necessary. Needs siglock protection since other code may update the
* expiration cache as well.
+ *
+ * Returns 0 on success, -ESRCH on failure. Can fail if the task is exiting and
+ * we cannot lock_task_sighand. Cannot fail if task is current.
*/
-void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
+int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
{
u64 nsecs = rlim_new * NSEC_PER_SEC;
+ unsigned long irq_fl;

- spin_lock_irq(&task->sighand->siglock);
+ if (!lock_task_sighand(task, &irq_fl))
+ return -ESRCH;
set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL);
- spin_unlock_irq(&task->sighand->siglock);
+ unlock_task_sighand(task, &irq_fl);
+ return 0;
}

/*
--
2.34.1.448.ga2b2bfdf31-goog