Re: [PATCH] sched/schedutil: Fix deadlock between cpuset and cpu hotplug when using schedutil

From: Tejun Heo
Date: Wed Jul 27 2022 - 16:09:14 EST


Hello,

On Wed, Jul 20, 2022 at 03:45:27PM +0800, Xuewen Yan wrote:
> Dear all
>
> On Wed, Jul 13, 2022 at 11:20 AM Tejun Heo <tj@xxxxxxxxxx> wrote:
> >
> > On Tue, Jul 12, 2022 at 10:49:57PM -0400, Waiman Long wrote:
> > > > Well, the only thing I can think of is always grabbing cpus_read_lock()
> > > > before grabbing threadgroup_rwsem. Waiman, what do you think?
> > >
> > > That is a possible solution as cpus_read_lock() is rather lightweight. It is
> > > a good practice to acquire it first.
> >
> > On a similar note, I think we probably should re-enable percpu operations on
> > threadgroup_rwsem too by default and allow users who are affected by slower
> > write path operations to opt-in. After the new CLONE_INTO_CGROUP which
> > doesn't need the rwsem, we have far fewer write lockers after all.
> >
>
> Is there any patch for me to try? I would be very grateful.

Can you please see whether the following patch fixes the problem?

Thanks.
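To illustrate the ordering problem the patch addresses, here's a rough
userspace analogy (a sketch only, not kernel code; pthread rwlocks stand
in for cpu_hotplug_lock and cgroup_threadgroup_rwsem, and the thread
bodies are made up for demonstration). With both threads racing, the
program is expected to hang; that hang is the ABBA inversion:

/*
 * Userspace sketch of the deadlock (an analogy, not kernel code):
 * "hotplug" stands in for cpu_hotplug_lock, "tg" for
 * cgroup_threadgroup_rwsem.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t hotplug = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t tg = PTHREAD_RWLOCK_INITIALIZER;

/* attach path before the patch: tg write first, hotplug read second */
static void *attach_path(void *arg)
{
	pthread_rwlock_wrlock(&tg);	/* percpu_down_write(&cgroup_threadgroup_rwsem) */
	sleep(1);			/* widen the race window */
	pthread_rwlock_rdlock(&hotplug);	/* cpuset ->attach() waiting on hotplug */
	puts("attach: unreachable once both threads hold their first lock");
	pthread_rwlock_unlock(&hotplug);
	pthread_rwlock_unlock(&tg);
	return NULL;
}

/* hotplug path: holds hotplug, then a new task's fork needs tg read-locked */
static void *hotplug_path(void *arg)
{
	pthread_rwlock_wrlock(&hotplug);	/* on-going CPU hotplug operation */
	sleep(1);
	pthread_rwlock_rdlock(&tg);	/* fork of a new per-CPU task */
	puts("hotplug: unreachable once both threads hold their first lock");
	pthread_rwlock_unlock(&tg);
	pthread_rwlock_unlock(&hotplug);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, attach_path, NULL);
	pthread_create(&b, NULL, hotplug_path, NULL);
	pthread_join(a, NULL);	/* hangs: classic ABBA lock inversion */
	pthread_join(b, NULL);
	return 0;
}

Once the attach side takes the hotplug lock first, as the patch below
does, both paths acquire the two locks in the same order and the
inversion goes away.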

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 13c8e91d78620..7caefc8af9127 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2345,6 +2345,38 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
}
EXPORT_SYMBOL_GPL(task_cgroup_path);

+/**
+ * lock_threadgroup - Stabilize threadgroups
+ *
+ * cgroup migration operations need the threadgroups stabilized against forks
+ * and exits, which can be achieved by write-locking cgroup_threadgroup_rwsem.
+ *
+ * Note that write-locking threadgroup_rwsem is nested inside CPU hotplug
+ * disable. This is because cpuset needs CPU hotplug disabled during ->attach()
+ * and bringing up a CPU may involve creating new tasks which can require
+ * read-locking threadgroup_rwsem. If cpuset's ->attach() is called with
+ * threadgroup_rwsem write-locked while hotplug is enabled, we can deadlock:
+ * cpuset waits for an on-going CPU hotplug operation which in turn waits for
+ * threadgroup_rwsem to be released so that new tasks can be created. See the
+ * following for more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ */
+static void lock_threadgroup(void)
+{
+	cpus_read_lock();
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * unlock_threadgroup - Undo lock_threadgroup
+ */
+static void unlock_threadgroup(void)
+{
+	percpu_up_write(&cgroup_threadgroup_rwsem);
+	cpus_read_unlock();
+}
+
/**
* cgroup_migrate_add_task - add a migration target task to a migration context
* @task: target task
@@ -2822,7 +2854,6 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,

struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
bool *locked)
-	__acquires(&cgroup_threadgroup_rwsem)
{
struct task_struct *tsk;
pid_t pid;
@@ -2840,7 +2871,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
*/
lockdep_assert_held(&cgroup_mutex);
if (pid || threadgroup) {
-		percpu_down_write(&cgroup_threadgroup_rwsem);
+		lock_threadgroup();
*locked = true;
} else {
*locked = false;
@@ -2876,7 +2907,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,

out_unlock_threadgroup:
if (*locked) {
-		percpu_up_write(&cgroup_threadgroup_rwsem);
+		unlock_threadgroup();
*locked = false;
}
out_unlock_rcu:
@@ -2885,7 +2916,6 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
}

void cgroup_procs_write_finish(struct task_struct *task, bool locked)
-	__releases(&cgroup_threadgroup_rwsem)
{
struct cgroup_subsys *ss;
int ssid;
@@ -2894,7 +2924,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked)
put_task_struct(task);

if (locked)
-		percpu_up_write(&cgroup_threadgroup_rwsem);
+		unlock_threadgroup();
+
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
@@ -2953,7 +2984,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)

lockdep_assert_held(&cgroup_mutex);

-	percpu_down_write(&cgroup_threadgroup_rwsem);
+	lock_threadgroup();

/* look up all csses currently attached to @cgrp's subtree */
spin_lock_irq(&css_set_lock);
@@ -2984,7 +3015,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
-	percpu_up_write(&cgroup_threadgroup_rwsem);
+	unlock_threadgroup();
return ret;
}
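For completeness, the fixed ordering in the userspace analogy above
would look like the below, which is what lock_threadgroup() expresses in
the patch via cpus_read_lock() + percpu_down_write() (again just a
sketch, standing in for attach_path() in the earlier demo):

/* Fixed attach path: same lock order as the hotplug side, no ABBA */
static void *attach_path_fixed(void *arg)
{
	pthread_rwlock_rdlock(&hotplug);	/* cpus_read_lock() */
	pthread_rwlock_wrlock(&tg);	/* percpu_down_write(&cgroup_threadgroup_rwsem) */
	/* ... migrate tasks ... */
	pthread_rwlock_unlock(&tg);
	pthread_rwlock_unlock(&hotplug);
	return NULL;
}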