[PATCH 2/6] sched/cpuset: Bring back cpuset_mutex

From: Qais Yousef
Date: Sun Aug 20 2023 - 11:25:53 EST


From: Juri Lelli <juri.lelli@xxxxxxxxxx>

commit 111cd11bbc54850f24191c52ff217da88a5e639b upstream.

Turns out percpu_cpuset_rwsem - commit 1243dc518c9d ("cgroup/cpuset:
Convert cpuset_mutex to percpu_rwsem") - wasn't such a brilliant idea:
it has been reported to cause slowdowns in workloads that need to
change cpuset configuration frequently, and it also does not implement
priority inheritance (which causes trouble for realtime workloads).

Convert percpu_cpuset_rwsem back to a regular cpuset_mutex. Also grab
it only for SCHED_DEADLINE tasks (other policies don't care about
stable cpusets anyway).

Signed-off-by: Juri Lelli <juri.lelli@xxxxxxxxxx>
Reviewed-by: Waiman Long <longman@xxxxxxxxxx>
Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
(cherry picked from commit 111cd11bbc54850f24191c52ff217da88a5e639b)
[Fixed conflicts in kernel/cgroup/cpuset.c caused by functions and
comments that don't exist on 5.10, and by the different cpu hotplug
lock taken around the rwsem-to-mutex replacements. Also removed the
BUG_ON() for the rwsem, which has no mainline counterpart.]
Signed-off-by: Qais Yousef (Google) <qyousef@xxxxxxxxxxx>
---
 include/linux/cpuset.h |  8 ++---
 kernel/cgroup/cpuset.c | 78 ++++++++++++++++++++----------------------
 kernel/sched/core.c    | 22 ++++++++----
 3 files changed, 57 insertions(+), 51 deletions(-)
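
To make the changelog's two complaints concrete: percpu_down_write()
has to wait for all per-CPU readers to drain (internally it goes
through an RCU synchronization), so a workload that reconfigures
cpusets frequently pays a grace-period latency on every change; and
rwsems do not boost their holder, so a low-priority writer can hold
out a realtime waiter with no priority inheritance. A plain mutex has
neither problem: the uncontended path is a single atomic, and on
PREEMPT_RT mutexes are rt_mutex-based and do inherit priority. A
minimal before/after sketch of the write side (illustrative only, not
additional code in this patch):

        /* Old: blocks until every CPU's readers are gone. */
        percpu_down_write(&cpuset_rwsem);   /* RCU sync on the slow path */
        rebuild_sched_domains_locked();
        percpu_up_write(&cpuset_rwsem);

        /* New: one atomic when uncontended, PI-aware on PREEMPT_RT. */
        mutex_lock(&cpuset_mutex);
        rebuild_sched_domains_locked();
        mutex_unlock(&cpuset_mutex);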

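The scheduler side then reduces to one rule, condensed from the
kernel/sched/core.c hunks below: __sched_setscheduler() takes the
cpuset lock only when DEADLINE bandwidth accounting is involved,
i.e. when either the new or the old policy is SCHED_DEADLINE, rather
than for every PI caller as before:

        bool cpuset_locked = false;

        /* Only DL bandwidth accounting needs stable cpusets. */
        if (dl_policy(policy) || dl_policy(p->policy)) {
                cpuset_locked = true;
                cpuset_lock();
        }

        /* ... policy change under task_rq_lock() ... */

        if (cpuset_locked)
                cpuset_unlock();
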
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 04c20de66afc..3261a45f97d1 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -55,8 +55,8 @@ extern void cpuset_init_smp(void);
extern void cpuset_force_rebuild(void);
extern void cpuset_update_active_cpus(void);
extern void cpuset_wait_for_hotplug(void);
-extern void cpuset_read_lock(void);
-extern void cpuset_read_unlock(void);
+extern void cpuset_lock(void);
+extern void cpuset_unlock(void);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -178,8 +178,8 @@ static inline void cpuset_update_active_cpus(void)

static inline void cpuset_wait_for_hotplug(void) { }

-static inline void cpuset_read_lock(void) { }
-static inline void cpuset_read_unlock(void) { }
+static inline void cpuset_lock(void) { }
+static inline void cpuset_unlock(void) { }

static inline void cpuset_cpus_allowed(struct task_struct *p,
struct cpumask *mask)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 96b56226a304..8f31ed881938 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -334,16 +334,16 @@ static struct cpuset top_cpuset = {
* guidelines for accessing subsystem state in kernel/cgroup.c
*/

-DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
+static DEFINE_MUTEX(cpuset_mutex);

-void cpuset_read_lock(void)
+void cpuset_lock(void)
{
- percpu_down_read(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
}

-void cpuset_read_unlock(void)
+void cpuset_unlock(void)
{
- percpu_up_read(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}

static DEFINE_SPINLOCK(callback_lock);
@@ -930,7 +930,7 @@ static void dl_rebuild_rd_accounting(void)
struct cpuset *cs = NULL;
struct cgroup_subsys_state *pos_css;

- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);
lockdep_assert_cpus_held();
lockdep_assert_held(&sched_domains_mutex);

@@ -991,7 +991,7 @@ static void rebuild_sched_domains_locked(void)
int ndoms;

lockdep_assert_cpus_held();
- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);

/*
* If we have raced with CPU hotplug, return early to avoid
@@ -1042,9 +1042,9 @@ static void rebuild_sched_domains_locked(void)
void rebuild_sched_domains(void)
{
get_online_cpus();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
rebuild_sched_domains_locked();
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
put_online_cpus();
}

@@ -1160,7 +1160,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
int new_prs;
bool part_error = false; /* Partition error? */

- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);

/*
* The parent must be a partition root.
@@ -1490,7 +1490,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;

- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);

/*
* Check all its siblings and call update_cpumasks_hier()
@@ -2157,7 +2157,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
cs = css_cs(css);

- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);

/* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
@@ -2181,7 +2181,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
cs->attach_in_progress++;
ret = 0;
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
return ret;
}

@@ -2193,11 +2193,11 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);

- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
cs->attach_in_progress--;
if (!cs->attach_in_progress)
wake_up(&cpuset_attach_wq);
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}

/*
@@ -2221,7 +2221,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cs = css_cs(css);

lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);

/* prepare for attach */
if (cs == &top_cpuset)
@@ -2275,7 +2275,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
if (!cs->attach_in_progress)
wake_up(&cpuset_attach_wq);

- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}

/* The various types of files and directories in a cpuset file system */
@@ -2307,7 +2307,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
int retval = 0;

get_online_cpus();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
goto out_unlock;
@@ -2343,7 +2343,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
put_online_cpus();
return retval;
}
@@ -2356,7 +2356,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
int retval = -ENODEV;

get_online_cpus();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;

@@ -2369,7 +2369,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
put_online_cpus();
return retval;
}
@@ -2410,7 +2410,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
flush_work(&cpuset_hotplug_work);

get_online_cpus();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;

@@ -2434,7 +2434,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,

free_cpuset(trialcs);
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
put_online_cpus();
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
@@ -2567,13 +2567,13 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,

css_get(&cs->css);
get_online_cpus();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;

retval = update_prstate(cs, val);
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
put_online_cpus();
css_put(&cs->css);
return retval ?: nbytes;
@@ -2781,7 +2781,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
return 0;

get_online_cpus();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);

set_bit(CS_ONLINE, &cs->flags);
if (is_spread_page(parent))
@@ -2832,7 +2832,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
put_online_cpus();
return 0;
}
@@ -2853,7 +2853,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
struct cpuset *cs = css_cs(css);

get_online_cpus();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);

if (is_partition_root(cs))
update_prstate(cs, 0);
@@ -2872,7 +2872,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
cpuset_dec();
clear_bit(CS_ONLINE, &cs->flags);

- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
put_online_cpus();
}

@@ -2885,7 +2885,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)

static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
spin_lock_irq(&callback_lock);

if (is_in_v2_mode()) {
@@ -2898,7 +2898,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
}

spin_unlock_irq(&callback_lock);
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}

/*
@@ -2940,8 +2940,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {

int __init cpuset_init(void)
{
- BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
-
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
@@ -3013,7 +3011,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
is_empty = cpumask_empty(cs->cpus_allowed) ||
nodes_empty(cs->mems_allowed);

- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);

/*
* Move tasks to the nearest ancestor with execution resources,
@@ -3023,7 +3021,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
if (is_empty)
remove_tasks_in_empty_cpuset(cs);

- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
}

static void
@@ -3073,14 +3071,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);

- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);

/*
* We have raced with task attaching. We wait until attaching
* is finished, so we won't attach a task to an empty cpuset.
*/
if (cs->attach_in_progress) {
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
goto retry;
}

@@ -3152,7 +3150,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);

- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}

/**
@@ -3182,7 +3180,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
if (on_dfl && !alloc_cpumasks(NULL, &tmp))
ptmp = &tmp;

- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);

/* fetch the available cpus/mems and find out which changed how */
cpumask_copy(&new_cpus, cpu_active_mask);
@@ -3239,7 +3237,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
update_tasks_nodemask(&top_cpuset);
}

- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);

/* if cpus or mems changed, we need to propagate to descendants */
if (cpus_updated || mems_updated) {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9d6dd14cfd26..906289d2ed68 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5221,6 +5221,7 @@ static int __sched_setscheduler(struct task_struct *p,
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
+ bool cpuset_locked = false;

/* The pi code expects interrupts enabled */
BUG_ON(pi && in_interrupt());
@@ -5318,8 +5319,14 @@ static int __sched_setscheduler(struct task_struct *p,
return retval;
}

- if (pi)
- cpuset_read_lock();
+ /*
+ * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
+ * information.
+ */
+ if (dl_policy(policy) || dl_policy(p->policy)) {
+ cpuset_locked = true;
+ cpuset_lock();
+ }

/*
* Make sure no PI-waiters arrive (or leave) while we are
@@ -5395,8 +5402,8 @@ static int __sched_setscheduler(struct task_struct *p,
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
task_rq_unlock(rq, p, &rf);
- if (pi)
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
goto recheck;
}

@@ -5462,7 +5469,8 @@ static int __sched_setscheduler(struct task_struct *p,
task_rq_unlock(rq, p, &rf);

if (pi) {
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
rt_mutex_adjust_pi(p);
}

@@ -5474,8 +5482,8 @@ static int __sched_setscheduler(struct task_struct *p,

unlock:
task_rq_unlock(rq, p, &rf);
- if (pi)
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
return retval;
}

--
2.34.1