Re: [PATCH v6 2/4] sched/fair: Check a task has a fitting cpu when updating misfit

From: Qais Yousef
Date: Sun Mar 03 2024 - 12:44:30 EST


On 02/27/24 10:42, Dietmar Eggemann wrote:
> On 20/02/2024 23:56, Qais Yousef wrote:
> > If a misfit task is affined to a subset of the possible cpus, we need to
> > verify that one of these cpus can fit it. Otherwise the load balancer
> > code will trigger continuously yet needlessly, causing balance_interval
> > to grow in return, and we eventually end up in a situation where real
> > imbalances take a long time to address because of this impossible
> > imbalance situation.
> >
> > This can happen in the Android world where it's common for background
> > tasks to be restricted to little cores.
> >
> > Similarly, if the task can't fit even on the biggest core, triggering
> > misfit is pointless as that is the best it can ever get on this system.
> >
> > To be able to detect that, we use asym_cap_list to iterate through the
> > capacities in the system and see if the task is able to run at a higher
> > capacity level based on its p->cpus_ptr. We do that when the affinity
> > changes, when a fair task is forked, or when a task switches to the fair
> > policy. We store the max_allowed_capacity in task_struct to allow for a
> > cheap comparison in the fast path.
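
For reference, the cheap fast path comparison this enables in
update_misfit_status() is roughly the following sketch (not the exact
hunk): if the task's CPU already provides the highest capacity the task
is allowed to use, it can never be misfit:

	if (arch_scale_cpu_capacity(cpu_of(rq)) == p->max_allowed_capacity) {
		rq->misfit_task_load = 0;
		return;
	}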
> >
> > Improve check_misfit_status() function by removing redundant checks.
> > misfit_task_load will be 0 if the task can't move to a bigger CPU. And
> > nohz_load_balance() already checks for cpu_check_capacity() before
>
> s/nohz_load_balance()/nohz_balancer_kick() ?

Yes.

>
> > calling check_misfit_status().
>
> Isn't there an issue with CPU hotplug?

Sigh, yes. Thanks for catching it.

This should fix it: when a capacity level appears or disappears due to
hotplug, rq_online_fair()/rq_offline_fair() rescan the tasks whose
max_allowed_capacity could have been affected. If you're willing to give it
a try before I post v8, that'd be appreciated. Otherwise I'll send v8 later
in the week.

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 174687252e1a..b0e60a565945 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8260,6 +8260,8 @@ static void set_task_max_allowed_capacity(struct task_struct *p)
 		cpumask_t *cpumask;
 
 		cpumask = cpu_capacity_span(entry);
+		if (!cpumask_intersects(cpu_active_mask, cpumask))
+			continue;
 		if (!cpumask_intersects(p->cpus_ptr, cpumask))
 			continue;
 
@@ -8269,6 +8271,53 @@ static void set_task_max_allowed_capacity(struct task_struct *p)
 	rcu_read_unlock();
 }
 
+static void __update_tasks_max_allowed_capacity(unsigned long capacity)
+{
+	struct task_struct *g, *p;
+
+	for_each_process_thread(g, p) {
+		if (fair_policy(p->policy) && p->max_allowed_capacity == capacity)
+			set_task_max_allowed_capacity(p);
+	}
+}
+
+/*
+ * Handle a cpu going online/offline changing the available capacity levels.
+ */
+static void update_tasks_max_allowed_capacity(int cpu, bool online)
+{
+	struct asym_cap_data *entry;
+	bool do_update = false;
+
+	if (!sched_asym_cpucap_active())
+		return;
+
+	if (cpuhp_tasks_frozen)
+		return;
+
+	rcu_read_lock();
+	/* Did a capacity level appear/disappear? */
+	list_for_each_entry_rcu(entry, &asym_cap_list, link) {
+		unsigned int nr_active;
+		cpumask_t *cpumask;
+
+		cpumask = cpu_capacity_span(entry);
+
+		if (!cpumask_test_cpu(cpu, cpumask))
+			continue;
+
+		nr_active = cpumask_weight_and(cpu_active_mask, cpumask);
+		if (online)
+			do_update = nr_active == 1;
+		else
+			do_update = !nr_active;
+		break;
+	}
+	if (do_update)
+		__update_tasks_max_allowed_capacity(entry->capacity);
+	rcu_read_unlock();
+}
+
 static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
 {
 	set_cpus_allowed_common(p, ctx);
@@ -12500,6 +12549,8 @@ static void rq_online_fair(struct rq *rq)
 	update_sysctl();
 
 	update_runtime_enabled(rq);
+
+	update_tasks_max_allowed_capacity(cpu_of(rq), true);
 }
 
 static void rq_offline_fair(struct rq *rq)
@@ -12511,6 +12562,8 @@ static void rq_offline_fair(struct rq *rq)
 
 	/* Ensure that we remove rq contribution to group share: */
 	clear_tg_offline_cfs_rqs(rq);
+
+	update_tasks_max_allowed_capacity(cpu_of(rq), false);
 }
 
 #endif /* CONFIG_SMP */
--
2.34.1
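
To spell out the detection logic in the hunk above (annotated excerpt,
using your Juno layout below as the example):

	/* How many CPUs of this capacity level are still active? */
	nr_active = cpumask_weight_and(cpu_active_mask, cpumask);
	if (online)
		/* first active CPU at this level: the level just appeared */
		do_update = nr_active == 1;
	else
		/* no active CPU left at this level: the level disappeared */
		do_update = !nr_active;

e.g. offlining cpu0 alone leaves cpu3 active in the 513 span (nr_active ==
1, nothing to do), but offlining cpu3 afterwards makes the 513 level
disappear (nr_active == 0), which triggers a rescan of all fair tasks whose
max_allowed_capacity is 513.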


>
> On a tri-geared Juno:
>
> root@juno:~# cat /sys/devices/system/cpu/cpu*/cpu_capacity
> 513
> 1024
> 1024
> 513
> 256
> 256
>
> root@juno:~# taskset -pc 0,3-5 $$
>
> [ 108.248425] set_task_max_allowed_capacity() [bash 1636]
> max_allowed_capacity=513 nr_cpus_allowed=4 cpus_mask=0,3-5
>
> echo 0 > /sys//devices/system/cpu/cpu0/online
> echo 0 > /sys//devices/system/cpu/cpu3/online
>
> [ 134.136887] set_task_max_allowed_capacity() [bash 1639]
> max_allowed_capacity=513 nr_cpus_allowed=4 cpus_mask=0,3-5
>
>
> Cpuset seems to be fine since it sets the task's cpumask.
>
> [...]
>
> > +/*
> > + * Check the max capacity the task is allowed to run at for misfit detection.
>
> Nitpick: It's rather a setter function, so s/check/set here?
>
> > + */
> > +static void set_task_max_allowed_capacity(struct task_struct *p)
> > +{
> > +	struct asym_cap_data *entry;
> > +
> > +	if (!sched_asym_cpucap_active())
> > +		return;
> > +
> > +	rcu_read_lock();
> > +	list_for_each_entry_rcu(entry, &asym_cap_list, link) {
> > +		cpumask_t *cpumask;
> > +
> > +		cpumask = cpu_capacity_span(entry);
> > +		if (!cpumask_intersects(p->cpus_ptr, cpumask))
> > +			continue;
> > +
> > +		p->max_allowed_capacity = entry->capacity;
> > +		break;
> > +	}
> > +	rcu_read_unlock();
> > +}
>
> [...]
>
> > @@ -9601,16 +9644,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
> >  				(arch_scale_cpu_capacity(cpu_of(rq)) * 100));
> >  }
> >
> > -/*
> > - * Check whether a rq has a misfit task and if it looks like we can actually
> > - * help that task: we can migrate the task to a CPU of higher capacity, or
> > - * the task's current CPU is heavily pressured.
> > - */
> > -static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
> > +/* Check if the rq has a misfit task */
> > +static inline bool check_misfit_status(struct rq *rq, struct sched_domain *sd)
>
> `struct sched_domain *sd` is not needed anymore.
>
> Since there is only 1 user of check_misfit_status() you might remove it
> entirely and use `rq->misfit_task_load` directly in
> nohz_balancer_kick()?

I think it's better to keep the access encapsulated.

I have this fixup diff:

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 593e85f90a36..1ac7dc8784b5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8246,7 +8246,7 @@ static void task_dead_fair(struct task_struct *p)
 }
 
 /*
- * Check the max capacity the task is allowed to run at for misfit detection.
+ * Set the max capacity the task is allowed to run at for misfit detection.
  */
 static void set_task_max_allowed_capacity(struct task_struct *p)
 {
@@ -9638,7 +9638,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 }
 
 /* Check if the rq has a misfit task */
-static inline bool check_misfit_status(struct rq *rq, struct sched_domain *sd)
+static inline bool check_misfit_status(struct rq *rq)
 {
 	return rq->misfit_task_load;
 }
@@ -11952,7 +11952,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
 		 * to run the misfit task on.
 		 */
-		if (check_misfit_status(rq, sd)) {
+		if (check_misfit_status(rq)) {
 			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 			goto unlock;
 		}


Thanks!

--
Qais Yousef