Re: [PATCH v3 0/2] sched: Minor changes for rd->overload access

From: Ingo Molnar
Date: Thu Mar 28 2024 - 07:08:12 EST



* Ingo Molnar <mingo@xxxxxxxxxx> wrote:

> Plus I've applied a patch to rename ::overload to ::overloaded. It is
> silly to use an ambiguous noun instead of a clear adjective when naming
> such a flag ...

Plus SG_OVERLOAD should be SG_OVERLOADED as well - it now looks in line
with SG_OVERUTILIZED:

/* Scheduling group status flags */
#define SG_OVERLOADED 0x1 /* More than one runnable task on a CPU. */
#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */

My followup question is: why are these a bitmask, why not separate
flags?

AFAICS we only ever set them separately:

thule:~/tip> git grep SG_OVER kernel/sched/
kernel/sched/fair.c: set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED);
kernel/sched/fair.c: *sg_status |= SG_OVERLOADED;
kernel/sched/fair.c: *sg_status |= SG_OVERUTILIZED;
kernel/sched/fair.c: *sg_status |= SG_OVERLOADED;
kernel/sched/fair.c: set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOADED);
kernel/sched/fair.c: sg_status & SG_OVERUTILIZED);
kernel/sched/fair.c: } else if (sg_status & SG_OVERUTILIZED) {
kernel/sched/fair.c: set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED);
kernel/sched/sched.h:#define SG_OVERLOADED 0x1 /* More than one runnable task on a CPU. */
kernel/sched/sched.h:#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */
kernel/sched/sched.h: set_rd_overloaded(rq->rd, SG_OVERLOADED);

In fact this results in suboptimal code:

/* update overload indicator if we are at root domain */
set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOADED);

/* Update over-utilization (tipping point, U >= 0) indicator */
set_rd_overutilized_status(env->dst_rq->rd,
sg_status & SG_OVERUTILIZED);

Note how the bits that got mixed together in sg_status now have to be
masked out individually.

The sg_status bitmask appears to make no sense at all to me.

By turning these into individual bool flags we could also do away with
all the extra SG_OVERLOADED/SG_OVERUTILIZED abstraction.

Ie. something like the patch below? Untested.

Thanks,

Ingo

=================>
From: Ingo Molnar <mingo@xxxxxxxxxx>
Date: Thu, 28 Mar 2024 12:00:14 +0100
Subject: [PATCH] sched/balancing: Simplify the sg_status bitmask and use separate ->overloaded and ->overutilized flags

SG_OVERLOADED and SG_OVERUTILIZED flags plus the sg_status bitmask are an
unnecessary complication that only make the code harder to read and slower.

We only ever set them separately:

thule:~/tip> git grep SG_OVER kernel/sched/
kernel/sched/fair.c: set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED);
kernel/sched/fair.c: *sg_status |= SG_OVERLOADED;
kernel/sched/fair.c: *sg_status |= SG_OVERUTILIZED;
kernel/sched/fair.c: *sg_status |= SG_OVERLOADED;
kernel/sched/fair.c: set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOADED);
kernel/sched/fair.c: sg_status & SG_OVERUTILIZED);
kernel/sched/fair.c: } else if (sg_status & SG_OVERUTILIZED) {
kernel/sched/fair.c: set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED);
kernel/sched/sched.h:#define SG_OVERLOADED 0x1 /* More than one runnable task on a CPU. */
kernel/sched/sched.h:#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */
kernel/sched/sched.h: set_rd_overloaded(rq->rd, SG_OVERLOADED);

And use them separately, which results in suboptimal code:

/* update overload indicator if we are at root domain */
set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOADED);

/* Update over-utilization (tipping point, U >= 0) indicator */
set_rd_overutilized_status(env->dst_rq->rd,

Introduce separate sg_overloaded and sg_overutilized flags in update_sd_lb_stats()
and its lower level functions, and change all of them to 'bool'.

Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
kernel/sched/fair.c | 33 ++++++++++++++++-----------------
kernel/sched/sched.h | 17 ++++++-----------
2 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f29efd5f19f6..ebc8d5f855de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6688,19 +6688,18 @@ static inline bool cpu_overutilized(int cpu)
/*
* overutilized value make sense only if EAS is enabled
*/
-static inline int is_rd_overutilized(struct root_domain *rd)
+static inline bool is_rd_overutilized(struct root_domain *rd)
{
return !sched_energy_enabled() || READ_ONCE(rd->overutilized);
}

-static inline void set_rd_overutilized(struct root_domain *rd,
- unsigned int status)
+static inline void set_rd_overutilized(struct root_domain *rd, bool flag)
{
if (!sched_energy_enabled())
return;

- WRITE_ONCE(rd->overutilized, status);
- trace_sched_overutilized_tp(rd, !!status);
+ WRITE_ONCE(rd->overutilized, flag);
+ trace_sched_overutilized_tp(rd, flag);
}

static inline void check_update_overutilized_status(struct rq *rq)
@@ -6711,7 +6710,7 @@ static inline void check_update_overutilized_status(struct rq *rq)
*/

if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
- set_rd_overutilized(rq->rd, SG_OVERUTILIZED);
+ set_rd_overutilized(rq->rd, 1);
}
#else
static inline void check_update_overutilized_status(struct rq *rq) { }
@@ -9940,7 +9939,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *group,
struct sg_lb_stats *sgs,
- int *sg_status)
+ bool *sg_overloaded,
+ bool *sg_overutilized)
{
int i, nr_running, local_group;

@@ -9961,10 +9961,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->sum_nr_running += nr_running;

if (nr_running > 1)
- *sg_status |= SG_OVERLOADED;
+ *sg_overloaded = 1;

if (cpu_overutilized(i))
- *sg_status |= SG_OVERUTILIZED;
+ *sg_overutilized = 1;

#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
@@ -9986,7 +9986,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
/* Check for a misfit task on the cpu */
if (sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
- *sg_status |= SG_OVERLOADED;
+ *sg_overloaded = 1;
}
} else if (env->idle && sched_reduced_capacity(rq, env->sd)) {
/* Check for a task running on a CPU with reduced capacity */
@@ -10612,7 +10612,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
unsigned long sum_util = 0;
- int sg_status = 0;
+ bool sg_overloaded = 0, sg_overutilized = 0;

do {
struct sg_lb_stats *sgs = &tmp_sgs;
@@ -10628,7 +10628,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}

- update_sg_lb_stats(env, sds, sg, sgs, &sg_status);
+ update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);

if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -10657,13 +10657,12 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd

if (!env->sd->parent) {
/* update overload indicator if we are at root domain */
- set_rd_overloaded(env->dst_rq->rd, sg_status & SG_OVERLOADED);
+ set_rd_overloaded(env->dst_rq->rd, sg_overloaded);

/* Update over-utilization (tipping point, U >= 0) indicator */
- set_rd_overutilized(env->dst_rq->rd,
- sg_status & SG_OVERUTILIZED);
- } else if (sg_status & SG_OVERUTILIZED) {
- set_rd_overutilized(env->dst_rq->rd, SG_OVERUTILIZED);
+ set_rd_overutilized(env->dst_rq->rd, sg_overloaded);
+ } else if (sg_overutilized) {
+ set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
}

update_idle_cpu_scan(env, sum_util);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 07c6669b8250..7c39dbf31f75 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -713,7 +713,7 @@ struct rt_rq {
} highest_prio;
#endif
#ifdef CONFIG_SMP
- int overloaded;
+ bool overloaded;
struct plist_head pushable_tasks;

#endif /* CONFIG_SMP */
@@ -757,7 +757,7 @@ struct dl_rq {
u64 next;
} earliest_dl;

- int overloaded;
+ bool overloaded;

/*
* Tasks on this rq that can be pushed away. They are kept in
@@ -850,10 +850,6 @@ struct perf_domain {
struct rcu_head rcu;
};

-/* Scheduling group status flags */
-#define SG_OVERLOADED 0x1 /* More than one runnable task on a CPU. */
-#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */
-
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
@@ -874,10 +870,10 @@ struct root_domain {
* - More than one runnable task
* - Running task is misfit
*/
- int overloaded;
+ bool overloaded;

/* Indicate one or more cpus over-utilized (tipping point) */
- int overutilized;
+ bool overutilized;

/*
* The bit corresponding to a CPU gets set here if such CPU has more
@@ -2540,9 +2536,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
}

#ifdef CONFIG_SMP
- if (prev_nr < 2 && rq->nr_running >= 2) {
- set_rd_overloaded(rq->rd, SG_OVERLOADED);
- }
+ if (prev_nr < 2 && rq->nr_running >= 2)
+ set_rd_overloaded(rq->rd, 1);
#endif

sched_update_tick_dependency(rq);