Re: [PATCH v4 5/7] sched/fair: skip SIS domain search if fully busy

From: Abel Wu
Date: Mon Aug 15 2022 - 05:49:18 EST


Hi Gautham, thanks for your reviewing and sorry for my late reply..

On 7/20/22 11:34 PM, Gautham R. Shenoy Wrote:

[..snip..]

@@ -6197,24 +6201,44 @@ static inline int __select_idle_cpu(int cpu, struct task_struct *p)
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
EXPORT_SYMBOL_GPL(sched_smt_present);
-static inline void set_idle_cores(int cpu, int val)
+static inline void sd_set_state(int cpu, enum sd_state state)

Nit: We are setting the state of only the LLC domain and not any other
domain via this function. So should we name it as
set_llc_state()/get_llc_state() for better readability ?


Makes sense, will rename in next version.


{
struct sched_domain_shared *sds;
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (sds)
- WRITE_ONCE(sds->has_idle_cores, val);
+ WRITE_ONCE(sds->state, state);
}
-static inline bool test_idle_cores(int cpu)
+static inline enum sd_state sd_get_state(int cpu)
{
struct sched_domain_shared *sds;
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (sds)
- return READ_ONCE(sds->has_idle_cores);
+ return READ_ONCE(sds->state);
- return false;
+ return sd_has_icpus;
+}
+
+static inline void set_idle_cores(int cpu, int idle)
^^^^^
I agree with Josh. We can use core_idle instead of idle here.

OK, I will make the param more verbose...


+{
+ sd_set_state(cpu, idle ? sd_has_icores : sd_has_icpus);
+}
+
+static inline bool test_idle_cores(int cpu)
+{
+ return sd_get_state(cpu) == sd_has_icores;
+}
+
+static inline void set_idle_cpus(int cpu, int idle)

and this one too.

+{
+ sd_set_state(cpu, idle ? sd_has_icpus : sd_is_busy);
+}
+
+static inline bool test_idle_cpus(int cpu)
+{
+ return sd_get_state(cpu) != sd_is_busy;
}
/*

[...]


@@ -8661,6 +8702,12 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
}
+static inline void sd_classify(struct sd_lb_stats *sds, struct rq *rq)
+{
+ if (sds->sd_state != sd_has_icpus && unoccupied_rq(rq))

Nit: sds->sd_state can either be sd_has_icpus or sd_is_busy. So for
better readability, we can just use the positive check

For now, yes. But sd_state can be expanded and once that happens, the
positive check could be error prone.


if (sds->sd_state == sd_is_busy && unoccupied_rq(rq))
sds->sd_state = sd_has_icpus;


+ sds->sd_state = sd_has_icpus;
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -8675,11 +8722,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
struct sg_lb_stats *sgs,
int *sg_status)
{
- int i, nr_running, local_group;
+ int i, nr_running, local_group, update_core;
memset(sgs, 0, sizeof(*sgs));
local_group = group == sds->local;
+ update_core = env->sd->flags & SD_SHARE_CPUCAPACITY;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
@@ -8692,6 +8740,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
nr_running = rq->nr_running;
sgs->sum_nr_running += nr_running;
+ if (update_core)
+ sd_classify(sds, rq);
+
if (nr_running > 1)
*sg_status |= SG_OVERLOAD;
@@ -9220,6 +9271,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
return idlest;
}
+static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ if (sds->sd_state == sd_has_icpus && !test_idle_cpus(env->dst_cpu))
+ set_idle_cpus(env->dst_cpu, true);

We could enter this if condition when env->dst_cpu is the only idle
CPU in the SMT domain (which is likely to be the case every time we do
a NEW_IDLE balance). By the end of this load-balancing round, the
env->dst_cpu can pull a task from some other CPU and thereby no longer
remain idle but the LLC state would still be sd_has_icpus.

That would mean that some CPU on this LLC would do a full scan during
the wakeup only to find no idle CPU and reset the state to
sd_is_busy. Have you seen instances where this false-positive pattern
can result in wasteful scan thereby cause a performance degradation?
Ideally it should not be worse that what we currently have.

Yes, indeed. We will talk about this later in the 7th patch.


Apart from this, patch looks good to me.

Thanks!


I would be worth the while to explore if the LLC state can be used
early on in select_task_rq_fair() to determine if we need to do a
wake-affine or allow the task to stick to its previous LLC depending
on which among the previous LLC and the waker's LLC have an idle CPU.


Sounds like a good idea!

Best Regards,
Abel