[PATCH v4 7/7] sched/fair: de-entropy for SIS filter

From: Abel Wu
Date: Sun Jun 19 2022 - 08:05:59 EST


When updating the core state, there are currently two main problems
that can pollute the SIS filter:

- The update happens before task migration, so if dst_cpu
is selected for propagation, it might be fed with tasks
soon, and the effort we spent amounts to nothing more
than setting a busy cpu in the SIS filter. On the other
hand, it is important to update as early as possible to
keep the filter fresh, so it's not wise to delay the
update to the end of load balancing (see the first
sketch below).

- Propagating false negatives hurts performance since some
idle cpus could become out of reach. So in general we
aggressively propagate idle cpus, but allow false positives
to continue to exist for a while, which may lead to the
filter being fully polluted.

The pain can be relieved by forcing a correction of the filter
once false positives have been continuously detected, as the
second sketch below illustrates.
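
The intended degradation can be illustrated with the following
standalone sketch, again not kernel code: filter_update() and
force_correction are hypothetical names, and the transition rules
are the simplified ones described above. A domain in sd_has_icpus
that scans busy is first demoted to sd_may_idle, tolerating false
positives; only when the next pass still scans busy is the filter
forcibly corrected:

#include <stdbool.h>
#include <stdio.h>

enum sd_state {
	sd_has_icores,
	sd_has_icpus,
	sd_may_idle,
	sd_is_busy
};

/* One update step; @scanned is what this load-balance pass saw. */
static enum sd_state filter_update(enum sd_state cur, enum sd_state scanned,
				   bool *force_correction)
{
	*force_correction = false;

	/* Idle cpus seen: propagate eagerly to avoid false negatives. */
	if (scanned != sd_is_busy)
		return scanned;

	switch (cur) {
	case sd_is_busy:
		return sd_is_busy;	/* still busy, nothing to update */
	case sd_has_icpus:
		return sd_may_idle;	/* tolerate false positives for now */
	default:
		/* False positives persisted: correct the filter now. */
		*force_correction = true;
		return sd_is_busy;
	}
}

int main(void)
{
	bool fix;
	enum sd_state s = sd_has_icpus;

	s = filter_update(s, sd_is_busy, &fix);	/* -> sd_may_idle, no fix */
	s = filter_update(s, sd_is_busy, &fix);	/* -> sd_is_busy, fix == true */
	printf("state=%d force_correction=%d\n", s, fix);
	return 0;
}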

Signed-off-by: Abel Wu <wuyun.abel@xxxxxxxxxxxxx>
---
include/linux/sched/topology.h | 7 +++++
kernel/sched/fair.c | 51 ++++++++++++++++++++++++++++++++--
2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index b93edf587d84..e3552ce192a9 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -91,6 +91,12 @@ struct sched_group;
* search, and is also used as a fallback state of the other
* states.
*
+ * - sd_may_idle
+ * This state implies instability of the SIS filter: some
+ * bits of it may be out of date. This state is only used in
+ * SMT domains as an intermediate state between sd_has_icpus
+ * and sd_is_busy.
+ *
* - sd_is_busy
* This state indicates there are no unoccupied cpus in this
* domain. So for LLC domains, it gives the hint on whether
@@ -111,6 +117,7 @@ struct sched_group;
enum sd_state {
sd_has_icores,
sd_has_icpus,
+ sd_may_idle,
sd_is_busy
};

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d55fdcedf2c0..9713d183d35e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8768,6 +8768,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,

for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
+ bool update = update_core && (env->dst_cpu != i);

sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util_cfs(i);
@@ -8777,7 +8778,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
nr_running = rq->nr_running;
sgs->sum_nr_running += nr_running;

- if (update_core)
+ /*
+ * The dst_cpu is not preferred since it might
+ * be fed with tasks soon.
+ */
+ if (update)
sd_classify(sds, rq, i);

if (nr_running > 1)
@@ -8801,7 +8806,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
* and fed with tasks, so we'd better choose
* a candidate in the opposite way.
*/
- sds->idle_cpu = i;
+ if (update)
+ sds->idle_cpu = i;
sgs->idle_cpus++;

/* Idle cpu can't have misfit task */
@@ -9321,7 +9327,7 @@ static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
{
struct sched_domain_shared *sd_smt_shared = env->sd->shared;
enum sd_state new = sds->sd_state;
- int this = env->dst_cpu;
+ int icpu = sds->idle_cpu, this = env->dst_cpu;

/*
* Parallel updating can hardly contribute accuracy to
@@ -9331,6 +9337,22 @@ static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
if (cmpxchg(&sd_smt_shared->updating, 0, 1))
return;

+ /*
+ * The dst_cpu is likely to be fed with tasks soon.
+ * If it is the only unoccupied cpu in this domain,
+ * we still handle it the same way as sd_has_icpus,
+ * but turn the SMT domain into the unstable state
+ * rather than waiting until the end of load
+ * balancing, since it's also important to update
+ * the filter as early as possible to keep it fresh.
+ */
+ if (new == sd_is_busy) {
+ if (idle_cpu(this) || sched_idle_cpu(this)) {
+ new = sd_may_idle;
+ icpu = this;
+ }
+ }
+
/*
* There is at least one unoccupied cpu available, so
* propagate it to the filter to avoid false negative
@@ -9338,6 +9360,12 @@ static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
* idle cpus thus throughput downgraded.
*/
if (new != sd_is_busy) {
+ /*
+ * The sd_may_idle state is taken into
+ * consideration as well, because at this
+ * point we can't actually know whether
+ * task migrations will happen or not.
+ */
if (!test_idle_cpus(this))
set_idle_cpus(this, true);
} else {
@@ -9347,9 +9375,26 @@ static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
*/
if (sd_smt_shared->state == sd_is_busy)
goto out;
+
+ /*
+ * Allow false positives to exist for some
+ * time, trading accuracy of the filter for
+ * reduced cache traffic.
+ */
+ if (sd_smt_shared->state == sd_has_icpus) {
+ new = sd_may_idle;
+ goto update;
+ }
+
+ /*
+ * If the false positive issue has persisted
+ * for a while, a correction of the filter
+ * is needed.
+ */
}

- sd_update_icpus(this, sds->idle_cpu);
+ sd_update_icpus(this, icpu);
+update:
sd_smt_shared->state = new;
out:
xchg(&sd_smt_shared->updating, 0);
--
2.31.1