[RFC PATCH 3/4] sched/fair: Calculate the scan depth for idle balance based on system utilization

From: Chen Yu
Date: Mon Jun 12 2023 - 05:01:33 EST


When a CPU is about to enter idle, it invokes newidle_balance() to pull
some tasks from other runqueues. Although there is a per-domain
max_newidle_lb_cost to throttle newidle_balance(), it would be
good to further limit the scan based on overall system utilization.
The reason is that nothing prevents newidle_balance() from being
launched simultaneously on multiple CPUs. Since each
newidle_balance() has to traverse all the CPUs to calculate the
statistics one by one, the total time spent in newidle_balance()
could be O(n^2). This is not good for performance or power saving.

For example, sqlite spends quite some time in newidle_balance()
on Intel Sapphire Rapids, which has 2 x 56C/112T = 224 CPUs:
6.69% 0.09% sqlite3 [kernel.kallsyms] [k] newidle_balance
5.39% 4.71% sqlite3 [kernel.kallsyms] [k] update_sd_lb_stats

Based on this observation, limit the scan depth of newidle_balance()
by considering the utilization of the LLC domain. Let the number of
scanned groups be a linear function of the utilization ratio:

nr_groups_to_scan = nr_groups * (1 - util_ratio)

Besides, save the total_load and total_capacity of the current
sched domain in each periodic load balance. These statistics
can be reused later by a CPU_NEWLY_IDLE load balance if it quits
the scan early. Introduce a sched feature ILB_UTIL to
control this.

Suggested-by: Tim Chen <tim.c.chen@xxxxxxxxx>
Signed-off-by: Chen Yu <yu.c.chen@xxxxxxxxx>
---
include/linux/sched/topology.h | 4 ++++
kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++++++++
kernel/sched/features.h | 1 +
3 files changed, 39 insertions(+)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 1faececd5694..d7b2bac9bdf3 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -82,6 +82,10 @@ struct sched_domain_shared {
atomic_t nr_busy_cpus;
int has_idle_cores;
int nr_idle_scan;
+ /* ilb scan depth and load balance statistic snapshot */
+ int ilb_nr_scan;
+ unsigned long ilb_total_load;
+ unsigned long ilb_total_capacity;
};

struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b3a24aead848..f999e838114e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10122,6 +10122,39 @@ static void update_idle_cpu_scan(struct lb_env *env,
WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
}

+/* Periodic balance only: publish the newidle scan depth and load snapshot. */
+static void update_ilb_group_scan(struct lb_env *env,
+				  unsigned long sum_util,
+				  struct sched_domain_shared *sd_share,
+				  struct sd_lb_stats *sds)
+{
+	u64 nr_busy;
+	unsigned int total_cap;
+	int nr_scan;
+
+	if (!sched_feat(ILB_UTIL) || env->idle == CPU_NEWLY_IDLE || !sd_share)
+		return;
+
+	/*
+	 * nr_scan = nr_groups * (1 - util_ratio), where
+	 * util_ratio = sum_util / (span_weight * SCHED_CAPACITY_SCALE).
+	 * Widen before multiplying, hand do_div() a 32-bit divisor, and
+	 * clamp at 0 so a ratio above 1 cannot yield a negative depth.
+	 */
+	total_cap = env->sd->span_weight * SCHED_CAPACITY_SCALE;
+	nr_busy = (u64)env->sd->nr_groups * sum_util;
+	do_div(nr_busy, total_cap);
+	nr_scan = nr_busy < env->sd->nr_groups ? env->sd->nr_groups - (int)nr_busy : 0;
+	if (nr_scan != sd_share->ilb_nr_scan)
+		WRITE_ONCE(sd_share->ilb_nr_scan, nr_scan);
+
+	/* Snapshot the periodic statistics; skip stores of unchanged values. */
+	if (sds->total_load != sd_share->ilb_total_load)
+		WRITE_ONCE(sd_share->ilb_total_load, sds->total_load);
+	if (sds->total_capacity != sd_share->ilb_total_capacity)
+		WRITE_ONCE(sd_share->ilb_total_capacity, sds->total_capacity);
+}
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
@@ -10200,6 +10233,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
}

update_idle_cpu_scan(env, sum_util, sd_share);
+ update_ilb_group_scan(env, sum_util, sd_share, sds);
}

/**
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index ee7f23c76bd3..8f6e5b08408d 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -85,6 +85,7 @@ SCHED_FEAT(RT_PUSH_IPI, true)

SCHED_FEAT(RT_RUNTIME_SHARE, false)
SCHED_FEAT(LB_MIN, false)
+SCHED_FEAT(ILB_UTIL, true)
SCHED_FEAT(ATTACH_AGE_LOAD, true)

SCHED_FEAT(WA_IDLE, true)
--
2.25.1