Re: [RFC PATCH 3/4] sched/fair: Calculate the scan depth for idle balance based on system utilization

From: Chen Yu
Date: Wed Jun 21 2023 - 03:30:03 EST


Hi Gautham,
On 2023-06-15 at 11:31:07 +0530, Gautham R. Shenoy wrote:
> Hello Chen Yu,
>
>
> On Tue, Jun 13, 2023 at 12:18:57AM +0800, Chen Yu wrote:
> > When CPU is about to enter idle, it invokes newidle_balance() to pull
> > some tasks from other runqueues. Although there is per domain
> > max_newidle_lb_cost to throttle the newidle_balance(), it would be
> > good to further limit the scan based on overall system utilization.
> > The reason is that there is no limitation for newidle_balance() to
> > launch this balance simultaneously on multiple CPUs. Since each
> > newidle_balance() has to traverse all the CPUs to calculate the
> > statistics one by one, this total time cost on newidle_balance()
> > could be O(n^2). This is not good for performance or power saving.
> >
> > For example, sqlite has spent quite some time on newidle balance()
> > on Intel Sapphire Rapids, which has 2 x 56C/112T = 224 CPUs:
> > 6.69% 0.09% sqlite3 [kernel.kallsyms] [k] newidle_balance
> > 5.39% 4.71% sqlite3 [kernel.kallsyms] [k] update_sd_lb_stats
> >
> > Based on this observation, limit the scan depth of newidle_balance()
> > by considering the utilization of the LLC domain. Let the number of
> > scanned groups be a linear function of the utilization ratio:
> >
>
> Is there any particular reason why this is being limited only to the
> LLC domain ?
>
> On architectures where the LLC domain may not be so large (POWER9/10,
> AMD), the additional cost is usually paid at the higher domains where
> the number of groups is greater / equal to the number of groups in the
> LLC domain and where sd_span is pretty large. It would be good to
> explore avoiding the scan cost on those domains as well, right?
>
> > nr_groups_to_scan = nr_groups * (1 - util_ratio)
>
> If we can extend this logic to higher domains, on a Zen3 1 Socket
> server with 128 CPUs at the DIE domain containing 8 groups, we can
> expect a significant reduction in the time spent doing
> update_sg_lb_stats() at higher utilizations.
>
> util_ratio nr_groups_to_scan nr_cpus_scanned
> ========================================================
> 0.9 1 16 (-87.5%)
> 0.75 2 32 (-75%)
> 0.5 4 64 (-50%)
> 0.25 6 96 (-25%)
> 0.1 7 112 (-12.5%)
>
>
> On a Zen 4 1 socket server with 192 CPUs at the DIE domain containing
> 12 groups, values will be:
>
> util_ratio nr_groups_to_scan nr_cpus_scanned
> ========================================================
> 0.9 1 16 (-91%)
> 0.75 3 48 (-75%)
> 0.5 6 96 (-50%)
> 0.25 9 144 (-25%)
> 0.1 10 160 (-16.7%)
>
I have an idea to limit the scan depth of newidle balance for big domains,
that is, domains with at least as many CPUs as the LLC (MC domain).
However, in the current kernel only a domain with the SD_SHARE_PKG_RESOURCES
flag set gets a shared struct sched_domain_shared among the CPUs in that
domain. This is reasonable because accessing struct sched_domain_shared is
cheaper when the CPUs share a cache. Since ILB_UTIL relies on
sched_domain_shared to get the scan depth, I removed the
SD_SHARE_PKG_RESOURCES restriction during the sched_domain_shared assignment.
If a non-LLC domain's sched_domain_shared is only used for ILB_UTIL,
the overhead should not be too high (only the periodic load balance would
write to sched_domain_shared). Here is an untested patch which shows what
I'm thinking of; I'll do some refinement based on it.
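
As a rough illustration (not part of the patch), the standalone sketch below
shows how the integer math in update_ilb_group_scan() maps utilization to a
scan depth. It assumes SCHED_CAPACITY_SCALE is 1024, and the helper name
ilb_scan_depth() is only for illustration:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

/* nr_groups_to_scan = nr_groups * (1 - util_ratio), in integer math */
static int ilb_scan_depth(int nr_groups, int span_weight, unsigned long sum_util)
{
	unsigned long long scanned = (unsigned long long)nr_groups * sum_util;

	/* nr_groups * util_ratio, rounded down */
	scanned /= (unsigned long long)span_weight * SCHED_CAPACITY_SCALE;

	/* nr_groups * (1 - util_ratio) */
	return nr_groups - (int)scanned;
}

int main(void)
{
	/* Zen3 example from the table above: 8 groups, 128 CPUs, 75% busy */
	unsigned long sum_util = 128 * SCHED_CAPACITY_SCALE * 3 / 4;

	printf("groups to scan: %d\n", ilb_scan_depth(8, 128, sum_util));
	return 0;
}

This prints 2, which matches the -75% row of the Zen3 table above.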

thanks,
Chenyu

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 67b573d5bf28..ce7ffbb7b3f8 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -82,6 +82,10 @@ struct sched_domain_shared {
atomic_t nr_busy_cpus;
int has_idle_cores;
int nr_idle_scan;
+ /* ilb scan depth and load balance statistic snapshot */
+ int ilb_nr_scan;
+ unsigned long ilb_total_load;
+ unsigned long ilb_total_capacity;
};

struct sched_domain {
@@ -152,6 +156,7 @@ struct sched_domain {
struct sched_domain_shared *shared;

unsigned int span_weight;
+ unsigned int nr_groups;
/*
* Span of all CPUs in this domain.
*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d724215826ae..34619dbb2f4e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10162,6 +10162,54 @@ static void update_idle_cpu_scan(struct lb_env *env,
WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
}

+/*
+ * Get the domain shared information of dst CPU.
+ */
+static struct sched_domain_shared *get_sd_shared(struct lb_env *env)
+{
+ /*
+ * Do not consider the domains smaller than LLC because those
+ * small domains have low cost on idle load balance.
+ */
+ if (env->sd->span_weight < per_cpu(sd_llc_size, env->dst_cpu))
+ return NULL;
+
+ return env->sd->shared;
+}
+
+static void update_ilb_group_scan(struct lb_env *env,
+ unsigned long sum_util,
+ struct sched_domain_shared *sd_share,
+ struct sd_lb_stats *sds)
+{
+ u64 tmp, nr_scan;
+
+ if (!sched_feat(ILB_UTIL) || env->idle == CPU_NEWLY_IDLE)
+ return;
+
+ if (!sd_share)
+ return;
+ /*
+ * Limit the newidle balance scan depth based on overall system
+ * utilization:
+ * nr_groups_scan = nr_groups * (1 - util_ratio)
+ * and util_ratio = sum_util / (sd_weight * SCHED_CAPACITY_SCALE)
+ */
+ nr_scan = env->sd->nr_groups * sum_util;
+ tmp = env->sd->span_weight * SCHED_CAPACITY_SCALE;
+ do_div(nr_scan, tmp);
+ nr_scan = env->sd->nr_groups - nr_scan;
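+ /* Only update the shared field when the value changes. */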
+ if ((int)nr_scan != sd_share->ilb_nr_scan)
+ WRITE_ONCE(sd_share->ilb_nr_scan, (int)nr_scan);
+
+ /* save the statistic snapshot of the periodic load balance */
+ if (sds->total_load != sd_share->ilb_total_load)
+ WRITE_ONCE(sd_share->ilb_total_load, sds->total_load);
+
+ if (sds->total_capacity != sd_share->ilb_total_capacity)
+ WRITE_ONCE(sd_share->ilb_total_capacity, sds->total_capacity);
+}
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
@@ -10170,11 +10218,17 @@ static void update_idle_cpu_scan(struct lb_env *env,

static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
+ struct sched_domain_shared *sd_share = get_sd_shared(env);
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
unsigned long sum_util = 0;
- int sg_status = 0;
+ int sg_status = 0, nr_scan_ilb;
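+ /*
+ * Throttle only newidle balance, and only after a periodic load
+ * balance has populated the statistics snapshot.
+ */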
+ bool ilb_util_enabled = sched_feat(ILB_UTIL) && env->idle == CPU_NEWLY_IDLE &&
+ sd_share && READ_ONCE(sd_share->ilb_total_capacity);
+
+ if (ilb_util_enabled)
+ nr_scan_ilb = sd_share->ilb_nr_scan;

do {
struct sg_lb_stats *sgs = &tmp_sgs;
@@ -10192,6 +10246,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd

update_sg_lb_stats(env, sds, sg, sgs, &sg_status);

+ if (ilb_util_enabled && --nr_scan_ilb <= 0) {
+ /*
+ * Borrow the statistic of previous periodic load balance.
+ * The data might be stale and it is a trade-off.
+ */
+ sds->total_load = READ_ONCE(sd_share->ilb_total_load);
+ sds->total_capacity = READ_ONCE(sd_share->ilb_total_capacity);
+
+ break;
+ }
+
if (local_group)
goto next_group;

@@ -10239,6 +10304,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
}

update_idle_cpu_scan(env, sum_util);
+ update_ilb_group_scan(env, sum_util, sd_share, sds);
}

/**
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index ee7f23c76bd3..8f6e5b08408d 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -85,6 +85,7 @@ SCHED_FEAT(RT_PUSH_IPI, true)

SCHED_FEAT(RT_RUNTIME_SHARE, false)
SCHED_FEAT(LB_MIN, false)
+SCHED_FEAT(ILB_UTIL, true)
SCHED_FEAT(ATTACH_AGE_LOAD, true)

SCHED_FEAT(WA_IDLE, true)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d3a3b2646ec4..98bfac5f7836 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1023,7 +1023,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
struct cpumask *covered = sched_domains_tmpmask;
struct sd_data *sdd = sd->private;
struct sched_domain *sibling;
- int i;
+ int i, nr_groups = 0;

cpumask_clear(covered);

@@ -1087,6 +1087,8 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
if (!sg)
goto fail;

+ nr_groups++;
+
sg_span = sched_group_span(sg);
cpumask_or(covered, covered, sg_span);

@@ -1100,6 +1102,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
last->next = first;
}
sd->groups = first;
+ sd->nr_groups = nr_groups;

return 0;

@@ -1233,7 +1236,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
struct sd_data *sdd = sd->private;
const struct cpumask *span = sched_domain_span(sd);
struct cpumask *covered;
- int i;
+ int i, nr_groups = 0;

lockdep_assert_held(&sched_domains_mutex);
covered = sched_domains_tmpmask;
@@ -1248,6 +1251,8 @@ build_sched_groups(struct sched_domain *sd, int cpu)

sg = get_group(i, sdd);

+ nr_groups++;
+
cpumask_or(covered, covered, sched_group_span(sg));

if (!first)
@@ -1258,6 +1263,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
}
last->next = first;
sd->groups = first;
+ sd->nr_groups = nr_groups;

return 0;
}
@@ -1641,14 +1647,12 @@ sd_init(struct sched_domain_topology_level *tl,
}

/*
- * For all levels sharing cache; connect a sched_domain_shared
+ * For all levels; connect a sched_domain_shared
* instance.
*/
- if (sd->flags & SD_SHARE_PKG_RESOURCES) {
- sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
- atomic_inc(&sd->shared->ref);
- atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
- }
+ sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+ atomic_inc(&sd->shared->ref);
+ atomic_set(&sd->shared->nr_busy_cpus, sd_weight);

sd->private = sdd;

--
2.25.1