Re: [PATCH 2/2] sched/fair: Adjust the allowed NUMA imbalance when SD_NUMA spans multiple LLCs

From: Vincent Guittot
Date: Tue Dec 21 2021 - 12:13:32 EST


On Mon, 20 Dec 2021 at 12:12, Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx> wrote:
>
> (sorry for the delay, was offline for a few days)
>
> On Fri, Dec 17, 2021 at 12:03:06AM +0530, Gautham R. Shenoy wrote:
> > Hello Mel,
> >
> > On Wed, Dec 15, 2021 at 12:25:50PM +0000, Mel Gorman wrote:
> > > On Wed, Dec 15, 2021 at 05:22:30PM +0530, Gautham R. Shenoy wrote:
> >
> > [..SNIP..]
> >

[snip]

>
> To avoid the corner case, we'd need to explicitly favour spreading early
> and assume wakeup will pull communicating tasks together and NUMA
> balancing migrate the data after some time which looks like
>
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index c07bfa2d80f2..54f5207154d3 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -93,6 +93,7 @@ struct sched_domain {
> unsigned int busy_factor; /* less balancing by factor if busy */
> unsigned int imbalance_pct; /* No balance until over watermark */
> unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
> + unsigned int imb_numa_nr; /* Nr imbalanced tasks allowed between nodes */

So now you compute an allowed imbalance level (imb_numa_nr) instead of
using 25% of sd->span_weight or 25% of busiest->group_weight, and you
adjust this new imb_numa_nr according to the topology.

That makes sense.
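(To put hypothetical numbers on the difference: for a NUMA domain
spanning 128 CPUs, the old find_idlest_group() check tolerated the
imbalance while local_sgs.sum_nr_running < 128 >> 2 = 32, regardless of
LLC size, whereas the new limit is sd->imb_numa_nr, derived from the
LLC topology below.)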

>
> int nohz_idle; /* NOHZ IDLE status */
> int flags; /* See SD_* */
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 0a969affca76..df0e84462e62 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1489,6 +1489,7 @@ struct task_numa_env {
>
> int src_cpu, src_nid;
> int dst_cpu, dst_nid;
> + int imb_numa_nr;
>
> struct numa_stats src_stats, dst_stats;
>
> @@ -1504,7 +1505,8 @@ static unsigned long cpu_load(struct rq *rq);
> static unsigned long cpu_runnable(struct rq *rq);
> static unsigned long cpu_util(int cpu);
> static inline long adjust_numa_imbalance(int imbalance,
> - int dst_running, int dst_weight);
> + int dst_running,
> + int imb_numa_nr);
>
> static inline enum
> numa_type numa_classify(unsigned int imbalance_pct,
> @@ -1885,7 +1887,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
> dst_running = env->dst_stats.nr_running + 1;
> imbalance = max(0, dst_running - src_running);
> imbalance = adjust_numa_imbalance(imbalance, dst_running,
> - env->dst_stats.weight);
> + env->imb_numa_nr);
>
> /* Use idle CPU if there is no imbalance */
> if (!imbalance) {
> @@ -1950,8 +1952,10 @@ static int task_numa_migrate(struct task_struct *p)
> */
> rcu_read_lock();
> sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
> - if (sd)
> + if (sd) {
> env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
> + env.imb_numa_nr = sd->imb_numa_nr;
> + }
> rcu_read_unlock();
>
> /*
> @@ -9050,9 +9054,9 @@ static bool update_pick_idlest(struct sched_group *idlest,
> * This is an approximation as the number of running tasks may not be
> * related to the number of busy CPUs due to sched_setaffinity.
> */
> -static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
> +static inline bool allow_numa_imbalance(int dst_running, int imb_numa_nr)
> {
> - return (dst_running < (dst_weight >> 2));
> + return dst_running < imb_numa_nr;
> }
>
> /*
> @@ -9186,12 +9190,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
> return idlest;
> #endif
> /*
> - * Otherwise, keep the task on this node to stay close
> - * its wakeup source and improve locality. If there is
> - * a real need of migration, periodic load balance will
> - * take care of it.
> + * Otherwise, keep the task on this node to stay local
> + * to its wakeup source if the number of running tasks
> + * are below the allowed imbalance. If there is a real
> + * need of migration, periodic load balance will take
> + * care of it.
> */
> - if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
> + if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->imb_numa_nr))
> return NULL;
> }
>
> @@ -9280,19 +9285,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> }
> }
>
> -#define NUMA_IMBALANCE_MIN 2
> -
> static inline long adjust_numa_imbalance(int imbalance,
> - int dst_running, int dst_weight)
> + int dst_running, int imb_numa_nr)
> {
> - if (!allow_numa_imbalance(dst_running, dst_weight))
> + if (!allow_numa_imbalance(dst_running, imb_numa_nr))
> return imbalance;
>
> - /*
> - * Allow a small imbalance based on a simple pair of communicating
> - * tasks that remain local when the destination is lightly loaded.
> - */
> - if (imbalance <= NUMA_IMBALANCE_MIN)
> + if (imbalance <= imb_numa_nr)

Isn't this always true ?

imbalance is "always" < dst_running, as imbalance is usually the number
of tasks that we would like to migrate, and we only get past
allow_numa_imbalance() when dst_running < imb_numa_nr.

> return 0;
>
> return imbalance;
> @@ -9397,7 +9396,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
> /* Consider allowing a small imbalance between NUMA groups */
> if (env->sd->flags & SD_NUMA) {
> env->imbalance = adjust_numa_imbalance(env->imbalance,
> - busiest->sum_nr_running, env->sd->span_weight);
> + busiest->sum_nr_running, env->sd->imb_numa_nr);
> }
>
> return;
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index d201a7052a29..1fa3e977521d 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -2242,6 +2242,55 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> }
> }
>
> + /*
> + * Calculate an allowed NUMA imbalance such that LLCs do not get
> + * imbalanced.
> + */
> + for_each_cpu(i, cpu_map) {
> + unsigned int imb = 0;
> + unsigned int imb_span = 1;
> +
> + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
> + struct sched_domain *child = sd->child;
> +
> + if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child &&
> + (child->flags & SD_SHARE_PKG_RESOURCES)) {

sched_domains have not been degenerated yet, so what you find here is the DIE domain

> + struct sched_domain *top, *top_p;
> + unsigned int llc_sq;
> +
> + /*
> + * nr_llcs = (sd->span_weight / llc_weight);
> + * imb = (llc_weight / nr_llcs) >> 2

It would be good to add a comment explaining why 25% of the LLC weight
divided by the number of LLCs in a node is the right value.
For example, why is it better than just 25% of the LLC weight ?
Do you want to allow the same imbalance at node level regardless of the
number of LLCs in the node ?
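To make the question concrete with hypothetical numbers: for a node of
128 CPUs split into 8 LLCs of 16 CPUs each, the quoted code gives

	imb = max(2, (16 * 16 / 128) >> 2) = max(2, 0) = 2

whereas 25% of the LLC weight alone would be 16 >> 2 = 4.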

> + *
> + * is equivalent to
> + *
> + * imb = (llc_weight^2 / sd->span_weight) >> 2
> + *
> + */
> + llc_sq = child->span_weight * child->span_weight;
> +
> + imb = max(2U, ((llc_sq / sd->span_weight) >> 2));
> + sd->imb_numa_nr = imb;
> +
> + /*
> + * Set span based on top domain that places
> + * tasks in sibling domains.
> + */
> + top = sd;
> + top_p = top->parent;
> + while (top_p && (top_p->flags & SD_PREFER_SIBLING)) {

Why are you looping on SD_PREFER_SIBLING instead of SD_NUMA ?
Apart from the heterogeneous domains (SD_ASYM_CPUCAPACITY), which I'm
not sure you want to take into account here, only NUMA domains don't
have SD_PREFER_SIBLING.
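i.e. something like this untested variant of the quoted loop (just to
illustrate the suggestion):

	top = sd;
	top_p = top->parent;
	/* walk up until the parent is the first NUMA level */
	while (top_p && !(top_p->flags & SD_NUMA)) {
		top = top->parent;
		top_p = top->parent;
	}
	imb_span = top_p ? top_p->span_weight : sd->span_weight;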

> + top = top->parent;
> + top_p = top->parent;
> + }
> + imb_span = top_p ? top_p->span_weight : sd->span_weight;
> + } else {
> + int factor = max(1U, (sd->span_weight / imb_span));
> +
> + sd->imb_numa_nr = imb * factor;
> + }
> + }
> + }
> +
> /* Calculate CPU capacity for physical packages and nodes */
> for (i = nr_cpumask_bits-1; i >= 0; i--) {
> if (!cpumask_test_cpu(i, cpu_map))