Re: [PATCH v10 2/3] sched/fair: Scan cluster before scanning LLC in wake-up path

From: Gautham R. Shenoy
Date: Tue Oct 17 2023 - 01:02:53 EST


Hello Yicong, Barry,

On Thu, Oct 12, 2023 at 08:17:06PM +0800, Yicong Yang wrote:
> From: Barry Song <song.bao.hua@xxxxxxxxxxxxx>
>
> For platforms having clusters like Kunpeng920, CPUs within the same cluster
> have lower latency when synchronizing and accessing shared resources like
> cache. Thus, this patch tries to find an idle cpu within the cluster of the
> target CPU before scanning the whole LLC to gain lower latency. This
> will be implemented in 2 steps in select_idle_sibling():
> 1. When the prev_cpu/recent_used_cpu are good wakeup candidates, use them
> if they're sharing cluster with the target CPU. Otherwise trying to
> scan for an idle CPU in the target's cluster.
> 2. Scanning the cluster prior to the LLC of the target CPU for an
> idle CPU to wakeup.
>
> Testing has been done on Kunpeng920 by pinning tasks to one numa and two
> numa. On Kunpeng920, Each numa has 8 clusters and each cluster has 4 CPUs.
>
> With this patch, We noticed enhancement on tbench and netperf within one
> numa or cross two numa on top of tip-sched-core commit
> 9b46f1abc6d4 ("sched/debug: Print 'tgid' in sched_show_task()")
>
> tbench results (node 0):
> baseline patched
> 1: 327.2833 372.4623 ( 13.80%)
> 4: 1320.5933 1479.8833 ( 12.06%)
> 8: 2638.4867 2921.5267 ( 10.73%)
> 16: 5282.7133 5891.5633 ( 11.53%)
> 32: 9810.6733 9877.3400 ( 0.68%)
> 64: 7408.9367 7447.9900 ( 0.53%)
> 128: 6203.2600 6191.6500 ( -0.19%)
> tbench results (node 0-1):
> baseline patched
> 1: 332.0433 372.7223 ( 12.25%)
> 4: 1325.4667 1477.6733 ( 11.48%)
> 8: 2622.9433 2897.9967 ( 10.49%)
> 16: 5218.6100 5878.2967 ( 12.64%)
> 32: 10211.7000 11494.4000 ( 12.56%)
> 64: 13313.7333 16740.0333 ( 25.74%)
> 128: 13959.1000 14533.9000 ( 4.12%)
>
> netperf results TCP_RR (node 0):
> baseline patched
> 1: 76546.5033 90649.9867 ( 18.42%)
> 4: 77292.4450 90932.7175 ( 17.65%)
> 8: 77367.7254 90882.3467 ( 17.47%)
> 16: 78519.9048 90938.8344 ( 15.82%)
> 32: 72169.5035 72851.6730 ( 0.95%)
> 64: 25911.2457 25882.2315 ( -0.11%)
> 128: 10752.6572 10768.6038 ( 0.15%)
>
> netperf results TCP_RR (node 0-1):
> baseline patched
> 1: 76857.6667 90892.2767 ( 18.26%)
> 4: 78236.6475 90767.3017 ( 16.02%)
> 8: 77929.6096 90684.1633 ( 16.37%)
> 16: 77438.5873 90502.5787 ( 16.87%)
> 32: 74205.6635 88301.5612 ( 19.00%)
> 64: 69827.8535 71787.6706 ( 2.81%)
> 128: 25281.4366 25771.3023 ( 1.94%)
>
> netperf results UDP_RR (node 0):
> baseline patched
> 1: 96869.8400 110800.8467 ( 14.38%)
> 4: 97744.9750 109680.5425 ( 12.21%)
> 8: 98783.9863 110409.9637 ( 11.77%)
> 16: 99575.0235 110636.2435 ( 11.11%)
> 32: 95044.7250 97622.8887 ( 2.71%)
> 64: 32925.2146 32644.4991 ( -0.85%)
> 128: 12859.2343 12824.0051 ( -0.27%)
>
> netperf results UDP_RR (node 0-1):
> baseline patched
> 1: 97202.4733 110190.1200 ( 13.36%)
> 4: 95954.0558 106245.7258 ( 10.73%)
> 8: 96277.1958 105206.5304 ( 9.27%)
> 16: 97692.7810 107927.2125 ( 10.48%)
> 32: 79999.6702 103550.2999 ( 29.44%)
> 64: 80592.7413 87284.0856 ( 8.30%)
> 128: 27701.5770 29914.5820 ( 7.99%)
>
> Note neither Kunpeng920 nor x86 Jacobsville supports SMT, so the SMT branch
> in the code has not been tested but it supposed to work.
>
> Chen Yu also noticed this will improve the performance of tbench and
> netperf on a 24 CPUs Jacobsville machine, there are 4 CPUs in one
> cluster sharing L2 Cache.
>
> Suggested-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> [https://lore.kernel.org/lkml/Ytfjs+m1kUs0ScSn@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
> Tested-by: Yicong Yang <yangyicong@xxxxxxxxxxxxx>
> Signed-off-by: Barry Song <song.bao.hua@xxxxxxxxxxxxx>
> Signed-off-by: Yicong Yang <yangyicong@xxxxxxxxxxxxx>
> Reviewed-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
> Reviewed-by: Chen Yu <yu.c.chen@xxxxxxxxx>

This version looks good to me.

Reviewed-by: Gautham R. Shenoy <gautham.shenoy@xxxxxxx>

--
Thanks and Regards
gautham.

> ---
> kernel/sched/fair.c | 41 +++++++++++++++++++++++++++++++++++++----
> kernel/sched/sched.h | 1 +
> kernel/sched/topology.c | 12 ++++++++++++
> 3 files changed, 50 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 666ece65987f..4039f9b348ec 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7259,6 +7259,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> }
> }
>
> + if (static_branch_unlikely(&sched_cluster_active)) {
> + struct sched_group *sg = sd->groups;
> +
> + if (sg->flags & SD_CLUSTER) {
> + for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
> + if (!cpumask_test_cpu(cpu, cpus))
> + continue;
> +
> + if (has_idle_core) {
> + i = select_idle_core(p, cpu, cpus, &idle_cpu);
> + if ((unsigned int)i < nr_cpumask_bits)
> + return i;
> + } else {
> + if (--nr <= 0)
> + return -1;
> + idle_cpu = __select_idle_cpu(cpu, p);
> + if ((unsigned int)idle_cpu < nr_cpumask_bits)
> + return idle_cpu;
> + }
> + }
> + cpumask_andnot(cpus, cpus, sched_group_span(sg));
> + }
> + }
> +
> for_each_cpu_wrap(cpu, cpus, target + 1) {
> if (has_idle_core) {
> i = select_idle_core(p, cpu, cpus, &idle_cpu);
> @@ -7266,7 +7290,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> return i;
>
> } else {
> - if (!--nr)
> + if (--nr <= 0)
> return -1;
> idle_cpu = __select_idle_cpu(cpu, p);
> if ((unsigned int)idle_cpu < nr_cpumask_bits)
> @@ -7395,8 +7419,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> */
> if (prev != target && cpus_share_cache(prev, target) &&
> (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
> - asym_fits_cpu(task_util, util_min, util_max, prev))
> - return prev;
> + asym_fits_cpu(task_util, util_min, util_max, prev)) {
> + if (!static_branch_unlikely(&sched_cluster_active))
> + return prev;
> +
> + if (cpus_share_resources(prev, target))
> + return prev;
> + }
>
> /*
> * Allow a per-cpu kthread to stack with the wakee if the
> @@ -7423,7 +7452,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
> cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
> asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
> - return recent_used_cpu;
> + if (!static_branch_unlikely(&sched_cluster_active))
> + return recent_used_cpu;
> +
> + if (cpus_share_resources(recent_used_cpu, target))
> + return recent_used_cpu;
> }
>
> /*
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 998f03d02de0..ef4fe7bcf740 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1859,6 +1859,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
> DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> extern struct static_key_false sched_asym_cpucapacity;
> +extern struct static_key_false sched_cluster_active;
>
> static __always_inline bool sched_asym_cpucap_active(void)
> {
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 48cd88350d18..925cd68abc8b 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -673,7 +673,9 @@ DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> +
> DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
> +DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
>
> static void update_top_cache_domain(int cpu)
> {
> @@ -2386,6 +2388,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> struct rq *rq = NULL;
> int i, ret = -ENOMEM;
> bool has_asym = false;
> + bool has_cluster = false;
>
> if (WARN_ON(cpumask_empty(cpu_map)))
> goto error;
> @@ -2514,12 +2517,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> WRITE_ONCE(d.rd->max_cpu_capacity, capacity);
>
> cpu_attach_domain(sd, d.rd, i);
> +
> + if (lowest_flag_domain(i, SD_CLUSTER))
> + has_cluster = true;
> }
> rcu_read_unlock();
>
> if (has_asym)
> static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
>
> + if (has_cluster)
> + static_branch_inc_cpuslocked(&sched_cluster_active);
> +
> if (rq && sched_debug_verbose) {
> pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
> cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
> @@ -2619,6 +2628,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map)
> if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
> static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
>
> + if (static_branch_unlikely(&sched_cluster_active))
> + static_branch_dec_cpuslocked(&sched_cluster_active);
> +
> rcu_read_lock();
> for_each_cpu(i, cpu_map)
> cpu_attach_domain(NULL, &def_root_domain, i);
> --
> 2.24.0
>