Re: [tip: sched/core] sched/fair: Multi-LLC select_idle_sibling()

From: Peter Zijlstra
Date: Thu Jun 01 2023 - 11:35:33 EST


On Thu, Jun 01, 2023 at 04:47:06PM +0200, Peter Zijlstra wrote:

> One way to fix all this would be by having arch/x86/kernel/smpboot.c set
> an AMD specific set_sched_topology() that has a CCD domain above the MC
> and below the DIE domain that groups 'near' CCDs together based on some
> AMD specific topology information.
>
> Then for small systems that will probably be just a single CCD domain
> and the degenerate code will make it go away, but for these large
> systems it will do what is right for their respective configuration.
>
> Then, since this new multi-llc code uses MC->parent it will end up on
> the fancy new CCD domain and not scan the *entire* socket.
>
> Hmm?

Something like the (untested) below might be a nice base to go from.

Then all you have to do is add something like:

if (x86_has_ccd_topology) {
x86_topology[i++] = (struct sched_domain_topology_level){
cpu_ccd_mask, SD_INIT_NAME(CCD)
};
}

(and construct cpu_ccd_mask obviously...)

---
arch/x86/kernel/smpboot.c | 94 ++++++++++++++++++++++-------------------------
1 file changed, 43 insertions(+), 51 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 34066f6735dd..0a22d719b6b6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -563,50 +563,57 @@ static int x86_cluster_flags(void)
#endif
#endif

-static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
-#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-#ifdef CONFIG_SCHED_CLUSTER
- { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
-#endif
-#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
-#endif
- { NULL, },
-};
+/*
+ * Set if a package/die has multiple NUMA nodes inside.
+ * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
+ * Sub-NUMA Clustering have this.
+ */
+static bool x86_has_numa_in_package;

-static struct sched_domain_topology_level x86_hybrid_topology[] = {
-#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
-#endif
- { cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(DIE) },
- { NULL, },
-};
+static struct sched_domain_topology_level x86_topology[6];
+
+static void __init build_sched_topology(void)
+{
+ int i = 0;

-static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
+ x86_topology[i++] = (struct sched_domain_topology_level){
+ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT)
+ };
#endif
#ifdef CONFIG_SCHED_CLUSTER
- { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
+ /*
+ * For now, skip the cluster domain on Hybrid.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
+ x86_topology[i++] = (struct sched_domain_topology_level){
+ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
+ };
+ }
#endif
#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
+ x86_topology[i++] = (struct sched_domain_topology_level){
+ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
+ };
#endif
- { cpu_cpu_mask, SD_INIT_NAME(DIE) },
- { NULL, },
-};
+ /*
+ * When there is NUMA topology inside the package skip the DIE domain
+ * since the NUMA domains will auto-magically create the right spanning
+ * domains based on the SLIT.
+ */
+ if (!x86_has_numa_in_package) {
+ x86_topology[i++] = (struct sched_domain_topology_level){
+ cpu_cpu_mask, SD_INIT_NAME(DIE)
+ };
+ }

-/*
- * Set if a package/die has multiple NUMA nodes inside.
- * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
- * Sub-NUMA Clustering have this.
- */
-static bool x86_has_numa_in_package;
+ /*
+ * There must be one trailing NULL entry left.
+ */
+ BUG_ON(i >= ARRAY_SIZE(x86_topology)-1);
+
+ set_sched_topology(x86_topology);
+}

void set_cpu_sibling_map(int cpu)
{
@@ -1390,15 +1397,6 @@ void __init smp_prepare_cpus_common(void)
zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
}

- /*
- * Set 'default' x86 topology, this matches default_topology() in that
- * it has NUMA nodes as a topology level. See also
- * native_smp_cpus_done().
- *
- * Must be done before set_cpus_sibling_map() is ran.
- */
- set_sched_topology(x86_topology);
-
set_cpu_sibling_map(0);
}

@@ -1490,13 +1488,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
pr_debug("Boot done\n");

calculate_max_logical_packages();
-
- /* XXX for now assume numa-in-package and hybrid don't overlap */
- if (x86_has_numa_in_package)
- set_sched_topology(x86_numa_in_package_topology);
- if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
- set_sched_topology(x86_hybrid_topology);
-
+ build_sched_topology();
nmi_selftest();
impress_friends();
cache_aps_init();