[GIT PULL] scheduler fixes

From: Ingo Molnar
Date: Sun Sep 17 2023 - 13:40:53 EST



Linus,

Please pull the latest sched/urgent git tree from:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-2023-09-17

# HEAD: 108af4b4bd3813610701379a58538e3339b162e4 x86/sched: Restore the SD_ASYM_PACKING flag in the DIE domain

Fix a performance regression on large SMT systems, an Intel SMT4
balancing bug, and a topology setup bug on (Intel) hybrid processors.

Thanks,

Ingo

------------------>
Ricardo Neri (1):
x86/sched: Restore the SD_ASYM_PACKING flag in the DIE domain

Shrikanth Hegde (1):
sched/fair: Optimize should_we_balance() for large SMT systems

Tim Chen (1):
sched/fair: Fix SMT4 group_smt_balance handling


arch/x86/kernel/smpboot.c | 12 +++++++++---
kernel/sched/fair.c | 27 +++++++++++++++++++++++++--
2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d40ed3a7dc23..266d05e22ac3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -579,7 +579,6 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
}


-#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC)
static inline int x86_sched_itmt_flags(void)
{
return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
@@ -603,7 +602,14 @@ static int x86_cluster_flags(void)
return cpu_cluster_flags() | x86_sched_itmt_flags();
}
#endif
-#endif
+
+static int x86_die_flags(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ return x86_sched_itmt_flags();
+
+ return 0;
+}

/*
* Set if a package/die has multiple NUMA nodes inside.
@@ -640,7 +646,7 @@ static void __init build_sched_topology(void)
*/
if (!x86_has_numa_in_package) {
x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_cpu_mask, SD_INIT_NAME(DIE)
+ cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(DIE)
};
}

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8dbff6e7ad4f..cb225921bbca 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6619,6 +6619,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* Working cpumask for: load_balance, load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
+static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);

#ifdef CONFIG_NO_HZ_COMMON

@@ -9579,7 +9580,7 @@ static inline long sibling_imbalance(struct lb_env *env,
imbalance /= ncores_local + ncores_busiest;

/* Take advantage of resource in an empty sched group */
- if (imbalance == 0 && local->sum_nr_running == 0 &&
+ if (imbalance <= 1 && local->sum_nr_running == 0 &&
busiest->sum_nr_running > 1)
imbalance = 2;

@@ -9767,6 +9768,15 @@ static bool update_sd_pick_busiest(struct lb_env *env,
break;

case group_smt_balance:
+ /*
+ * Check if we have spare CPUs on either SMT group to
+ * choose has spare or fully busy handling.
+ */
+ if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0)
+ goto has_spare;
+
+ fallthrough;
+
case group_fully_busy:
/*
* Select the fully busy group with highest avg_load. In
@@ -9806,6 +9816,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
else
return true;
}
+has_spare:

/*
* Select not overloaded group with lowest number of idle cpus
@@ -10917,6 +10928,7 @@ static int active_load_balance_cpu_stop(void *data);

static int should_we_balance(struct lb_env *env)
{
+ struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
struct sched_group *sg = env->sd->groups;
int cpu, idle_smt = -1;

@@ -10940,8 +10952,9 @@ static int should_we_balance(struct lb_env *env)
return 1;
}

+ cpumask_copy(swb_cpus, group_balance_mask(sg));
/* Try to find first idle CPU */
- for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
+ for_each_cpu_and(cpu, swb_cpus, env->cpus) {
if (!idle_cpu(cpu))
continue;

@@ -10953,6 +10966,14 @@ static int should_we_balance(struct lb_env *env)
if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
if (idle_smt == -1)
idle_smt = cpu;
+ /*
+ * If the core is not idle, and first SMT sibling which is
+ * idle has been found, then its not needed to check other
+ * SMT siblings for idleness:
+ */
+#ifdef CONFIG_SCHED_SMT
+ cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu));
+#endif
continue;
}

@@ -12918,6 +12939,8 @@ __init void init_sched_fair_class(void)
for_each_possible_cpu(i) {
zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
+ zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
+ GFP_KERNEL, cpu_to_node(i));

#ifdef CONFIG_CFS_BANDWIDTH
INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));