[RFC PATCH 1/2] sched/fair: Adjust CFS load balancing for machines with QEMU native CPU topology

From: Kenan.Liu
Date: Thu Jul 20 2023 - 05:00:21 EST


From: "Kenan.Liu" <Kenan.Liu@xxxxxxxxxxxxxxxxx>

Multithreaded workloads running in a QEMU VM may encounter an unexpected
phenomenon: one hyperthread of a physical core is busy while its sibling
is idle. The main reason is that hyperthread indexes are consecutive in
the QEMU native x86 CPU model, which differs from the physical topology.
With the current kernel scheduler implementation, hyperthreads with even
ID numbers are picked with much higher probability during load balancing
and task placement. To resolve the imbalance, on machines with multiple
cores where hyperthread indexes are consecutive within each core, adjust
the result of select_idle_core() according to the hyperthread on which
the task ran before.

Signed-off-by: Kenan.Liu <Kenan.Liu@xxxxxxxxxxxxxxxxx>
Signed-off-by: Ben Luo <luoben@xxxxxxxxxxxxxxxxx>
---
kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++++++++++--
1 file changed, 36 insertions(+), 2 deletions(-)
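
Note for the RFC discussion, not for merging: the sketch below is a
minimal userspace model, assuming 2 threads per core, of the
sibling-numbering difference described in the changelog and of the
even/odd selection rule the patch applies in select_idle_core(). The
helpers pick_idle_thread() and neighbour_sibling() are invented for the
example and do not exist in the kernel.

/*
 * Illustration only, not part of the patch.  Models the two numbering
 * schemes with 2 threads per core:
 *
 *   QEMU native x86 model:  core 0 -> CPUs {0,1}, core 1 -> {2,3}, ...
 *   typical bare metal:     siblings are not adjacent, e.g. {0,4}, {1,5}
 *
 * and the even/odd rule the patch applies when the consecutive layout is
 * detected: starting from the even thread of an idle core, return the
 * odd sibling when the task last ran on an even CPU, so that picks
 * alternate between the two hyperthreads.
 */
#include <stdio.h>

#define SMT_NR_CPU 2	/* threads per core, as the patch would detect */

/* Sibling of @cpu under the consecutive (QEMU native) numbering. */
static int neighbour_sibling(int cpu)
{
	return (cpu % SMT_NR_CPU) ? cpu - 1 : cpu + 1;
}

/*
 * Hypothetical stand-in for the patched decision in select_idle_core():
 * @core is the even-numbered thread of an idle core, @prev_cpu is where
 * the task ran before.
 */
static int pick_idle_thread(int core, int prev_cpu)
{
	if (core % SMT_NR_CPU)		/* odd ids are returned as-is */
		return core;
	return (prev_cpu % SMT_NR_CPU) ? core : neighbour_sibling(core);
}

int main(void)
{
	int prev;

	for (prev = 0; prev < 4; prev++)
		printf("task last on CPU %d -> idle core 2 yields CPU %d\n",
		       prev, pick_idle_thread(2, prev));
	return 0;
}

In this modeled scenario the unpatched code would return CPU 2 every
time, which is the bias the changelog describes.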

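Also for reference, a small guest-side check (again not part of the
patch) that prints the sysfs attribute thread_siblings_list for the
first few CPUs; this is enough to see whether siblings are numbered
consecutively ("0-1", "2-3", ...) as in the QEMU native model, or
interleaved ("0,4", "1,5", ...) as on typical bare metal.

/*
 * Illustration only: check, inside the guest, how hyperthread siblings
 * are numbered by reading the per-CPU topology attribute in sysfs.
 */
#include <stdio.h>

int main(void)
{
	char path[128], buf[64];
	int cpu;
	FILE *f;

	/* The first 8 CPUs are enough to see the pattern. */
	for (cpu = 0; cpu < 8; cpu++) {
		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list",
			 cpu);
		f = fopen(path, "r");
		if (!f)
			break;		/* CPU not present */
		if (fgets(buf, sizeof(buf), f))
			printf("cpu%d: %s", cpu, buf);
		fclose(f);
	}
	return 0;
}
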
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a80a739..ad7c93f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -125,6 +125,9 @@
static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+static bool smt_neighbour_topo;
+static bool core_smt_topo_detect;
+static unsigned int smt_nr_cpu = 2;

int sched_thermal_decay_shift;
static int __init setup_sched_thermal_decay_shift(char *str)
@@ -140,6 +143,26 @@ static int __init setup_sched_thermal_decay_shift(char *str)
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);

#ifdef CONFIG_SMP
+static void explore_core_smp_topology(void)
+{
+ int cpu = smp_processor_id(), sibling;
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+
+ if (nr_cpu_ids <= 2)
+ return;
+
+ smt_nr_cpu = cpumask_weight(smt_mask);
+ if (smt_nr_cpu < 2)
+ return;
+
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+ if (cpu == sibling)
+ continue;
+ if (abs(cpu - sibling) == 1)
+ smt_neighbour_topo = true;
+ }
+}
+
/*
* For asym packing, by default the lower numbered CPU has higher priority.
*/
@@ -6887,9 +6910,16 @@ void __update_idle_core(struct rq *rq)
static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{
bool idle = true;
- int cpu;
+ int cpu, sibling = core;
+
+ if (!core_smt_topo_detect) {
+ explore_core_smp_topology();
+ core_smt_topo_detect = true;
+ }

for_each_cpu(cpu, cpu_smt_mask(core)) {
+ if (cpu != core)
+ sibling = cpu;
if (!available_idle_cpu(cpu)) {
idle = false;
if (*idle_cpu == -1) {
@@ -6905,8 +6935,12 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
*idle_cpu = cpu;
}

- if (idle)
+ if (idle) {
+ if (!smt_neighbour_topo || unlikely(core % smt_nr_cpu))
+ return core;
+ core = task_cpu(p) % smt_nr_cpu ? core : sibling;
return core;
+ }

cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
return -1;
--
1.8.3.1