[PATCH 2/4] sched: Allow a wakee to run on the prev_cpu if it is idle and cache-affine with the waker

From: Mel Gorman
Date: Mon Dec 18 2017 - 04:44:17 EST


With the commit "sched: Only migrate tasks due to interrupts if prev
and target CPUs share cache", we no longer migrate a task from interrupt
context if the waking CPU does not share a cache with the task's prev_cpu.
However, for a normal wakeup from a cache-affine process, we can miss the
fact that prev_cpu is idle and an appropriate sibling, leading to
unnecessary searches and migrations.
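
To make the targeted case concrete: a plain (non-sync) wakeup whose prev_cpu
is idle and shares a cache with the waker. The following is only an
illustrative userspace sketch of that test; the stubbed
idle_cpu()/cpus_share_cache() helpers, the 4-CPU LLC grouping, the CPU
numbers and the prev_cpu_is_good_target() name are assumptions made for the
example, not kernel code:

#include <stdbool.h>
#include <stdio.h>

/* Example stubs: CPUs 0-3 share an LLC and only CPU 0 is busy. */
static bool idle_cpu(int cpu)              { return cpu != 0; }
static bool cpus_share_cache(int a, int b) { return a / 4 == b / 4; }

/* The wakeup this patch targets: non-sync, idle and cache-affine prev_cpu. */
static bool prev_cpu_is_good_target(int waker_cpu, int prev_cpu, int sync)
{
	return !sync && waker_cpu != prev_cpu &&
	       cpus_share_cache(waker_cpu, prev_cpu) && idle_cpu(prev_cpu);
}

int main(void)
{
	/* A task on CPU 0 wakes a task that last ran on CPU 3. */
	printf("use prev_cpu: %s\n",
	       prev_cpu_is_good_target(0, 3, 0) ? "yes" : "no");
	return 0;
}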

This patch reworks wake_affine to return a suitable CPU to wake on, which
may be either the current CPU or the previous CPU. If wake_affine_idle
returns prev_cpu because it is idle, then select_idle_sibling will return
it immediately without searching.
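
The resulting decision order can be summarised with a small standalone
model. This is a simplified sketch rather than the kernel implementation:
the model_ prefix, the NR_CPUS_MODEL sentinel (standing in for
nr_cpumask_bits), the waker_nr_running parameter (standing in for
cpu_rq(this_cpu)->nr_running) and the stubbed topology helpers are all
assumptions for the example, and the load-based wake_affine_weight step is
reduced to a comment:

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS_MODEL 8		/* "no preference" sentinel */

/* Example stubs: CPUs 0-3 share an LLC and only CPU 0 is busy. */
static bool idle_cpu(int cpu)              { return cpu != 0; }
static bool cpus_share_cache(int a, int b) { return a / 4 == b / 4; }

static int model_wake_affine_idle(int this_cpu, int prev_cpu, int sync)
{
	/* Interrupt wakeup to an idle, cache-affine CPU: stay near the data. */
	if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
		return this_cpu;

	/* Normal wakeup and prev_cpu is idle: no search, no migration. */
	if (this_cpu != prev_cpu && !sync && idle_cpu(prev_cpu))
		return prev_cpu;

	return NR_CPUS_MODEL;
}

static int model_wake_affine_sync(int this_cpu, int sync, int waker_nr_running)
{
	/* Sync wakeup and the waker is about to sleep: use its CPU. */
	if (sync && waker_nr_running == 1)
		return this_cpu;

	return NR_CPUS_MODEL;
}

static int model_wake_affine(int this_cpu, int prev_cpu, int sync,
			     int waker_nr_running)
{
	int new_cpu = model_wake_affine_idle(this_cpu, prev_cpu, sync);

	if (new_cpu == NR_CPUS_MODEL)
		new_cpu = model_wake_affine_sync(this_cpu, sync,
						 waker_nr_running);
	/* wake_affine_weight()'s load comparison would run here. */

	/* No helper expressed a preference: fall back to prev_cpu. */
	return new_cpu == NR_CPUS_MODEL ? prev_cpu : new_cpu;
}

int main(void)
{
	/* Waker on CPU 0, wakee last ran on idle CPU 3 in the same LLC. */
	printf("wake on CPU %d\n", model_wake_affine(0, 3, 0, 2));
	return 0;
}

With the assumed topology this resolves to the idle prev_cpu (CPU 3); in
the real path, select_idle_sibling then sees an idle target and returns it
without scanning the LLC, which is the short-circuit referred to above.
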
Results on dbench using ext4 are slightly mixed, with gains when the
machine is lightly loaded and a small regression, borderline on the noise,
when more than a node's worth of CPUs is in use.

                            4.15.0-rc3             4.15.0-rc3
                                 noirq               wakeprev
Hmean     1         865.01 (   0.00%)      834.19 (  -3.56%)
Hmean     2        1274.44 (   0.00%)     1353.09 (   6.17%)
Hmean     4        1628.08 (   0.00%)     1714.82 (   5.33%)
Hmean     8        1831.80 (   0.00%)     1855.84 (   1.31%)
Hmean     16       2091.44 (   0.00%)     1975.40 (  -5.55%)
Hmean     32       2430.29 (   0.00%)     2298.58 (  -5.42%)
Hmean     64       2568.54 (   0.00%)     2536.56 (  -1.25%)
Hmean     128      2499.28 (   0.00%)     2543.81 (   1.78%)
Stddev    1           5.35 (   0.00%)       19.39 (-262.63%)
Stddev    2          11.09 (   0.00%)        4.88 (  55.97%)
Stddev    4           6.80 (   0.00%)        9.24 ( -35.93%)
Stddev    8           9.41 (   0.00%)       28.39 (-201.82%)
Stddev    16         20.01 (   0.00%)       44.92 (-124.56%)
Stddev    32         44.74 (   0.00%)       50.14 ( -12.07%)
Stddev    64         93.18 (   0.00%)       84.97 (   8.81%)
Stddev    128       177.85 (   0.00%)      178.00 (  -0.09%)

However, system CPU usage is noticeably reduced:

                4.15.0-rc3   4.15.0-rc3
                     noirq     wakeprev
User               1058.32      1077.42
System             5729.22      5287.61
Elapsed            1550.69      1553.09

Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
---
kernel/sched/fair.c | 70 ++++++++++++++++++++++++++++++++++++++---------------
1 file changed, 51 insertions(+), 19 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4a1f7d32ecf6..392e08b364bd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5689,17 +5689,21 @@ static int wake_wide(struct task_struct *p)
* soonest. For the purpose of speed we only consider the waking and previous
* CPU.
*
- * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
- * will be) idle.
+ * wake_affine_idle() - only considers 'now', it checks if a CPU that is
+ * cache-affine with the waker is idle
+ *
+ * wake_affine_sync() - only considers 'now', it checks if the waking CPU
+ * will be idle. Migrations to a different NUMA node
+ * are allowed on the basis that sync wakeups imply
+ * shared data between waker and wakee.
*
* wake_affine_weight() - considers the weight to reflect the average
* scheduling latency of the CPUs. This seems to work
* for the overloaded case.
*/

-static bool
-wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int prev_cpu, int sync)
+static int
+wake_affine_idle(int this_cpu, int prev_cpu, int sync)
{
/*
* If this_cpu is idle, it implies the wakeup is from interrupt
@@ -5710,13 +5714,36 @@ wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
- return true;
+ return this_cpu;

+ /*
+ * Prefer migration if the wakeup is from interrupt context, on the
+ * assumption that the data is cache hot on the CPU receiving the interrupt.
+ */
+ if (idle_cpu(this_cpu))
+ return this_cpu;
+
+ /*
+ * For normal wakeups, we use the prev_cpu if it's cache affine but
+ * for remote wakeups, rely on wake_affine_weight to determine whether
+ * it's best to pull the wakee towards the waker. For sync wakeups,
+ * rely on wake_affine_sync to determine if the task should wake up
+ * on the current CPU.
+ */
+ if (this_cpu != prev_cpu && !sync && idle_cpu(prev_cpu))
+ return prev_cpu;
+
+ return nr_cpumask_bits;
+}
+
+static int
+wake_affine_sync(int this_cpu, int sync)
+{
if (sync && cpu_rq(this_cpu)->nr_running == 1)
- return true;
+ return this_cpu;

- return false;
+ return nr_cpumask_bits;
}

-static bool
+static int
wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int prev_cpu, int sync)
{
@@ -5730,7 +5757,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
unsigned long current_load = task_h_load(current);

if (current_load > this_eff_load)
- return true;
+ return this_cpu;

this_eff_load -= current_load;
}
@@ -5747,28 +5774,34 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
prev_eff_load *= capacity_of(this_cpu);

- return this_eff_load <= prev_eff_load;
+ if (this_eff_load <= prev_eff_load)
+ return this_cpu;
+ return nr_cpumask_bits;
}

static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
int this_cpu = smp_processor_id();
- bool affine = false;
+ int new_cpu = nr_cpumask_bits;
+
+ if (sched_feat(WA_IDLE))
+ new_cpu = wake_affine_idle(this_cpu, prev_cpu, sync);

- if (sched_feat(WA_IDLE) && !affine)
- affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
+ if (sched_feat(WA_IDLE) && new_cpu == nr_cpumask_bits)
+ new_cpu = wake_affine_sync(this_cpu, sync);

- if (sched_feat(WA_WEIGHT) && !affine)
- affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
+ if (sched_feat(WA_WEIGHT) && new_cpu == nr_cpumask_bits)
+ new_cpu = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);

schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
- if (affine) {
+ if (new_cpu != nr_cpumask_bits) {
schedstat_inc(sd->ttwu_move_affine);
schedstat_inc(p->se.statistics.nr_wakeups_affine);
+ return new_cpu;
}

- return affine;
+ return prev_cpu;
}

static inline int task_util(struct task_struct *p);
@@ -6361,8 +6394,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (cpu == prev_cpu)
goto pick_cpu;

- if (wake_affine(affine_sd, p, prev_cpu, sync))
- new_cpu = cpu;
+ new_cpu = wake_affine(affine_sd, p, prev_cpu, sync);
}

if (sd && !(sd_flag & SD_BALANCE_FORK)) {
--
2.15.0