[RFC PATCH v9 1/2] sched: Fix: Handle target_cpu offlining in active_load_balance_cpu_stop

From: Mathieu Desnoyers
Date: Wed Apr 19 2023 - 11:51:31 EST


Handle the scenario where the target CPU goes offline concurrently with
the execution of active_load_balance_cpu_stop(). In that case,
__sched_core_flip() can flip rq->core_enabled without the rq lock held,
which can trigger lockdep_assert_rq_held() warnings.

This scenario may also have other unwanted effects, such as migrating
tasks to offline CPUs, which may prevent those tasks from running for a
long time, until the CPU is brought back online.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Aaron Lu <aaron.lu@xxxxxxxxx>
---
kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++++++--
1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5f6587d94c1d..1c837ba41704 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8744,6 +8744,27 @@ static void attach_one_task(struct rq *rq, struct task_struct *p)
rq_unlock(rq, &rf);
}

+/*
+ * try_attach_one_task() -- with @rq locked, attach @p (from detach_one_task())
+ * if @rq is online. Returns false, leaving @p unattached, if @rq is not online.
+ */
+static bool try_attach_one_task(struct rq *rq, struct task_struct *p)
+{
+ struct rq_flags rf;
+ bool result = true;
+
+ rq_lock(rq, &rf);
+ if (!rq->online) {
+ result = false;
+ goto unlock;
+ }
+ update_rq_clock(rq);
+ attach_task(rq, p);
+unlock:
+ rq_unlock(rq, &rf);
+ return result;
+}
+
/*
* attach_tasks() -- attaches all tasks detached by detach_tasks() to their
* new rq.
@@ -11048,8 +11069,17 @@ static int active_load_balance_cpu_stop(void *data)
busiest_rq->active_balance = 0;
rq_unlock(busiest_rq, &rf);

- if (p)
- attach_one_task(target_rq, p);
+ if (p) {
+ if (!try_attach_one_task(target_rq, p)) {
+ /*
+ * target_rq was offlined concurrently. There is no
+ * guarantee that the busiest cpu is still online at
+ * this point. Fall back on using the CPU on which the
+ * stopper thread is running as target.
+ */
+ attach_one_task(this_rq(), p);
+ }
+ }

local_irq_enable();

--
2.25.1