[RFC PATCH 2/3] sched/fair: Improve integration of SHARED_RUNQ feature within newidle_balance

From: K Prateek Nayak
Date: Thu Aug 31 2023 - 06:47:18 EST


This patch takes the relevant optimizations for newidle_balance() from
[1]. Following is the breakdown:

- Check "rq->rd->overload" before jumping into newidle_balance, even
with SHARED_RQ feat enabled.

- Call update_next_balance() for all the domains up to the MC domain
when the SHARED_RUNQ path is taken (see the toy sketch below).

- Account the cost of shared_runq_pick_next_task() and update
curr_cost and sd->max_newidle_lb_cost accordingly (a toy sketch of this
accounting follows the diffstat).

- Move the initial rq_unpin_lock() logic around. Also, the caller of
shared_runq_pick_next_task() is now responsible for calling
rq_repin_lock() if the return value is non-zero. (It still needs to be
verified that everything is right with LOCKDEP.)

- Include a fix to start directly above the LLC domain when calling
load_balance() in newidle_balance(), skipping the domains at or below
it (see the toy sketch below).

All other surgery from [1] has been removed.
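
To make the two domain walks easier to discuss, here is a small
userspace toy that models them -- this is not kernel code, and all the
toy_* names and TOY_SD_SHARE_LLC are made up for illustration (the real
code uses struct sched_domain, SD_SHARE_PKG_RESOURCES and
for_each_domain()). It only shows the shape of the walks: update the
next_balance bookkeeping for every domain within the LLC on the
SHARED_RUNQ path, and start load_balance() directly above the LLC.

#include <stdio.h>
#include <stdbool.h>

#define TOY_SD_SHARE_LLC	0x1	/* stands in for SD_SHARE_PKG_RESOURCES */

struct toy_domain {
	const char *name;
	int flags;
	struct toy_domain *parent;
	struct toy_domain *child;
};

/* Stand-in for update_next_balance(): just report the domain touched. */
static void toy_update_next_balance(struct toy_domain *sd)
{
	printf("update_next_balance(%s)\n", sd->name);
}

int main(void)
{
	struct toy_domain smt = { "SMT", TOY_SD_SHARE_LLC, NULL, NULL };
	struct toy_domain mc  = { "MC",  TOY_SD_SHARE_LLC, NULL, &smt };
	struct toy_domain pkg = { "PKG", 0,                NULL, &mc  };
	bool shared_runq = true;

	smt.parent = &mc;
	mc.parent = &pkg;

	/*
	 * SHARED_RUNQ path: start at the LLC (MC here) and walk the child
	 * pointers so every domain within the LLC gets its next_balance
	 * bookkeeping updated before picking from the shared runqueue.
	 */
	for (struct toy_domain *sd = &mc; sd; sd = sd->child)
		toy_update_next_balance(sd);

	/*
	 * load_balance() loop: with SHARED_RUNQ enabled, domains at or
	 * below the LLC are skipped, so balancing starts directly above it.
	 */
	for (struct toy_domain *sd = &smt; sd; sd = sd->parent) {
		if (shared_runq && (sd->flags & TOY_SD_SHARE_LLC))
			continue;
		printf("load_balance(%s)\n", sd->name);
	}

	return 0;
}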

Link: https://lore.kernel.org/all/31aeb639-1d66-2d12-1673-c19fed0ab33a@xxxxxxx/ [1]
Signed-off-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
---
kernel/sched/fair.c | 94 ++++++++++++++++++++++++++++++++-------------
1 file changed, 67 insertions(+), 27 deletions(-)
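
As a rough illustration of the cost accounting mentioned above, here is
another small userspace toy -- again not kernel code: toy_clock_ns() and
the toy_* structs merely stand in for sched_clock_cpu(),
sd->max_newidle_lb_cost and rq->max_idle_balance_cost, and the "pick"
helper is a placeholder.

#include <stdio.h>
#include <time.h>

struct toy_domain {
	unsigned long max_newidle_lb_cost;
};

struct toy_rq {
	unsigned long max_idle_balance_cost;
};

static unsigned long toy_clock_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long)ts.tv_sec * 1000000000UL + ts.tv_nsec;
}

/* Mirrors update_newidle_cost(): remember the most expensive attempt. */
static void toy_update_newidle_cost(struct toy_domain *sd, unsigned long cost)
{
	if (cost > sd->max_newidle_lb_cost)
		sd->max_newidle_lb_cost = cost;
}

/* Pretend to pull a task from the shared runqueue; always fails here. */
static int toy_pick_from_shared_runq(void)
{
	return 0;
}

int main(void)
{
	struct toy_domain llc = { 0 };
	struct toy_rq rq = { 0 };
	unsigned long t0, t1, curr_cost = 0;

	/* Charge the time spent picking from the shared runqueue ... */
	t0 = toy_clock_ns();
	toy_pick_from_shared_runq();
	t1 = toy_clock_ns();

	/* ... to curr_cost and to the LLC domain's max_newidle_lb_cost. */
	curr_cost += t1 - t0;
	toy_update_newidle_cost(&llc, curr_cost);

	/* Finally fold it into the runqueue-wide maximum, as out_swq does. */
	if (curr_cost > rq.max_idle_balance_cost)
		rq.max_idle_balance_cost = curr_cost;

	printf("curr_cost=%lu max_newidle_lb_cost=%lu max_idle_balance_cost=%lu\n",
	       curr_cost, llc.max_newidle_lb_cost, rq.max_idle_balance_cost);
	return 0;
}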

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bf844ffa79c2..446ffdad49e1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -337,7 +337,6 @@ static int shared_runq_pick_next_task(struct rq *rq, struct rq_flags *rf)
rq_unpin_lock(rq, &src_rf);
raw_spin_unlock_irqrestore(&p->pi_lock, src_rf.flags);
}
- rq_repin_lock(rq, rf);

return ret;
}
@@ -12276,50 +12275,83 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
if (!cpu_active(this_cpu))
return 0;

- if (sched_feat(SHARED_RUNQ)) {
- pulled_task = shared_runq_pick_next_task(this_rq, rf);
- if (pulled_task)
- return pulled_task;
- }
-
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
*/
this_rq->idle_stamp = rq_clock(this_rq);

- /*
- * This is OK, because current is on_cpu, which avoids it being picked
- * for load-balance and preemption/IRQs are still disabled avoiding
- * further scheduler activity on it and we're being very careful to
- * re-start the picking loop.
- */
- rq_unpin_lock(this_rq, rf);
-
rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq->sd);
-
- /*
- * Skip <= LLC domains as they likely won't have any tasks if the
- * shared runq is empty.
- */
- if (sched_feat(SHARED_RUNQ)) {
+ if (sched_feat(SHARED_RUNQ))
sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
- if (likely(sd))
- sd = sd->parent;
- }
+ else
+ sd = rcu_dereference_check_sched_domain(this_rq->sd);

if (!READ_ONCE(this_rq->rd->overload) ||
- (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
+ /* Look at rq->avg_idle only if SHARED_RUNQ is disabled */
+ (!sched_feat(SHARED_RUNQ) && sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {

- if (sd)
+ while (sd) {
update_next_balance(sd, &next_balance);
+ sd = sd->child;
+ }
+
rcu_read_unlock();

goto out;
}
+
+ if (sched_feat(SHARED_RUNQ)) {
+ struct sched_domain *tmp = sd;
+
+ t0 = sched_clock_cpu(this_cpu);
+
+ /* Do update_next_balance() for all domains within LLC */
+ while (tmp) {
+ update_next_balance(tmp, &next_balance);
+ tmp = tmp->child;
+ }
+
+ pulled_task = shared_runq_pick_next_task(this_rq, rf);
+ if (pulled_task) {
+ if (sd) {
+ curr_cost = sched_clock_cpu(this_cpu) - t0;
+ /*
+ * Will help bail out of scans of higher domains
+ * slightly earlier.
+ */
+ update_newidle_cost(sd, curr_cost);
+ }
+
+ rcu_read_unlock();
+ goto out_swq;
+ }
+
+ if (sd) {
+ t1 = sched_clock_cpu(this_cpu);
+ curr_cost += t1 - t0;
+ update_newidle_cost(sd, curr_cost);
+ }
+
+ /*
+ * Since shared_runq_pick_next_task() can take a while,
+ * check if the CPU was targeted for a wakeup in the
+ * meantime.
+ */
+ if (this_rq->ttwu_pending) {
+ rcu_read_unlock();
+ return 0;
+ }
+ }
rcu_read_unlock();

+ /*
+ * This is OK, because current is on_cpu, which avoids it being picked
+ * for load-balance and preemption/IRQs are still disabled avoiding
+ * further scheduler activity on it and we're being very careful to
+ * re-start the picking loop.
+ */
+ rq_unpin_lock(this_rq, rf);
raw_spin_rq_unlock(this_rq);

t0 = sched_clock_cpu(this_cpu);
@@ -12335,6 +12367,13 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
break;

+ /*
+ * Skip <= LLC domains as they likely won't have any tasks if the
+ * shared runq is empty.
+ */
+ if (sched_feat(SHARED_RUNQ) && (sd->flags & SD_SHARE_PKG_RESOURCES))
+ continue;
+
if (sd->flags & SD_BALANCE_NEWIDLE) {

pulled_task = load_balance(this_cpu, this_rq,
@@ -12361,6 +12400,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)

raw_spin_rq_lock(this_rq);

+out_swq:
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;

--
2.34.1