[PATCH 1/3] sched/nohz: Update nohz.next_balance directly without IPIs (v2)

From: Joel Fernandes (Google)
Date: Thu Oct 19 2023 - 21:40:57 EST


Whenever a CPU stops its tick, it now requires another idle CPU to handle the
balancing for it because it can't perform its own periodic load balancing.
This means it might need to update 'nohz.next_balance' to 'rq->next_balance' if
the upcoming nohz-idle load balancing is too distant in the future. This update
process is done by triggering an ILB, as the general ILB handler
(_nohz_idle_balance) that manages regular nohz balancing also refreshes
'nohz.next_balance' by looking at the 'rq->next_balance' of all other idle CPUs
and selecting the smallest value.

Triggering this ILB is achieved in the current mainline by setting the
NOHZ_NEXT_KICK flag. This primarily results in the ILB handler updating
'nohz.next_balance' while possibly not doing any load balancing at all.
However, sending an IPI merely to refresh 'nohz.next_balance' seems
excessive. This patch therefore directly sets nohz.next_balance from the
CPU stopping the tick.

Testing shows a considerable reduction in IPIs when doing this:

Running "cyclictest -i 100 -d 100 --latency=1000 -t -m" on a 4vcpu VM
the IPI call count profiled over 10s period is as follows:
without fix: ~10500
with fix: ~1000

Also just to note, without this patch we observe the following pattern:

1. A CPU is about to stop its tick.
2. It sets nohz.needs_update to 1.
3. It then stops its tick and goes idle.
4. The scheduler tick on another CPU checks this flag and decides an ILB kick
is needed.
5. The ILB CPU ends up being the one that just stopped its tick!
6. This results in an IPI to the tick-stopped CPU which ends up waking it up
and disturbing it!

Finally, this patch also avoids having to go through all the idle CPUs
just to update nohz.next_balance when the tick is stopped.

The previous version of this patch had some issues, which are addressed now:
https://lore.kernel.org/all/20231005161727.1855004-1-joel@xxxxxxxxxxxxxxxxx/

Cc: Suleiman Souhlal <suleiman@xxxxxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Frederic Weisbecker <frederic@xxxxxxxxxx>
Cc: Paul E. McKenney <paulmck@xxxxxxxxxx>
Fixes: 7fd7a9e0caba ("sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle")
Co-developed-by: Vineeth Pillai (Google) <vineeth@xxxxxxxxxxxxxxx>
Signed-off-by: Vineeth Pillai (Google) <vineeth@xxxxxxxxxxxxxxx>
Signed-off-by: Joel Fernandes (Google) <joel@xxxxxxxxxxxxxxxxx>
---
kernel/sched/fair.c | 44 +++++++++++++++++++++++++++++---------------
kernel/sched/sched.h | 5 +----
2 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cb225921bbca..965c30fbbe5c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6627,7 +6627,6 @@ static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
int has_blocked; /* Idle CPUS has blocked load */
- int needs_update; /* Newly idle CPUs need their next_balance collated */
unsigned long next_balance; /* in jiffy units */
unsigned long next_blocked; /* Next update of blocked load in jiffies */
} nohz ____cacheline_aligned;
@@ -11687,9 +11686,6 @@ static void nohz_balancer_kick(struct rq *rq)
unlock:
rcu_read_unlock();
out:
- if (READ_ONCE(nohz.needs_update))
- flags |= NOHZ_NEXT_KICK;
-
if (flags)
kick_ilb(flags);
}
@@ -11740,6 +11736,20 @@ static void set_cpu_sd_state_idle(int cpu)
rcu_read_unlock();
}

+static inline void
+update_nohz_next_balance(unsigned long next_balance)
+{
+ unsigned long nohz_next_balance;
+
+ /* In event of a race, only update with the earliest next_balance. */
+ do {
+ nohz_next_balance = READ_ONCE(nohz.next_balance);
+ if (!time_after(nohz_next_balance, next_balance))
+ break;
+ } while (!try_cmpxchg(&nohz.next_balance, &nohz_next_balance,
+ next_balance));
+}
+
/*
* This routine will record that the CPU is going idle with tick stopped.
* This info will be used in performing idle load balancing in the future.
@@ -11786,13 +11796,13 @@ void nohz_balance_enter_idle(int cpu)
/*
* Ensures that if nohz_idle_balance() fails to observe our
* @idle_cpus_mask store, it must observe the @has_blocked
- * and @needs_update stores.
+ * store.
*/
smp_mb__after_atomic();

set_cpu_sd_state_idle(cpu);

- WRITE_ONCE(nohz.needs_update, 1);
+ update_nohz_next_balance(rq->next_balance);
out:
/*
* Each time a cpu enter idle, we assume that it has blocked load and
@@ -11829,6 +11839,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
/* Earliest time when we have to do rebalance again */
unsigned long now = jiffies;
unsigned long next_balance = now + 60*HZ;
+ unsigned long old_nohz_next_balance;
bool has_blocked_load = false;
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
@@ -11837,6 +11848,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)

SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);

+ old_nohz_next_balance = READ_ONCE(nohz.next_balance);
+
/*
* We assume there will be no idle load after this update and clear
* the has_blocked flag. If a cpu enters idle in the mean time, it will
@@ -11844,13 +11857,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
* Because a cpu that becomes idle, is added to idle_cpus_mask before
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
- *
- * Same applies to idle_cpus_mask vs needs_update.
*/
if (flags & NOHZ_STATS_KICK)
WRITE_ONCE(nohz.has_blocked, 0);
- if (flags & NOHZ_NEXT_KICK)
- WRITE_ONCE(nohz.needs_update, 0);

/*
* Ensures that if we miss the CPU, we must see the has_blocked
@@ -11874,8 +11883,6 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
if (need_resched()) {
if (flags & NOHZ_STATS_KICK)
has_blocked_load = true;
- if (flags & NOHZ_NEXT_KICK)
- WRITE_ONCE(nohz.needs_update, 1);
goto abort;
}

@@ -11906,12 +11913,19 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
}

/*
- * next_balance will be updated only when there is a need.
+ * nohz.next_balance will be updated only when there is a need.
* When the CPU is attached to null domain for ex, it will not be
* updated.
+ *
+ * Also, if it changed since we scanned the nohz CPUs above, do nothing as:
+ * 1. A concurrent call to _nohz_idle_balance() moved nohz.next_balance forward.
+ * 2. nohz_balance_enter_idle moved it backward.
*/
- if (likely(update_next_balance))
- nohz.next_balance = next_balance;
+ if (likely(update_next_balance)) {
+ /* Pairs with the smp_mb() above. */
+ cmpxchg_release(&nohz.next_balance, old_nohz_next_balance,
+ next_balance);
+ }

if (flags & NOHZ_STATS_KICK)
WRITE_ONCE(nohz.next_blocked,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 04846272409c..cf3597d91977 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2874,7 +2874,6 @@ extern void cfs_bandwidth_usage_dec(void);
#define NOHZ_BALANCE_KICK_BIT 0
#define NOHZ_STATS_KICK_BIT 1
#define NOHZ_NEWILB_KICK_BIT 2
-#define NOHZ_NEXT_KICK_BIT 3

/* Run rebalance_domains() */
#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
@@ -2882,10 +2881,8 @@ extern void cfs_bandwidth_usage_dec(void);
#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
/* Update blocked load when entering idle */
#define NOHZ_NEWILB_KICK BIT(NOHZ_NEWILB_KICK_BIT)
-/* Update nohz.next_balance */
-#define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT)

-#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)
+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)

#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)

--
2.42.0.655.g421f12c284-goog