[RFC][PATCH 2/3] cpuidle,teo: Improve NOHZ management

From: Peter Zijlstra
Date: Fri Jul 28 2023 - 11:01:17 EST


With cpuidle having added a TICK bucket, TEO will account all TICK and
longer idles there. This means we can now make an informed decision
about stopping the tick. If the sum of 'hit+intercepts' of all states
below the TICK bucket is more than 50%, it is most likely we'll not
reach the tick this time around either, so stopping the tick doesn't
make sense.

If we don't stop the tick, don't bother calling
tick_nohz_get_sleep_length() and assume duration is no longer than a
tick (could be improved to still look at the current pending time and
timers).

Since we have this extra state, remove the state_count based early
decisions.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
drivers/cpuidle/governors/teo.c | 97 ++++++++++++++--------------------------
1 file changed, 34 insertions(+), 63 deletions(-)

--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -139,6 +139,7 @@
#include <linux/sched/clock.h>
#include <linux/sched/topology.h>
#include <linux/tick.h>
+#include "../cpuidle.h"

/*
* The number of bits to shift the CPU's capacity by in order to determine
@@ -197,7 +198,6 @@ struct teo_cpu {
int next_recent_idx;
int recent_idx[NR_RECENT];
unsigned long util_threshold;
- bool utilized;
};

static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
@@ -276,11 +276,11 @@ static void teo_update(struct cpuidle_dr

cpu_data->total += bin->hits + bin->intercepts;

- if (target_residency_ns <= cpu_data->sleep_length_ns) {
+ if (target_residency_ns <= cpu_data->sleep_length_ns)
idx_timer = i;
- if (target_residency_ns <= measured_ns)
- idx_duration = i;
- }
+
+ if (target_residency_ns <= measured_ns)
+ idx_duration = i;
}

i = cpu_data->next_recent_idx++;
@@ -362,11 +362,12 @@ static int teo_select(struct cpuidle_dri
unsigned int recent_sum = 0;
unsigned int idx_hit_sum = 0;
unsigned int hit_sum = 0;
+ unsigned int tick_sum = 0;
int constraint_idx = 0;
int idx0 = 0, idx = -1;
bool alt_intercepts, alt_recent;
ktime_t delta_tick;
- s64 duration_ns;
+ s64 duration_ns = TICK_NSEC;
int i;

if (dev->last_state_idx >= 0) {
@@ -376,36 +377,26 @@ static int teo_select(struct cpuidle_dri

cpu_data->time_span_ns = local_clock();

- duration_ns = tick_nohz_get_sleep_length(&delta_tick);
- cpu_data->sleep_length_ns = duration_ns;
+ /* Should we stop the tick? */
+ for (i = 1; i < drv->state_count; i++) {
+ struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
+ struct cpuidle_state *s = &drv->states[i];

- /* Check if there is any choice in the first place. */
- if (drv->state_count < 2) {
- idx = 0;
- goto end;
- }
- if (!dev->states_usage[0].disable) {
- idx = 0;
- if (drv->states[1].target_residency_ns > duration_ns)
- goto end;
- }
+ tick_sum += prev_bin->intercepts;
+ tick_sum += prev_bin->hits;

- cpu_data->utilized = teo_cpu_is_utilized(dev->cpu, cpu_data);
- /*
- * If the CPU is being utilized over the threshold and there are only 2
- * states to choose from, the metrics need not be considered, so choose
- * the shallowest non-polling state and exit.
- */
- if (drv->state_count < 3 && cpu_data->utilized) {
- for (i = 0; i < drv->state_count; ++i) {
- if (!dev->states_usage[i].disable &&
- !(drv->states[i].flags & CPUIDLE_FLAG_POLLING)) {
- idx = i;
- goto end;
- }
- }
+ if (s->target_residency_ns >= SHORT_TICK_NSEC)
+ break;
}

+ if (2*tick_sum > cpu_data->total)
+ *stop_tick = false;
+
+ /* If we do stop the tick, ask for the next timer. */
+ if (*stop_tick)
+ duration_ns = tick_nohz_get_sleep_length(&delta_tick);
+ cpu_data->sleep_length_ns = duration_ns;
+
/*
* Find the deepest idle state whose target residency does not exceed
* the current sleep length and the deepest idle state not deeper than
@@ -446,13 +437,13 @@ static int teo_select(struct cpuidle_dri
idx_recent_sum = recent_sum;
}

- /* Avoid unnecessary overhead. */
- if (idx < 0) {
- idx = 0; /* No states enabled, must use 0. */
- goto end;
- } else if (idx == idx0) {
- goto end;
- }
+ /* No states enabled, must use 0 */
+ if (idx < 0)
+ return 0;
+
+ /* No point looking for something shallower than the first enabled state */
+ if (idx == idx0)
+ return idx;

/*
* If the sum of the intercepts metric for all of the idle states
@@ -541,29 +532,9 @@ static int teo_select(struct cpuidle_dri
* If the CPU is being utilized over the threshold, choose a shallower
* non-polling state to improve latency
*/
- if (cpu_data->utilized)
+ if (teo_cpu_is_utilized(dev->cpu, cpu_data))
idx = teo_find_shallower_state(drv, dev, idx, duration_ns, true);

-end:
- /*
- * Don't stop the tick if the selected state is a polling one or if the
- * expected idle duration is shorter than the tick period length.
- */
- if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
- duration_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) {
- *stop_tick = false;
-
- /*
- * The tick is not going to be stopped, so if the target
- * residency of the state to be returned is not within the time
- * till the closest timer including the tick, try to correct
- * that.
- */
- if (idx > idx0 &&
- drv->states[idx].target_residency_ns > delta_tick)
- idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false);
- }
-
return idx;
}