Re: Stopping the tick on a fully loaded system

From: Peter Zijlstra
Date: Wed Jul 26 2023 - 15:55:25 EST


On Wed, Jul 26, 2023 at 06:14:32PM +0200, Peter Zijlstra wrote:
> On Wed, Jul 26, 2023 at 05:53:46PM +0200, Rafael J. Wysocki wrote:
>
> > > > That means we don't track nearly enough data to reliably tell anything
> > > > about disabling the tick or not. We should have at least one bucket
> > > > beyond TICK_NSEC for this.
> > >
> > > Quite likely.
> >
> > So the reasoning here was that those additional bins would not be
> > necessary for idle state selection, but the problem of whether or not
> > to stop the tick is kind of separate from the idle state selection
> > problem if the target residency values for all of the idle states are
> > relatively short. And so it should be addressed separately which
> > currently it is not. Admittedly, this is a mistake.
>
> Right, the C state buckets are enough to pick a state, but not to handle
> the tick thing.
>
> The below hack boots on my ivb-ep with extra (disabled) states. Now let
> me go hack up teo to make use of that.
>
> name residency
>
> POLL 0
> C1 1
> C1E 80
> C3 156
> C6 300
> TICK 1000
> POST-TICK 2000
>

builds and boots, further untested -- I need to see to dinner.

The idea is that teo_update() should account to the highest state if
measured_ns is 'large'.

Then, when the tick is on, see if the majority (50%) of the
hit+intercepts are below the TICK threshold, if so, don't stop the tick
and assume duration_ns = TICK_NSEC -- iow. don't call
tick_nohz_get_sleep_length().

I'll check if the below code actually does as intended once the loonies
are in bed.


---

diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
index 987fc5f3997d..362470c8c273 100644
--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -197,7 +197,6 @@ struct teo_cpu {
int next_recent_idx;
int recent_idx[NR_RECENT];
unsigned long util_threshold;
- bool utilized;
};

static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
@@ -227,7 +226,7 @@ static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data)
static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
- int i, idx_timer = 0, idx_duration = 0;
+ int i, idx_timer = 0, idx_duration = drv->state_count-1;
u64 measured_ns;

if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
@@ -362,11 +361,12 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
unsigned int recent_sum = 0;
unsigned int idx_hit_sum = 0;
unsigned int hit_sum = 0;
+ unsigned int tick_sum = 0;
int constraint_idx = 0;
int idx0 = 0, idx = -1;
bool alt_intercepts, alt_recent;
ktime_t delta_tick;
- s64 duration_ns;
+ s64 duration_ns = TICK_NSEC;
int i;

if (dev->last_state_idx >= 0) {
@@ -376,36 +376,26 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,

cpu_data->time_span_ns = local_clock();

- duration_ns = tick_nohz_get_sleep_length(&delta_tick);
- cpu_data->sleep_length_ns = duration_ns;
+ if (!tick_nohz_tick_stopped()) {
+ for (i = 1; i < drv->state_count; i++) {
+ struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
+ struct cpuidle_state *s = &drv->states[i];

- /* Check if there is any choice in the first place. */
- if (drv->state_count < 2) {
- idx = 0;
- goto end;
- }
- if (!dev->states_usage[0].disable) {
- idx = 0;
- if (drv->states[1].target_residency_ns > duration_ns)
- goto end;
- }
+ tick_sum += prev_bin->intercepts;
+ tick_sum += prev_bin->hits;

- cpu_data->utilized = teo_cpu_is_utilized(dev->cpu, cpu_data);
- /*
- * If the CPU is being utilized over the threshold and there are only 2
- * states to choose from, the metrics need not be considered, so choose
- * the shallowest non-polling state and exit.
- */
- if (drv->state_count < 3 && cpu_data->utilized) {
- for (i = 0; i < drv->state_count; ++i) {
- if (!dev->states_usage[i].disable &&
- !(drv->states[i].flags & CPUIDLE_FLAG_POLLING)) {
- idx = i;
- goto end;
- }
+ if (s->target_residency_ns >= TICK_NSEC)
+ break;
}
+
+ if (2*tick_sum > cpu_data->total)
+ *stop_tick = false;
}

+ if (*stop_tick)
+ duration_ns = tick_nohz_get_sleep_length(&delta_tick);
+ cpu_data->sleep_length_ns = duration_ns;
+
/*
* Find the deepest idle state whose target residency does not exceed
* the current sleep length and the deepest idle state not deeper than
@@ -541,7 +531,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
* If the CPU is being utilized over the threshold, choose a shallower
* non-polling state to improve latency
*/
- if (cpu_data->utilized)
+ if (teo_cpu_is_utilized(dev->cpu, cpu_data))
idx = teo_find_shallower_state(drv, dev, idx, duration_ns, true);

end:
@@ -549,8 +539,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
* Don't stop the tick if the selected state is a polling one or if the
* expected idle duration is shorter than the tick period length.
*/
- if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
- duration_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) {
+ if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || !*stop_tick) {
*stop_tick = false;

/*