Re: [RFT] [PATCH v1] cpuidle: menu: Skip tick_nohz_get_sleep_length() call in some cases

From: Rafael J. Wysocki
Date: Wed Aug 09 2023 - 14:16:19 EST


On Wed, Aug 9, 2023 at 4:53 PM Rafael J. Wysocki <rjw@xxxxxxxxxxxxx> wrote:
>
> From: Rafael J. Wysocki <rafael.j.wysocki@xxxxxxxxx>
>
> Because the cost of calling tick_nohz_get_sleep_length() may increase
> in the future, reorder the code in menu_select() so it first uses the
> statistics to determine the expected idle duration. If that value is
> higher than RESIDENCY_THRESHOLD_NS, tick_nohz_get_sleep_length() will
> be called to obtain the time till the closest timer and refine the
> idle duration prediction if necessary.
>
> This causes the governor to always take the full overhead of
> get_typical_interval() with the assumption that the cost will be
> amortized by skipping the tick_nohz_get_sleep_length() call in the
> cases when the predicted idle duration is relatively very small.
>
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@xxxxxxxxx>
> ---
>
> This is a counterpart of the following two teo commits:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git/commit/?h=pm-cpuidle-teo&id=06b80d45d3f94b1bbd3ee02bf3e007f55ed3e120
> https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git/commit/?h=pm-cpuidle-teo&id=57c51179c203de17cf17e357d0e56c04f5e2494a
>
> and it is based on top of
>
> https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git/commit/?h=pm-cpuidle-teo&id=9ebff7f2e77dc1cf7a143f6eee2852b6380096e5
>
> Later today I will expose a git branch with all of the recent cpuidle governor
> changes related to the tick_nohz_get_sleep_length() avoidance.
>
> ---
> drivers/cpuidle/governors/gov.h | 14 ++++++++
> drivers/cpuidle/governors/menu.c | 66 +++++++++++++++++++++------------------
> drivers/cpuidle/governors/teo.c | 2 +
> 3 files changed, 53 insertions(+), 29 deletions(-)
>
> Index: linux-pm/drivers/cpuidle/governors/menu.c
> ===================================================================
> --- linux-pm.orig/drivers/cpuidle/governors/menu.c
> +++ linux-pm/drivers/cpuidle/governors/menu.c
> @@ -19,6 +19,8 @@
> #include <linux/sched/stat.h>
> #include <linux/math64.h>
>
> +#include "gov.h"
> +
> #define BUCKETS 12
> #define INTERVAL_SHIFT 3
> #define INTERVALS (1UL << INTERVAL_SHIFT)
> @@ -166,8 +168,7 @@ static void menu_update(struct cpuidle_d
> * of points is below a threshold. If it is... then use the
> * average of these 8 points as the estimated value.
> */
> -static unsigned int get_typical_interval(struct menu_device *data,
> - unsigned int predicted_us)
> +static unsigned int get_typical_interval(struct menu_device *data)
> {
> int i, divisor;
> unsigned int min, max, thresh, avg;
> @@ -195,13 +196,6 @@ again:
> }
> }
>
> - /*
> - * If the result of the computation is going to be discarded anyway,
> - * avoid the computation altogether.
> - */
> - if (min >= predicted_us)
> - return UINT_MAX;
> -
> if (divisor == INTERVALS)
> avg = sum >> INTERVAL_SHIFT;
> else
> @@ -267,7 +261,6 @@ static int menu_select(struct cpuidle_dr
> {
> struct menu_device *data = this_cpu_ptr(&menu_devices);
> s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
> - unsigned int predicted_us;
> u64 predicted_ns;
> u64 interactivity_req;
> unsigned int nr_iowaiters;
> @@ -279,16 +272,41 @@ static int menu_select(struct cpuidle_dr
> data->needs_update = 0;
> }
>
> - /* determine the expected residency time, round up */
> - delta = tick_nohz_get_sleep_length(&delta_tick);
> - if (unlikely(delta < 0)) {
> - delta = 0;
> - delta_tick = 0;
> - }
> - data->next_timer_ns = delta;
> -
> nr_iowaiters = nr_iowait_cpu(dev->cpu);
> - data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
> +
> + /* Find the shortest expected idle interval. */
> + predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
> + if (predicted_ns > RESIDENCY_THRESHOLD_NS) {
> + unsigned int timer_us;
> +
> + /* Determine the time till the closest timer. */
> + delta = tick_nohz_get_sleep_length(&delta_tick);
> + if (unlikely(delta < 0)) {
> + delta = 0;
> + delta_tick = 0;
> + }
> +
> + data->next_timer_ns = delta;
> + data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
> +
> + /* Round up the result for half microseconds. */
> + timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
> + data->next_timer_ns *
> + data->correction_factor[data->bucket],
> + RESOLUTION * DECAY * NSEC_PER_USEC);
> + /* Use the lowest expected idle interval to pick the idle state. */
> + predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns);
> + } else {
> + /*
> + * Because the next timer event is not going to be determined
> + * in this case, assume that without the tick the closest timer
> + * will be in distant future and that the closest tick will occur
> + * after 1/2 of the tick period.
> + */
> + data->next_timer_ns = KTIME_MAX;
> + delta_tick = TICK_NSEC / 2;
> + data->bucket = which_bucket(KTIME_MAX, nr_iowaiters);
> + }
>
> if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
> ((data->next_timer_ns < drv->states[1].target_residency_ns ||
> @@ -303,16 +321,6 @@ static int menu_select(struct cpuidle_dr
> return 0;
> }
>
> - /* Round up the result for half microseconds. */
> - predicted_us = div_u64(data->next_timer_ns *
> - data->correction_factor[data->bucket] +
> - (RESOLUTION * DECAY * NSEC_PER_USEC) / 2,
> - RESOLUTION * DECAY * NSEC_PER_USEC);
> - /* Use the lowest expected idle interval to pick the idle state. */
> - predicted_ns = (u64)min(predicted_us,
> - get_typical_interval(data, predicted_us)) *
> - NSEC_PER_USEC;
> -
> if (tick_nohz_tick_stopped()) {
> /*
> * If the tick is already stopped, the cost of possible short
> Index: linux-pm/drivers/cpuidle/governors/teo.c
> ===================================================================
> --- linux-pm.orig/drivers/cpuidle/governors/teo.c
> +++ linux-pm/drivers/cpuidle/governors/teo.c
> @@ -140,6 +140,8 @@
> #include <linux/sched/topology.h>
> #include <linux/tick.h>
>
> +#include "gov.h"
> +
> /*
> * The number of bits to shift the CPU's capacity by in order to determine
> * the utilized threshold.
> Index: linux-pm/drivers/cpuidle/governors/gov.h
> ===================================================================
> --- /dev/null
> +++ linux-pm/drivers/cpuidle/governors/gov.h
> @@ -0,0 +1,14 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/* Common definitions for cpuidle governors. */
> +
> +#ifndef __CPUIDLE_GOVERNOR_H
> +#define __CPUIDLE_GOVERNOR_H
> +
> +/*
> + * Idle state target residency threshold used for deciding whether or not to
> + * check the time till the closest expected timer event.
> + */
> +#define RESIDENCY_THRESHOLD_NS (15 * NSEC_PER_USEC)
> +
> +#endif /* __CPUIDLE_GOVERNOR_H */

This patch is now present in the git branch at

git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git \
pm-cpuidle-gov

along with the previous teo governor changes.