Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

From: Vincent Guittot
Date: Tue Dec 12 2023 - 06:06:51 EST


On Fri, 8 Dec 2023 at 02:52, Qais Yousef <qyousef@xxxxxxxxxxx> wrote:
>
> Due to the way the code is structured, it makes a lot of sense to trigger
> cpufreq_update_util() from update_load_avg(). But this is too aggressive,
> as in most cases we are iterating through entities in a loop to call
> update_load_avg() on the hierarchy. So we end up sending too many
> requests in a loop as we're updating the hierarchy.
>
> Combined with the rate limit in schedutil, we could end up prematurely
> sending a wrong frequency update before we have actually updated all
> entities appropriately.
>
> Be smarter about it by limiting the trigger of frequency updates to
> after all accounting logic has been done. This ended up being at the
> following points:
>
> 1. enqueue/dequeue_task_fair()
> 2. throttle/unthrottle_cfs_rq()
> 3. attach/detach_task_cfs_rq()
> 4. task_tick_fair()
> 5. __sched_group_set_shares()
>
> This is still not 100% ideal, due to other limitations that might be
> a bit harder to handle. Namely, we can end up with premature update
> requests in the following situations:
>
> a. Simultaneous task enqueues on the same CPU where the 2nd task is bigger
> and requires a higher freq. The trigger of cpufreq_update_util() by the
> first task will lead to dropping the 2nd request until the tick, or until
> another CPU in the same policy triggers a freq update.
>
> b. CPUs sharing a policy can end up with the same race as in (a), but with
> the simultaneous enqueues happening on different CPUs in the same policy.
>
> The above, though, are limitations in the governor/hardware, and from the
> scheduler's point of view at least that's the best we can do. The
> governor might consider smarter logic to aggregate near-simultaneous
> requests and honour the higher one.
>
> Signed-off-by: Qais Yousef (Google) <qyousef@xxxxxxxxxxx>
> ---
> kernel/sched/fair.c | 55 ++++++++++++---------------------------------
> 1 file changed, 14 insertions(+), 41 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index b83448be3f79..f99910fc6705 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3997,29 +3997,6 @@ static inline void update_cfs_group(struct sched_entity *se)
> }
> #endif /* CONFIG_FAIR_GROUP_SCHED */
>
> -static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
> -{
> - struct rq *rq = rq_of(cfs_rq);
> -
> - if (&rq->cfs == cfs_rq) {
> - /*
> - * There are a few boundary cases this might miss but it should
> - * get called often enough that that should (hopefully) not be
> - * a real problem.
> - *
> - * It will not get called when we go idle, because the idle
> - * thread is a different class (!fair), nor will the utilization
> - * number include things like RT tasks.
> - *
> - * As is, the util number is not freq-invariant (we'd have to
> - * implement arch_scale_freq_capacity() for that).
> - *
> - * See cpu_util_cfs().
> - */
> - cpufreq_update_util(rq, flags);
> - }
> -}
> -
> #ifdef CONFIG_SMP
> static inline bool load_avg_is_decayed(struct sched_avg *sa)
> {
> @@ -4648,8 +4625,6 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
>
> add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
>
> - cfs_rq_util_change(cfs_rq, 0);
> -
> trace_pelt_cfs_tp(cfs_rq);
> }
>
> @@ -4678,8 +4653,6 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
>
> add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
>
> - cfs_rq_util_change(cfs_rq, 0);
> -
> trace_pelt_cfs_tp(cfs_rq);
> }
>
> @@ -4726,11 +4699,8 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
> */
> detach_entity_load_avg(cfs_rq, se);
> update_tg_load_avg(cfs_rq);
> - } else if (decayed) {
> - cfs_rq_util_change(cfs_rq, 0);
> -
> - if (flags & UPDATE_TG)
> - update_tg_load_avg(cfs_rq);
> + } else if (decayed && (flags & UPDATE_TG)) {
> + update_tg_load_avg(cfs_rq);
> }
> }
>
> @@ -5114,7 +5084,6 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>
> static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
> {
> - cfs_rq_util_change(cfs_rq, 0);
> }
>
> static inline void remove_entity_load_avg(struct sched_entity *se) {}
> @@ -5807,6 +5776,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
> sub_nr_running(rq, task_delta);
>
> done:
> + cpufreq_update_util(rq, 0);
> +
> /*
> * Note: distribution will already see us throttled via the
> * throttled-list. rq->lock protects completion.
> @@ -5899,6 +5870,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> unthrottle_throttle:
> assert_list_leaf_cfs_rq(rq);
>
> + cpufreq_update_util(rq, 0);
> +
> /* Determine whether we need to wake up potentially idle CPU: */
> if (rq->curr == rq->idle && rq->cfs.nr_running)
> resched_curr(rq);
> @@ -6704,14 +6677,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> */
> util_est_enqueue(&rq->cfs, p);
>
> - /*
> - * If in_iowait is set, the code below may not trigger any cpufreq
> - * utilization updates, so do it here explicitly with the IOWAIT flag
> - * passed.
> - */
> - if (p->in_iowait)
> - cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
> -
> for_each_sched_entity(se) {
> if (se->on_rq)
> break;
> @@ -6772,6 +6737,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> enqueue_throttle:
> assert_list_leaf_cfs_rq(rq);
>

Here and in the other places below, you lose:

- } else if (decayed) {

The decayed condition ensures a rate limit (~1ms) on the number of
calls to cpufreq_update_util().

enqueue/dequeue/tick don't create any sudden change in the PELT
signals that would require updating cpufreq, unlike attach/detach.
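
As a rough illustration of that rate limit (a minimal userspace sketch, not
kernel code; all names below are stand-in stubs, not real kernel APIs):
gating the update on "decayed" bounds cpufreq_update_util() to roughly one
call per 1024us PELT period no matter how often update_load_avg() runs,
while an unconditional call at every enqueue/dequeue/tick has no such bound.

/*
 * Minimal userspace sketch (not kernel code): gating the cpufreq update on
 * the PELT "decayed" condition limits it to roughly one call per 1024us
 * period, whereas an unconditional call on every enqueue/dequeue/tick has
 * no such bound.  All names here are illustrative stubs.
 */
#include <stdbool.h>
#include <stdio.h>

#define PELT_PERIOD_US 1024

static unsigned int nr_updates;

/* Stand-in for cpufreq_update_util(): just count how often we would call it. */
static void cpufreq_update_util_stub(void)
{
        nr_updates++;
}

/*
 * Stand-in for the PELT accrual: report "decayed" only when we cross a
 * 1024us period boundary, which is what makes the pre-patch gate a rate
 * limit.
 */
static bool update_load_avg_stub(unsigned long long now_us,
                                 unsigned long long *last_period_us)
{
        if (now_us - *last_period_us < PELT_PERIOD_US)
                return false;

        *last_period_us = now_us - (now_us % PELT_PERIOD_US);
        return true;
}

int main(void)
{
        unsigned long long last_period_us = 0;
        unsigned long long now_us;

        /* Pretend something pokes update_load_avg() every 100us for 10ms. */
        for (now_us = 0; now_us < 10000; now_us += 100) {
                bool decayed = update_load_avg_stub(now_us, &last_period_us);

                if (decayed)    /* the pre-patch "} else if (decayed)" gate */
                        cpufreq_update_util_stub();
        }

        /* ~10 updates over 10ms with the gate; 100 without it. */
        printf("updates with the 'decayed' gate: %u\n", nr_updates);
        return 0;
}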


> + cpufreq_update_util(rq, p->in_iowait ? SCHED_CPUFREQ_IOWAIT : 0);
> +
> hrtick_update(rq);
> }
>
> @@ -6849,6 +6816,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>
> dequeue_throttle:
> util_est_update(&rq->cfs, p, task_sleep);
> + cpufreq_update_util(rq, 0);
> hrtick_update(rq);
> }
>
> @@ -8482,6 +8450,7 @@ done: __maybe_unused;
>
> update_misfit_status(p, rq);
> sched_fair_update_stop_tick(rq, p);
> + cpufreq_update_util(rq, 0);
>
> return p;
>
> @@ -12615,6 +12584,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>
> update_misfit_status(curr, rq);
> update_overutilized_status(task_rq(curr));
> + cpufreq_update_util(rq, 0);
>
> task_tick_core(rq, curr);
> }
> @@ -12739,6 +12709,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
> struct sched_entity *se = &p->se;
>
> detach_entity_cfs_rq(se);
> + cpufreq_update_util(task_rq(p), 0);
> }
>
> static void attach_task_cfs_rq(struct task_struct *p)
> @@ -12746,6 +12717,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
> struct sched_entity *se = &p->se;
>
> attach_entity_cfs_rq(se);
> + cpufreq_update_util(task_rq(p), 0);
> }
>
> static void switched_from_fair(struct rq *rq, struct task_struct *p)
> @@ -12991,6 +12963,7 @@ static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
> update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
> update_cfs_group(se);
> }
> + cpufreq_update_util(rq, 0);
> rq_unlock_irqrestore(rq, &rf);
> }
>
> --
> 2.34.1
>