Re: [PATCH 1/2] sched: force update of blocked load of idle cpus

From: Vincent Guittot
Date: Mon Nov 20 2017 - 04:05:11 EST


On 24 October 2017 at 14:25, Brendan Jackman <brendan.jackman@xxxxxxx> wrote:
> From: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
>
> When idle, the blocked load of CPUs will be updated only when an idle
> load balance is triggered which may never happen. Because of this
> uncertainty on the execution of idle load balance, the utilization,
> the load and the shares of idle cfs_rq can stay artificially high and
> steal shares and running time to busy cfs_rqs of the task group.
> Add a new light idle load balance state which ensures that blocked loads
> are periodically updated and decayed but does not perform any task
> migration.
>
> The remote load udpates are rate-limited, so that they are not
> performed with a shorter period than LOAD_AVG_PERIOD (i.e. PELT
> half-life). This is the period after which we have a known 50% error
> in stale load.
>
> Cc: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
> Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: Morten Rasmussen <morten.rasmussen@xxxxxxx>
> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Signed-off-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
> [Switched remote update interval to use PELT half life]
> [Moved update_blocked_averges call outside rebalance_domains
> to simplify code]
> Signed-off-by: Brendan Jackman <brendan.jackman@xxxxxxx>
> ---
> kernel/sched/fair.c | 71 +++++++++++++++++++++++++++++++++++++++++++++-------
> kernel/sched/sched.h | 1 +
> 2 files changed, 63 insertions(+), 9 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 85d1ec1c3b39..9085caf49c76 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5976,6 +5976,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
> return min_cap * 1024 < task_util(p) * capacity_margin;
> }
>
> +static inline bool nohz_kick_needed(struct rq *rq, bool only_update);
> +static void nohz_balancer_kick(bool only_update);
> +
> /*
> * select_task_rq_fair: Select target runqueue for the waking task in domains
> * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
> @@ -6074,6 +6077,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
> }
> rcu_read_unlock();
>
> +#ifdef CONFIG_NO_HZ_COMMON
> + if (nohz_kick_needed(cpu_rq(new_cpu), true))
> + nohz_balancer_kick(true);
> +#endif
> +
> return new_cpu;
> }
>
> @@ -8653,6 +8661,7 @@ static struct {
> cpumask_var_t idle_cpus_mask;
> atomic_t nr_cpus;
> unsigned long next_balance; /* in jiffy units */
> + unsigned long next_update; /* in jiffy units */
> } nohz ____cacheline_aligned;
>
> static inline int find_new_ilb(void)
> @@ -8670,7 +8679,7 @@ static inline int find_new_ilb(void)
> * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
> * CPU (if there is one).
> */
> -static void nohz_balancer_kick(void)
> +static void nohz_balancer_kick(bool only_update)
> {
> int ilb_cpu;
>
> @@ -8683,6 +8692,10 @@ static void nohz_balancer_kick(void)
>
> if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
> return;
> +
> + if (only_update)
> + set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
> +
> /*
> * Use smp_send_reschedule() instead of resched_cpu().
> * This way we generate a sched IPI on the target cpu which
> @@ -8770,6 +8783,8 @@ void nohz_balance_enter_idle(int cpu)
> atomic_inc(&nohz.nr_cpus);
> set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
> }
> +#else
> +static inline void nohz_balancer_kick(bool only_update) {}
> #endif
>
> static DEFINE_SPINLOCK(balancing);
> @@ -8801,8 +8816,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
> int need_serialize, need_decay = 0;
> u64 max_cost = 0;
>
> - update_blocked_averages(cpu);
> -
> rcu_read_lock();
> for_each_domain(cpu, sd) {
> /*
> @@ -8901,6 +8914,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> {
> int this_cpu = this_rq->cpu;
> struct rq *rq;
> + struct sched_domain *sd;
> int balance_cpu;
> /* Earliest time when we have to do rebalance again */
> unsigned long next_balance = jiffies + 60*HZ;
> @@ -8910,6 +8924,23 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
> goto end;
>
> + /*
> + * This cpu is going to update the blocked load of idle CPUs either
> + * before doing a rebalancing or just to keep metrics up to date. we
> + * can safely update the next update timestamp
> + */
> + rcu_read_lock();
> + sd = rcu_dereference(this_rq->sd);
> + /*
> + * Check whether there is a sched_domain available for this cpu.
> + * The last other cpu can have been unplugged since the ILB has been
> + * triggered and the sched_domain can now be null. The idle balance
> + * sequence will quickly be aborted as there is no more idle CPUs
> + */
> + if (sd)
> + nohz.next_update = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
> + rcu_read_unlock();
> +
> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
> continue;
> @@ -8936,7 +8967,15 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> cpu_load_update_idle(rq);
> rq_unlock_irq(rq, &rf);
>
> - rebalance_domains(rq, CPU_IDLE);
> + update_blocked_averages(balance_cpu);
> + /*
> + * This idle load balance softirq may have been
> + * triggered only to update the blocked load and shares
> + * of idle CPUs (which we have just done for
> + * balance_cpu). In that case skip the actual balance.
> + */
> + if (!test_bit(NOHZ_STATS_KICK, nohz_flags(this_cpu)))
> + rebalance_domains(rq, idle);
> }
>
> if (time_after(next_balance, rq->next_balance)) {
> @@ -8967,7 +9006,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
> * domain span are idle.
> */
> -static inline bool nohz_kick_needed(struct rq *rq)
> +static inline bool nohz_kick_needed(struct rq *rq, bool only_update)
> {
> unsigned long now = jiffies;
> struct sched_domain_shared *sds;
> @@ -8975,7 +9014,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
> int nr_busy, i, cpu = rq->cpu;
> bool kick = false;
>
> - if (unlikely(rq->idle_balance))
> + if (unlikely(rq->idle_balance) && !only_update)
> return false;
>
> /*
> @@ -8992,6 +9031,13 @@ static inline bool nohz_kick_needed(struct rq *rq)
> if (likely(!atomic_read(&nohz.nr_cpus)))
> return false;
>
> + if (only_update) {
> + if (time_before(now, nohz.next_update))
> + return false;
> + else
> + return true;
> + }
> +
> if (time_before(now, nohz.next_balance))
> return false;
>
> @@ -9041,6 +9087,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
> }
> #else
> static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
> +static inline bool nohz_kick_needed(struct rq *rq, bool only_update) { return false; }
> #endif
>
> /*
> @@ -9062,7 +9109,12 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
> * and abort nohz_idle_balance altogether if we pull some load.
> */
> nohz_idle_balance(this_rq, idle);
> - rebalance_domains(this_rq, idle);
> + update_blocked_averages(this_rq->cpu);
> + if (!test_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu)))

The NOHZ_STATS_KICK field is only defined with CONFIG_NO_HZ_COMMON.

> + rebalance_domains(this_rq, idle);
> +#ifdef CONFIG_NO_HZ_COMMON
> + clear_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu));
> +#endif
> }
>
> /*
> @@ -9077,8 +9129,8 @@ void trigger_load_balance(struct rq *rq)
> if (time_after_eq(jiffies, rq->next_balance))
> raise_softirq(SCHED_SOFTIRQ);
> #ifdef CONFIG_NO_HZ_COMMON
> - if (nohz_kick_needed(rq))
> - nohz_balancer_kick();
> + if (nohz_kick_needed(rq, false))
> + nohz_balancer_kick(false);
> #endif
> }
>
> @@ -9657,6 +9709,7 @@ __init void init_sched_fair_class(void)
>
> #ifdef CONFIG_NO_HZ_COMMON
> nohz.next_balance = jiffies;
> + nohz.next_update = jiffies;
> zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
> #endif
> #endif /* SMP */
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 14db76cd496f..6f95ef653f73 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1978,6 +1978,7 @@ extern void cfs_bandwidth_usage_dec(void);
> enum rq_nohz_flag_bits {
> NOHZ_TICK_STOPPED,
> NOHZ_BALANCE_KICK,
> + NOHZ_STATS_KICK
> };
>
> #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
> --
> 2.14.1
>