[RFC PATCH 3/5] sched: Restructure nohz_balancer_kick

From: Peter Zijlstra
Date: Thu Dec 21 2017 - 05:23:50 EST




Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
kernel/sched/fair.c | 218 ++++++++++++++++++++++++++--------------------------
1 file changed, 111 insertions(+), 107 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8984,12 +8984,29 @@ static inline int find_new_ilb(void)
return nr_cpu_ids;
}

+static inline void set_cpu_sd_state_busy(void)
+{
+ struct sched_domain *sd;
+ int cpu = smp_processor_id();
+
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+
+ if (!sd || !sd->nohz_idle)
+ goto unlock;
+ sd->nohz_idle = 0;
+
+ atomic_inc(&sd->shared->nr_busy_cpus);
+unlock:
+ rcu_read_unlock();
+}
+
/*
* Kick a CPU to do the nohz balancing, if it is time for it. We pick the
* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
* CPU (if there is one).
*/
-static void nohz_balancer_kick(void)
+static void kick_ilb(void)
{
unsigned int flags;
int ilb_cpu;
@@ -9004,6 +9021,7 @@ static void nohz_balancer_kick(void)
flags = atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(ilb_cpu));
if (flags & NOHZ_KICK_MASK)
return;
+
/*
* Use smp_send_reschedule() instead of resched_cpu().
* This way we generate a sched IPI on the target cpu which
@@ -9011,7 +9029,94 @@ static void nohz_balancer_kick(void)
* will be run before returning from the IPI.
*/
smp_send_reschedule(ilb_cpu);
- return;
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer in the presence
+ * of an idle CPU in the system.
+ * - This rq has more than one task.
+ * - This rq has at least one CFS task and the capacity of the CPU is
+ * significantly reduced because of RT tasks or IRQs.
+ * - At parent of LLC scheduler domain level, this CPU's scheduler group has
+ * multiple busy CPUs.
+ * - For SD_ASYM_PACKING, if the lower numbered CPUs in the scheduler
+ * domain span are idle.
+ */
+static void nohz_balancer_kick(struct rq *rq)
+{
+ unsigned long now = jiffies;
+ struct sched_domain_shared *sds;
+ struct sched_domain *sd;
+ int nr_busy, i, cpu = rq->cpu;
+ bool kick = false;
+
+ if (unlikely(rq->idle_balance))
+ return;
+
+ /*
+ * We may have recently been in ticked or tickless idle mode. At the first
+ * busy tick after returning from idle, we will update the busy stats.
+ */
+ set_cpu_sd_state_busy();
+ nohz_balance_exit_idle(cpu);
+
+ /*
+ * No CPUs are in tickless mode, hence no need for NOHZ idle load
+ * balancing.
+ */
+ if (likely(!atomic_read(&nohz.nr_cpus)))
+ return;
+
+ if (time_before(now, nohz.next_balance))
+ return;
+
+ if (rq->nr_running >= 2) {
+ kick = true;
+ goto out;
+ }
+
+ rcu_read_lock();
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds) {
+ /*
+ * XXX: write a coherent comment on why we do this.
+ * See also: http://lkml.kernel.org/r/20111202010832.602203411@xxxxxxxxxxxxxxxxxxxxxxxxxx
+ */
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
+ if (nr_busy > 1) {
+ kick = true;
+ goto unlock;
+ }
+
+ }
+
+ sd = rcu_dereference(rq->sd);
+ if (sd) {
+ if ((rq->cfs.h_nr_running >= 1) &&
+ check_cpu_capacity(rq, sd)) {
+ kick = true;
+ goto unlock;
+ }
+ }
+
+ sd = rcu_dereference(per_cpu(sd_asym, cpu));
+ if (sd) {
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (i == cpu ||
+ !cpumask_test_cpu(i, nohz.idle_cpus_mask))
+ continue;
+
+ if (sched_asym_prefer(i, cpu)) {
+ kick = true;
+ goto unlock;
+ }
+ }
+ }
+unlock:
+ rcu_read_unlock();
+out:
+ if (kick)
+ kick_ilb();
}

void nohz_balance_exit_idle(unsigned int cpu)
@@ -9031,23 +9136,6 @@ void nohz_balance_exit_idle(unsigned int
}
}

-static inline void set_cpu_sd_state_busy(void)
-{
- struct sched_domain *sd;
- int cpu = smp_processor_id();
-
- rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
-
- if (!sd || !sd->nohz_idle)
- goto unlock;
- sd->nohz_idle = 0;
-
- atomic_inc(&sd->shared->nr_busy_cpus);
-unlock:
- rcu_read_unlock();
-}
-
void set_cpu_sd_state_idle(void)
{
struct sched_domain *sd;
@@ -9094,6 +9182,8 @@ void nohz_balance_enter_idle(int cpu)
atomic_inc(&nohz.nr_cpus);
atomic_or(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
+#else
+static inline void nohz_balancer_kick(struct rq *rq) { }
#endif

static DEFINE_SPINLOCK(balancing);
@@ -9291,90 +9381,6 @@ static bool nohz_idle_balance(struct rq

return true;
}
-
-/*
- * Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu in the system.
- * - This rq has more than one task.
- * - This rq has at least one CFS task and the capacity of the CPU is
- * significantly reduced because of RT tasks or IRQs.
- * - At parent of LLC scheduler domain level, this cpu's scheduler group has
- * multiple busy cpu.
- * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
- * domain span are idle.
- */
-static inline bool nohz_kick_needed(struct rq *rq)
-{
- unsigned long now = jiffies;
- struct sched_domain_shared *sds;
- struct sched_domain *sd;
- int nr_busy, i, cpu = rq->cpu;
- bool kick = false;
-
- if (unlikely(rq->idle_balance))
- return false;
-
- /*
- * We may be recently in ticked or tickless idle mode. At the first
- * busy tick after returning from idle, we will update the busy stats.
- */
- set_cpu_sd_state_busy();
- nohz_balance_exit_idle(cpu);
-
- /*
- * None are in tickless mode and hence no need for NOHZ idle load
- * balancing.
- */
- if (likely(!atomic_read(&nohz.nr_cpus)))
- return false;
-
- if (time_before(now, nohz.next_balance))
- return false;
-
- if (rq->nr_running >= 2)
- return true;
-
- rcu_read_lock();
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
- if (sds) {
- /*
- * XXX: write a coherent comment on why we do this.
- * See also: http://lkml.kernel.org/r/20111202010832.602203411@xxxxxxxxxxxxxxxxxxxxxxxxxx
- */
- nr_busy = atomic_read(&sds->nr_busy_cpus);
- if (nr_busy > 1) {
- kick = true;
- goto unlock;
- }
-
- }
-
- sd = rcu_dereference(rq->sd);
- if (sd) {
- if ((rq->cfs.h_nr_running >= 1) &&
- check_cpu_capacity(rq, sd)) {
- kick = true;
- goto unlock;
- }
- }
-
- sd = rcu_dereference(per_cpu(sd_asym, cpu));
- if (sd) {
- for_each_cpu(i, sched_domain_span(sd)) {
- if (i == cpu ||
- !cpumask_test_cpu(i, nohz.idle_cpus_mask))
- continue;
-
- if (sched_asym_prefer(i, cpu)) {
- kick = true;
- goto unlock;
- }
- }
- }
-unlock:
- rcu_read_unlock();
- return kick;
-}
#else
static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
@@ -9419,10 +9425,8 @@ void trigger_load_balance(struct rq *rq)

if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
-#ifdef CONFIG_NO_HZ_COMMON
- if (nohz_kick_needed(rq))
- nohz_balancer_kick();
-#endif
+
+ nohz_balancer_kick(rq);
}

static void rq_online_fair(struct rq *rq)