Re: [RESEND PATCH v2 1/2] sched/rt: add utilization tracking

From: Peter Zijlstra
Date: Mon Aug 07 2017 - 12:45:08 EST


On Fri, Aug 04, 2017 at 03:40:21PM +0200, Vincent Guittot wrote:

> There were several comments on v1:
> - As raised by Peter for v1, if IRQ time is taken into account in
> rt_avg, it will not be accounted in rq->clock_task. This means that cfs
> utilization is not affected by extra contributions or decays caused
> by IRQ time.

Right.
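
(For reference, the accounting that makes this true, roughly as it stands in
update_rq_clock_task() with only the CONFIG_IRQ_TIME_ACCOUNTING side shown and
the paravirt steal handling elided:

	static void update_rq_clock_task(struct rq *rq, s64 delta)
	{
		s64 irq_delta;

		/* time spent in hard/soft IRQ since the last update */
		irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

		/* irq_time is only updated on {soft,}irq_exit, don't overshoot */
		if (irq_delta > delta)
			irq_delta = delta;

		rq->prev_irq_time += irq_delta;
		delta -= irq_delta;

		/*
		 * IRQ time never reaches clock_task, and CFS PELT runs off
		 * clock_task, so cfs utilization is not perturbed by it.
		 */
		rq->clock_task += delta;
	}

The second patch below hooks update_irq_load_avg() into exactly this spot and
feeds it rq->clock rather than rq->clock_task.)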

> - Regarding the sync of rt and cfs utilization: both cfs and rt use the same
> rq->clock_task, so we have the same issue as cfs regarding blocked values.
> The utilization of idle cfs/rt rqs is not updated regularly but only when a
> load_balance is triggered (more precisely, on a call to
> update_blocked_averages()). I'd like to fix this issue for both cfs and rt
> with a separate patch that will ensure that utilization (and load) are
> updated regularly even for idle CPUs.

Yeah, that needs help.
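
(To make the staleness concrete: PELT only decays a sched_avg when
___update_load_avg() is actually called for it with a fresh timestamp,

	util_avg(now) ~= util_avg(last_update_time) * y^p
	(y^32 = 1/2, p = number of 1024us periods elapsed in between)

and for an idle CPU that call currently only comes from
update_blocked_averages() on the load-balance path, so its reported blocked
utilization simply stays at the last computed value until the next balance
pass looks at it.)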

> - One last open question is the location of the rt utilization functions in
> the fair.c file. PELT-related functions should probably move into a dedicated
> pelt.c file. This would also help to address one comment about having a place
> to update the metrics of NOHZ idle CPUs. Thoughts?

Probably, but I have a bunch of patches lined up changing that code, so
let's not do that now.

In any case, would something like the attached patches make sense? They
completely replace rt_avg with separate IRQ, RT and DL tracking.
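
(To illustrate where this ends up: with the third patch, the capacity left
over for CFS on a CPU is derived from the three new averages, roughly

	used = rq->irq_avg.util_avg + rq->rt.avg.util_avg + rq->dl.avg.util_avg;
	capacity = SCHED_CAPACITY_SCALE - used;

so, with made-up numbers, 50 (IRQ) + 200 (RT) + 100 (DL) out of
SCHED_CAPACITY_SCALE = 1024 leaves scale_rt_capacity() reporting 674 for CFS.)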


Subject: sched: Add DL utilization tracking
From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Date: Mon Aug 7 16:49:48 CEST 2017

Track how much time we spend running DL tasks on average.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
 kernel/sched/deadline.c |   11 ++++++++++-
 kernel/sched/fair.c     |   12 ++++++++++++
 kernel/sched/sched.h    |    2 ++
 3 files changed, 24 insertions(+), 1 deletion(-)

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1655,6 +1655,8 @@ static struct sched_dl_entity *pick_next
return rb_entry(left, struct sched_dl_entity, rb_node);
}

+extern int update_dl_rq_load_avg(u64 now, int cpu, struct dl_rq *dl_rq, int running);
+
struct task_struct *
pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -1702,19 +1704,25 @@ pick_next_task_dl(struct rq *rq, struct
p->se.exec_start = rq_clock_task(rq);

/* Running task will never be pushed. */
- dequeue_pushable_dl_task(rq, p);
+ dequeue_pushable_dl_task(rq, p);

if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);

queue_push_tasks(rq);

+ if (p) {
+ update_dl_rq_load_avg(rq_clock_task(rq), cpu_of(rq), dl_rq,
+ rq->curr->sched_class == &dl_sched_class);
+ }
+
return p;
}

static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
update_curr_dl(rq);
+ update_dl_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->dl, 1);

if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
@@ -1723,6 +1731,7 @@ static void put_prev_task_dl(struct rq *
static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
+ update_dl_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->dl, 1);

/*
* Even when we have runtime, update_curr_dl() might have resulted in us
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3017,6 +3017,11 @@ int update_rt_rq_load_avg(u64 now, int c
return ___update_load_avg(now, cpu, &rt_rq->avg, 0, running, NULL);
}

+int update_dl_rq_load_avg(u64 now, int cpu, struct dl_rq *dl_rq, int running)
+{
+ return ___update_load_avg(now, cpu, &dl_rq->avg, 0, running, NULL);
+}
+

/*
* Signed add and clamp on underflow.
@@ -3550,6 +3555,11 @@ int update_rt_rq_load_avg(u64 now, int c
return 0;
}

+int update_dl_rq_load_avg(u64 now, int cpu, struct dl_rq *dl_rq, int running)
+{
+ return 0;
+}
+
#define UPDATE_TG 0x0
#define SKIP_AGE_LOAD 0x0

@@ -6945,6 +6955,7 @@ static void update_blocked_averages(int
}

update_rt_rq_load_avg(rq_clock_task(rq), cpu, &rq->rt, 0);
+ update_dl_rq_load_avg(rq_clock_task(rq), cpu, &rq->dl, 0);
rq_unlock_irqrestore(rq, &rf);
}

@@ -7005,6 +7016,7 @@ static inline void update_blocked_averag
update_rq_clock(rq);
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
update_rt_rq_load_avg(rq_clock_task(rq), cpu, &rq->rt, 0);
+ update_dl_rq_load_avg(rq_clock_task(rq), cpu, &rq->dl, 0);
rq_unlock_irqrestore(rq, &rf);
}

--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -580,6 +580,8 @@ struct dl_rq {
*/
struct rb_root pushable_dl_tasks_root;
struct rb_node *pushable_dl_tasks_leftmost;
+
+ struct sched_avg avg;
#else
struct dl_bw dl_bw;
#endif
Subject: sched: Add IRQ utilization tracking
From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Date: Mon Aug 7 17:22:46 CEST 2017

Track how much time we spend on IRQ...

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
 kernel/sched/core.c  |    6 +++++-
 kernel/sched/fair.c  |   12 ++++++++++++
 kernel/sched/sched.h |    2 ++
 3 files changed, 19 insertions(+), 1 deletion(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -151,6 +151,8 @@ struct rq *task_rq_lock(struct task_stru
}
}

+extern int update_irq_load_avg(u64 now, int cpu, struct rq *rq, int running);
+
/*
* RQ-clock updating methods:
*/
@@ -204,8 +206,10 @@ static void update_rq_clock_task(struct
rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
- if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
+ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) {
sched_rt_avg_update(rq, irq_delta + steal);
+ update_irq_load_avg(rq->clock, cpu_of(rq), rq, 1);
+ }
#endif
}

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3012,6 +3012,11 @@ __update_load_avg_cfs_rq(u64 now, int cp
cfs_rq->curr != NULL, cfs_rq);
}

+int update_irq_load_avg(u64 now, int cpu, struct rq *rq, int running)
+{
+ return ___update_load_avg(now, cpu, &rq->irq_avg, 0, running, NULL);
+}
+
int update_rt_rq_load_avg(u64 now, int cpu, struct rt_rq *rt_rq, int running)
{
return ___update_load_avg(now, cpu, &rt_rq->avg, 0, running, NULL);
@@ -3550,6 +3555,11 @@ update_cfs_rq_load_avg(u64 now, struct c
return 0;
}

+int update_irq_load_avg(u64 now, int cpu, struct rq *rq, int running)
+{
+ return 0;
+}
+
int update_rt_rq_load_avg(u64 now, int cpu, struct rt_rq *rt_rq, int running)
{
return 0;
@@ -6926,6 +6936,7 @@ static void update_blocked_averages(int

rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
+ update_irq_load_avg(rq_clock(rq), cpu, rq, 0);

/*
* Iterates the task_group tree in a bottom up fashion, see
@@ -7014,6 +7025,7 @@ static inline void update_blocked_averag

rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
+ update_irq_load_avg(rq_clock(rq), cpu, rq, 0);
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
update_rt_rq_load_avg(rq_clock_task(rq), cpu, &rq->rt, 0);
update_dl_rq_load_avg(rq_clock_task(rq), cpu, &rq->dl, 0);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -748,6 +748,8 @@ struct rq {

struct list_head cfs_tasks;

+ struct sched_avg irq_avg;
+
u64 rt_avg;
u64 age_stamp;
u64 idle_stamp;
Subject: sched: Replace rt_avg
From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Date: Mon Aug 7 18:26:25 CEST 2017

Now that we have separate IRQ, RT and DL utilization tracking, we can
fully replace the old rt_avg code, reducing the amount of statistics.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
 include/linux/sched/sysctl.h |    1 -
 kernel/sched/core.c          |   40 ++--------------------------------------
 kernel/sched/deadline.c      |    2 --
 kernel/sched/fair.c          |   21 +++++----------------
 kernel/sched/rt.c            |    2 --
 kernel/sched/sched.h         |   26 +-------------------------
 kernel/sysctl.c              |    7 -------
 7 files changed, 8 insertions(+), 91 deletions(-)

--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -39,7 +39,6 @@ extern unsigned int sysctl_numa_balancin
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
-extern unsigned int sysctl_sched_time_avg;

int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -62,14 +62,6 @@ const_debug unsigned int sysctl_sched_fe
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
- * period over which we average the RT time consumption, measured
- * in ms.
- *
- * default: 1s
- */
-const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
-
-/*
* period over which we measure -rt task CPU usage in us.
* default: 1s
*/
@@ -161,7 +153,7 @@ static void update_rq_clock_task(struct
{
/*
* In theory, the compile should just see 0 here, and optimize out the call
- * to sched_rt_avg_update. But I don't trust it...
+ * to update_irq_load_avg(). But I don't trust it...
*/
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
s64 steal = 0, irq_delta = 0;
@@ -206,10 +198,8 @@ static void update_rq_clock_task(struct
rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
- if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) {
- sched_rt_avg_update(rq, irq_delta + steal);
+ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
update_irq_load_avg(rq->clock, cpu_of(rq), rq, 1);
- }
#endif
}

@@ -673,23 +663,6 @@ bool sched_can_stop_tick(struct rq *rq)
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
-
-void sched_avg_update(struct rq *rq)
-{
- s64 period = sched_avg_period();
-
- while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
- /*
- * Inline assembly required to prevent the compiler
- * optimising this loop into a divmod call.
- * See __iter_div_u64_rem() for another example of this.
- */
- asm("" : "+rm" (rq->age_stamp));
- rq->age_stamp += period;
- rq->rt_avg /= 2;
- }
-}
-
#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -5515,13 +5488,6 @@ void set_rq_offline(struct rq *rq)
}
}

-static void set_cpu_rq_start_time(unsigned int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
-
- rq->age_stamp = sched_clock_cpu(cpu);
-}
-
/*
* used to mark begin/end of suspend/resume:
*/
@@ -5640,7 +5606,6 @@ static void sched_rq_cpu_starting(unsign

int sched_cpu_starting(unsigned int cpu)
{
- set_cpu_rq_start_time(cpu);
sched_rq_cpu_starting(cpu);
return 0;
}
@@ -5919,7 +5884,6 @@ void __init sched_init(void)
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
- set_cpu_rq_start_time(smp_processor_id());
#endif
init_sched_fair_class();

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1147,8 +1147,6 @@ static void update_curr_dl(struct rq *rq
curr->se.exec_start = rq_clock_task(rq);
cpuacct_charge(curr, delta_exec);

- sched_rt_avg_update(rq, delta_exec);
-
if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM))
delta_exec = grub_reclaim(delta_exec, rq, &curr->dl);
dl_se->runtime -= delta_exec;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5151,8 +5151,6 @@ static void cpu_load_update(struct rq *t

this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
}
-
- sched_avg_update(this_rq);
}

/* Used instead of source_load when we know the type == 0 */
@@ -7134,23 +7132,14 @@ static inline int get_sd_load_idx(struct
static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, used, age_stamp, avg;
- s64 delta;
+ unsigned long used;

/*
- * Since we're reading these variables without serialization make sure
- * we read them once before doing sanity checks on them.
+ * Subtract IRQ, RT and DL time as all of those preempt CFS.
*/
- age_stamp = READ_ONCE(rq->age_stamp);
- avg = READ_ONCE(rq->rt_avg);
- delta = __rq_clock_broken(rq) - age_stamp;
-
- if (unlikely(delta < 0))
- delta = 0;
-
- total = sched_avg_period() + delta;
-
- used = div_u64(avg, total);
+ used = READ_ONCE(rq->irq_avg.util_avg) +
+ READ_ONCE(rq->rt.avg.util_avg) +
+ READ_ONCE(rq->dl.avg.util_avg);

if (likely(used < SCHED_CAPACITY_SCALE))
return SCHED_CAPACITY_SCALE - used;
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -981,8 +981,6 @@ static void update_curr_rt(struct rq *rq
curr->se.exec_start = rq_clock_task(rq);
cpuacct_charge(curr, delta_exec);

- sched_rt_avg_update(rq, delta_exec);
-
if (!rt_bandwidth_enabled())
return;

--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -750,8 +750,6 @@ struct rq {

struct sched_avg irq_avg;

- u64 rt_avg;
- u64 age_stamp;
u64 idle_stamp;
u64 avg_idle;

@@ -843,11 +841,6 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)

-static inline u64 __rq_clock_broken(struct rq *rq)
-{
- return READ_ONCE(rq->clock);
-}
-
/*
* rq::clock_update_flags bits
*
@@ -1621,15 +1614,9 @@ extern void deactivate_task(struct rq *r

extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);

-extern const_debug unsigned int sysctl_sched_time_avg;
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;

-static inline u64 sched_avg_period(void)
-{
- return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
-}
-
#ifdef CONFIG_SCHED_HRTICK

/*
@@ -1658,8 +1645,6 @@ static inline int hrtick_enabled(struct
#endif /* CONFIG_SCHED_HRTICK */

#ifdef CONFIG_SMP
-extern void sched_avg_update(struct rq *rq);
-
#ifndef arch_scale_freq_capacity
static __always_inline
unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
@@ -1678,16 +1663,7 @@ unsigned long arch_scale_cpu_capacity(st
return SCHED_CAPACITY_SCALE;
}
#endif
-
-static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
-{
- rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
- sched_avg_update(rq);
-}
-#else
-static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
-static inline void sched_avg_update(struct rq *rq) { }
-#endif
+#endif /* CONFIG_SMP */

struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(rq->lock);
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -362,13 +362,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
- .procname = "sched_time_avg_ms",
- .data = &sysctl_sched_time_avg,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",