Re: [PATCH v3 1/3] sched/pelt: Move pelt related code in a dedicated file

From: Quentin Perret
Date: Tue Dec 12 2017 - 07:35:35 EST


On Tuesday 12 Dec 2017 at 12:21:39 (+0100), Vincent Guittot wrote:
> Hi Quentin,
>
> On 11 December 2017 at 15:08, Quentin Perret <quentin.perret@xxxxxxx> wrote:
> > Hi Vincent,
> >
> > Although I agree that moving the PELT code in a dedicated file is
> > probably the cleanest way to achieve what you want, I was wondering if
> > you were able no measure any overhead due to moving the __update_load_avg_*()
> > functions in a different translation unit ? This is introducing function calls
> > in latency sensitive paths (wakeup for ex) so I'm just wondering what's
> > the impact in practice.
>
> To answer your question, I haven't seen any performance difference
> with perf bench sched pipe on a hikey board with the patchset
> Then, where do you think that this will explicitly introduce function
> call ?

For example, without the patch, __update_load_avg_blocked_se() was static
and the compiler had the opportunity to inline it where appropriate. With
the patch, the function is now in a different translation unit so I think
an actual function call is always needed. I'm not saying it's a big deal,
I was just wondering if that might have a noticeable impact.

The only alternative I see is to put the rt rq load update functions in
fair.c but that doesn't quiet feel like the right thing to do anyway ...

> It's not obvious for me that the patch that moves pelt code,
> will generate such changes. size vmlinux gives me the exact same
> results with and without the patch that moves pelt code.
> size vmlinux of latest tip/sched/core sha1 a555e9d86ee3
> $ size vmlinux
> text data bss dec hex filename
> 10677259 5931388 402104 17010751 103903f vmlinux_orig
>
> size vmlinux with patch 1 rebased on latest tip/sched/core
> $ size vmlinux
> text data bss dec hex filename
> 10677259 5931388 402104 17010751 103903f vmlinux_patch_rt
>
> >
> > Thanks,
> > Quentin
> >
> > On Wednesday 22 Nov 2017 at 15:35:53 (+0100), Vincent Guittot wrote:
> >> We want to track rt_rq's utilization as a part of the estimation of the
> >> whole rq's utilization. This is necessary because rt tasks can steal
> >> utilization to cfs tasks and make them lighter than they are.
> >> As we want to use the same load tracking mecanism for both and prevent
> >> useless dependency between cfs and rt code, pelt code is moved in a
> >> dedicated file.
> >>
> >> Signed-off-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
> >> ---
> >> kernel/sched/Makefile | 2 +-
> >> kernel/sched/fair.c | 308 +-------------------------------------------------
> >> kernel/sched/pelt.c | 308 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >> kernel/sched/pelt.h | 17 +++
> >> kernel/sched/sched.h | 20 ++++
> >> 5 files changed, 347 insertions(+), 308 deletions(-)
> >> create mode 100644 kernel/sched/pelt.c
> >> create mode 100644 kernel/sched/pelt.h
> >>
> >> diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
> >> index e2f9d4f..5a6d1c1 100644
> >> --- a/kernel/sched/Makefile
> >> +++ b/kernel/sched/Makefile
> >> @@ -19,7 +19,7 @@ endif
> >> obj-y += core.o loadavg.o clock.o cputime.o
> >> obj-y += idle_task.o fair.o rt.o deadline.o
> >> obj-y += wait.o wait_bit.o swait.o completion.o idle.o
> >> -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
> >> +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
> >> obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
> >> obj-$(CONFIG_SCHEDSTATS) += stats.o
> >> obj-$(CONFIG_SCHED_DEBUG) += debug.o
> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >> index 0989676..b88550e 100644
> >> --- a/kernel/sched/fair.c
> >> +++ b/kernel/sched/fair.c
> >> @@ -270,9 +270,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
> >> return cfs_rq->rq;
> >> }
> >>
> >> -/* An entity is a task if it doesn't "own" a runqueue */
> >> -#define entity_is_task(se) (!se->my_q)
> >> -
> >> static inline struct task_struct *task_of(struct sched_entity *se)
> >> {
> >> SCHED_WARN_ON(!entity_is_task(se));
> >> @@ -434,7 +431,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
> >> return container_of(cfs_rq, struct rq, cfs);
> >> }
> >>
> >> -#define entity_is_task(se) 1
> >>
> >> #define for_each_sched_entity(se) \
> >> for (; se; se = NULL)
> >> @@ -707,7 +703,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >> }
> >>
> >> #ifdef CONFIG_SMP
> >> -
> >> +#include "pelt.h"
> >> #include "sched-pelt.h"
> >>
> >> static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
> >> @@ -2723,19 +2719,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >> } while (0)
> >>
> >> #ifdef CONFIG_SMP
> >> -/*
> >> - * XXX we want to get rid of these helpers and use the full load resolution.
> >> - */
> >> -static inline long se_weight(struct sched_entity *se)
> >> -{
> >> - return scale_load_down(se->load.weight);
> >> -}
> >> -
> >> -static inline long se_runnable(struct sched_entity *se)
> >> -{
> >> - return scale_load_down(se->runnable_weight);
> >> -}
> >> -
> >> static inline void
> >> enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >> {
> >> @@ -3038,289 +3021,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
> >> }
> >>
> >> #ifdef CONFIG_SMP
> >> -/*
> >> - * Approximate:
> >> - * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
> >> - */
> >> -static u64 decay_load(u64 val, u64 n)
> >> -{
> >> - unsigned int local_n;
> >> -
> >> - if (unlikely(n > LOAD_AVG_PERIOD * 63))
> >> - return 0;
> >> -
> >> - /* after bounds checking we can collapse to 32-bit */
> >> - local_n = n;
> >> -
> >> - /*
> >> - * As y^PERIOD = 1/2, we can combine
> >> - * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
> >> - * With a look-up table which covers y^n (n<PERIOD)
> >> - *
> >> - * To achieve constant time decay_load.
> >> - */
> >> - if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
> >> - val >>= local_n / LOAD_AVG_PERIOD;
> >> - local_n %= LOAD_AVG_PERIOD;
> >> - }
> >> -
> >> - val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
> >> - return val;
> >> -}
> >> -
> >> -static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
> >> -{
> >> - u32 c1, c2, c3 = d3; /* y^0 == 1 */
> >> -
> >> - /*
> >> - * c1 = d1 y^p
> >> - */
> >> - c1 = decay_load((u64)d1, periods);
> >> -
> >> - /*
> >> - * p-1
> >> - * c2 = 1024 \Sum y^n
> >> - * n=1
> >> - *
> >> - * inf inf
> >> - * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> >> - * n=0 n=p
> >> - */
> >> - c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
> >> -
> >> - return c1 + c2 + c3;
> >> -}
> >> -
> >> -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
> >> -
> >> -/*
> >> - * Accumulate the three separate parts of the sum; d1 the remainder
> >> - * of the last (incomplete) period, d2 the span of full periods and d3
> >> - * the remainder of the (incomplete) current period.
> >> - *
> >> - * d1 d2 d3
> >> - * ^ ^ ^
> >> - * | | |
> >> - * |<->|<----------------->|<--->|
> >> - * ... |---x---|------| ... |------|-----x (now)
> >> - *
> >> - * p-1
> >> - * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
> >> - * n=1
> >> - *
> >> - * = u y^p + (Step 1)
> >> - *
> >> - * p-1
> >> - * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
> >> - * n=1
> >> - */
> >> -static __always_inline u32
> >> -accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> >> - unsigned long load, unsigned long runnable, int running)
> >> -{
> >> - unsigned long scale_freq, scale_cpu;
> >> - u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
> >> - u64 periods;
> >> -
> >> - scale_freq = arch_scale_freq_capacity(NULL, cpu);
> >> - scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> >> -
> >> - delta += sa->period_contrib;
> >> - periods = delta / 1024; /* A period is 1024us (~1ms) */
> >> -
> >> - /*
> >> - * Step 1: decay old *_sum if we crossed period boundaries.
> >> - */
> >> - if (periods) {
> >> - sa->load_sum = decay_load(sa->load_sum, periods);
> >> - sa->runnable_load_sum =
> >> - decay_load(sa->runnable_load_sum, periods);
> >> - sa->util_sum = decay_load((u64)(sa->util_sum), periods);
> >> -
> >> - /*
> >> - * Step 2
> >> - */
> >> - delta %= 1024;
> >> - contrib = __accumulate_pelt_segments(periods,
> >> - 1024 - sa->period_contrib, delta);
> >> - }
> >> - sa->period_contrib = delta;
> >> -
> >> - contrib = cap_scale(contrib, scale_freq);
> >> - if (load)
> >> - sa->load_sum += load * contrib;
> >> - if (runnable)
> >> - sa->runnable_load_sum += runnable * contrib;
> >> - if (running)
> >> - sa->util_sum += contrib * scale_cpu;
> >> -
> >> - return periods;
> >> -}
> >> -
> >> -/*
> >> - * We can represent the historical contribution to runnable average as the
> >> - * coefficients of a geometric series. To do this we sub-divide our runnable
> >> - * history into segments of approximately 1ms (1024us); label the segment that
> >> - * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
> >> - *
> >> - * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
> >> - * p0 p1 p2
> >> - * (now) (~1ms ago) (~2ms ago)
> >> - *
> >> - * Let u_i denote the fraction of p_i that the entity was runnable.
> >> - *
> >> - * We then designate the fractions u_i as our co-efficients, yielding the
> >> - * following representation of historical load:
> >> - * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
> >> - *
> >> - * We choose y based on the with of a reasonably scheduling period, fixing:
> >> - * y^32 = 0.5
> >> - *
> >> - * This means that the contribution to load ~32ms ago (u_32) will be weighted
> >> - * approximately half as much as the contribution to load within the last ms
> >> - * (u_0).
> >> - *
> >> - * When a period "rolls over" and we have new u_0`, multiplying the previous
> >> - * sum again by y is sufficient to update:
> >> - * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
> >> - * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
> >> - */
> >> -static __always_inline int
> >> -___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
> >> - unsigned long load, unsigned long runnable, int running)
> >> -{
> >> - u64 delta;
> >> -
> >> - delta = now - sa->last_update_time;
> >> - /*
> >> - * This should only happen when time goes backwards, which it
> >> - * unfortunately does during sched clock init when we swap over to TSC.
> >> - */
> >> - if ((s64)delta < 0) {
> >> - sa->last_update_time = now;
> >> - return 0;
> >> - }
> >> -
> >> - /*
> >> - * Use 1024ns as the unit of measurement since it's a reasonable
> >> - * approximation of 1us and fast to compute.
> >> - */
> >> - delta >>= 10;
> >> - if (!delta)
> >> - return 0;
> >> -
> >> - sa->last_update_time += delta << 10;
> >> -
> >> - /*
> >> - * running is a subset of runnable (weight) so running can't be set if
> >> - * runnable is clear. But there are some corner cases where the current
> >> - * se has been already dequeued but cfs_rq->curr still points to it.
> >> - * This means that weight will be 0 but not running for a sched_entity
> >> - * but also for a cfs_rq if the latter becomes idle. As an example,
> >> - * this happens during idle_balance() which calls
> >> - * update_blocked_averages()
> >> - */
> >> - if (!load)
> >> - runnable = running = 0;
> >> -
> >> - /*
> >> - * Now we know we crossed measurement unit boundaries. The *_avg
> >> - * accrues by two steps:
> >> - *
> >> - * Step 1: accumulate *_sum since last_update_time. If we haven't
> >> - * crossed period boundaries, finish.
> >> - */
> >> - if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
> >> - return 0;
> >> -
> >> - return 1;
> >> -}
> >> -
> >> -static __always_inline void
> >> -___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
> >> -{
> >> - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
> >> -
> >> - /*
> >> - * Step 2: update *_avg.
> >> - */
> >> - sa->load_avg = div_u64(load * sa->load_sum, divider);
> >> - sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
> >> - sa->util_avg = sa->util_sum / divider;
> >> -}
> >> -
> >> -/*
> >> - * sched_entity:
> >> - *
> >> - * task:
> >> - * se_runnable() == se_weight()
> >> - *
> >> - * group: [ see update_cfs_group() ]
> >> - * se_weight() = tg->weight * grq->load_avg / tg->load_avg
> >> - * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
> >> - *
> >> - * load_sum := runnable_sum
> >> - * load_avg = se_weight(se) * runnable_avg
> >> - *
> >> - * runnable_load_sum := runnable_sum
> >> - * runnable_load_avg = se_runnable(se) * runnable_avg
> >> - *
> >> - * XXX collapse load_sum and runnable_load_sum
> >> - *
> >> - * cfq_rs:
> >> - *
> >> - * load_sum = \Sum se_weight(se) * se->avg.load_sum
> >> - * load_avg = \Sum se->avg.load_avg
> >> - *
> >> - * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
> >> - * runnable_load_avg = \Sum se->avg.runable_load_avg
> >> - */
> >> -
> >> -static int
> >> -__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
> >> -{
> >> - if (entity_is_task(se))
> >> - se->runnable_weight = se->load.weight;
> >> -
> >> - if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
> >> - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
> >> - return 1;
> >> - }
> >> -
> >> - return 0;
> >> -}
> >> -
> >> -static int
> >> -__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
> >> -{
> >> - if (entity_is_task(se))
> >> - se->runnable_weight = se->load.weight;
> >> -
> >> - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
> >> - cfs_rq->curr == se)) {
> >> -
> >> - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
> >> - return 1;
> >> - }
> >> -
> >> - return 0;
> >> -}
> >> -
> >> -static int
> >> -__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
> >> -{
> >> - if (___update_load_sum(now, cpu, &cfs_rq->avg,
> >> - scale_load_down(cfs_rq->load.weight),
> >> - scale_load_down(cfs_rq->runnable_weight),
> >> - cfs_rq->curr != NULL)) {
> >> -
> >> - ___update_load_avg(&cfs_rq->avg, 1, 1);
> >> - return 1;
> >> - }
> >> -
> >> - return 0;
> >> -}
> >> -
> >> #ifdef CONFIG_FAIR_GROUP_SCHED
> >> /**
> >> * update_tg_load_avg - update the tg's load avg
> >> @@ -3831,12 +3531,6 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
> >>
> >> #else /* CONFIG_SMP */
> >>
> >> -static inline int
> >> -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> >> -{
> >> - return 0;
> >> -}
> >> -
> >> #define UPDATE_TG 0x0
> >> #define SKIP_AGE_LOAD 0x0
> >> #define DO_ATTACH 0x0
> >> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> >> new file mode 100644
> >> index 0000000..da6d84f
> >> --- /dev/null
> >> +++ b/kernel/sched/pelt.c
> >> @@ -0,0 +1,308 @@
> >> +/*
> >> + * Per Entity Load Tracking
> >> + *
> >> + * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@xxxxxxxxxx>
> >> + *
> >> + * Interactivity improvements by Mike Galbraith
> >> + * (C) 2007 Mike Galbraith <efault@xxxxxx>
> >> + *
> >> + * Various enhancements by Dmitry Adamushko.
> >> + * (C) 2007 Dmitry Adamushko <dmitry.adamushko@xxxxxxxxx>
> >> + *
> >> + * Group scheduling enhancements by Srivatsa Vaddagiri
> >> + * Copyright IBM Corporation, 2007
> >> + * Author: Srivatsa Vaddagiri <vatsa@xxxxxxxxxxxxxxxxxx>
> >> + *
> >> + * Scaled math optimizations by Thomas Gleixner
> >> + * Copyright (C) 2007, Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> >> + *
> >> + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
> >> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
> >> + *
> >> + * Move PELT related code from fair.c into this pelt.c file
> >> + * Author: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
> >> + */
> >> +
> >> +#include <linux/sched.h>
> >> +#include "sched.h"
> >> +#include "sched-pelt.h"
> >> +
> >> +/*
> >> + * Approximate:
> >> + * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
> >> + */
> >> +static u64 decay_load(u64 val, u64 n)
> >> +{
> >> + unsigned int local_n;
> >> +
> >> + if (unlikely(n > LOAD_AVG_PERIOD * 63))
> >> + return 0;
> >> +
> >> + /* after bounds checking we can collapse to 32-bit */
> >> + local_n = n;
> >> +
> >> + /*
> >> + * As y^PERIOD = 1/2, we can combine
> >> + * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
> >> + * With a look-up table which covers y^n (n<PERIOD)
> >> + *
> >> + * To achieve constant time decay_load.
> >> + */
> >> + if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
> >> + val >>= local_n / LOAD_AVG_PERIOD;
> >> + local_n %= LOAD_AVG_PERIOD;
> >> + }
> >> +
> >> + val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
> >> + return val;
> >> +}
> >> +
> >> +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
> >> +{
> >> + u32 c1, c2, c3 = d3; /* y^0 == 1 */
> >> +
> >> + /*
> >> + * c1 = d1 y^p
> >> + */
> >> + c1 = decay_load((u64)d1, periods);
> >> +
> >> + /*
> >> + * p-1
> >> + * c2 = 1024 \Sum y^n
> >> + * n=1
> >> + *
> >> + * inf inf
> >> + * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> >> + * n=0 n=p
> >> + */
> >> + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
> >> +
> >> + return c1 + c2 + c3;
> >> +}
> >> +
> >> +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
> >> +
> >> +/*
> >> + * Accumulate the three separate parts of the sum; d1 the remainder
> >> + * of the last (incomplete) period, d2 the span of full periods and d3
> >> + * the remainder of the (incomplete) current period.
> >> + *
> >> + * d1 d2 d3
> >> + * ^ ^ ^
> >> + * | | |
> >> + * |<->|<----------------->|<--->|
> >> + * ... |---x---|------| ... |------|-----x (now)
> >> + *
> >> + * p-1
> >> + * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
> >> + * n=1
> >> + *
> >> + * = u y^p + (Step 1)
> >> + *
> >> + * p-1
> >> + * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
> >> + * n=1
> >> + */
> >> +static __always_inline u32
> >> +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> >> + unsigned long load, unsigned long runnable, int running)
> >> +{
> >> + unsigned long scale_freq, scale_cpu;
> >> + u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
> >> + u64 periods;
> >> +
> >> + scale_freq = arch_scale_freq_capacity(NULL, cpu);
> >> + scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> >> +
> >> + delta += sa->period_contrib;
> >> + periods = delta / 1024; /* A period is 1024us (~1ms) */
> >> +
> >> + /*
> >> + * Step 1: decay old *_sum if we crossed period boundaries.
> >> + */
> >> + if (periods) {
> >> + sa->load_sum = decay_load(sa->load_sum, periods);
> >> + sa->runnable_load_sum =
> >> + decay_load(sa->runnable_load_sum, periods);
> >> + sa->util_sum = decay_load((u64)(sa->util_sum), periods);
> >> +
> >> + /*
> >> + * Step 2
> >> + */
> >> + delta %= 1024;
> >> + contrib = __accumulate_pelt_segments(periods,
> >> + 1024 - sa->period_contrib, delta);
> >> + }
> >> + sa->period_contrib = delta;
> >> +
> >> + contrib = cap_scale(contrib, scale_freq);
> >> + if (load)
> >> + sa->load_sum += load * contrib;
> >> + if (runnable)
> >> + sa->runnable_load_sum += runnable * contrib;
> >> + if (running)
> >> + sa->util_sum += contrib * scale_cpu;
> >> +
> >> + return periods;
> >> +}
> >> +
> >> +/*
> >> + * We can represent the historical contribution to runnable average as the
> >> + * coefficients of a geometric series. To do this we sub-divide our runnable
> >> + * history into segments of approximately 1ms (1024us); label the segment that
> >> + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
> >> + *
> >> + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
> >> + * p0 p1 p2
> >> + * (now) (~1ms ago) (~2ms ago)
> >> + *
> >> + * Let u_i denote the fraction of p_i that the entity was runnable.
> >> + *
> >> + * We then designate the fractions u_i as our co-efficients, yielding the
> >> + * following representation of historical load:
> >> + * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
> >> + *
> >> + * We choose y based on the with of a reasonably scheduling period, fixing:
> >> + * y^32 = 0.5
> >> + *
> >> + * This means that the contribution to load ~32ms ago (u_32) will be weighted
> >> + * approximately half as much as the contribution to load within the last ms
> >> + * (u_0).
> >> + *
> >> + * When a period "rolls over" and we have new u_0`, multiplying the previous
> >> + * sum again by y is sufficient to update:
> >> + * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
> >> + * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
> >> + */
> >> +static __always_inline int
> >> +___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
> >> + unsigned long load, unsigned long runnable, int running)
> >> +{
> >> + u64 delta;
> >> +
> >> + delta = now - sa->last_update_time;
> >> + /*
> >> + * This should only happen when time goes backwards, which it
> >> + * unfortunately does during sched clock init when we swap over to TSC.
> >> + */
> >> + if ((s64)delta < 0) {
> >> + sa->last_update_time = now;
> >> + return 0;
> >> + }
> >> +
> >> + /*
> >> + * Use 1024ns as the unit of measurement since it's a reasonable
> >> + * approximation of 1us and fast to compute.
> >> + */
> >> + delta >>= 10;
> >> + if (!delta)
> >> + return 0;
> >> +
> >> + sa->last_update_time += delta << 10;
> >> +
> >> + /*
> >> + * running is a subset of runnable (weight) so running can't be set if
> >> + * runnable is clear. But there are some corner cases where the current
> >> + * se has been already dequeued but cfs_rq->curr still points to it.
> >> + * This means that weight will be 0 but not running for a sched_entity
> >> + * but also for a cfs_rq if the latter becomes idle. As an example,
> >> + * this happens during idle_balance() which calls
> >> + * update_blocked_averages()
> >> + */
> >> + if (!load)
> >> + runnable = running = 0;
> >> +
> >> + /*
> >> + * Now we know we crossed measurement unit boundaries. The *_avg
> >> + * accrues by two steps:
> >> + *
> >> + * Step 1: accumulate *_sum since last_update_time. If we haven't
> >> + * crossed period boundaries, finish.
> >> + */
> >> + if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
> >> + return 0;
> >> +
> >> + return 1;
> >> +}
> >> +
> >> +static __always_inline void
> >> +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
> >> +{
> >> + u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
> >> +
> >> + /*
> >> + * Step 2: update *_avg.
> >> + */
> >> + sa->load_avg = div_u64(load * sa->load_sum, divider);
> >> + sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
> >> + sa->util_avg = sa->util_sum / divider;
> >> +}
> >> +
> >> +/*
> >> + * sched_entity:
> >> + *
> >> + * task:
> >> + * se_runnable() == se_weight()
> >> + *
> >> + * group: [ see update_cfs_group() ]
> >> + * se_weight() = tg->weight * grq->load_avg / tg->load_avg
> >> + * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
> >> + *
> >> + * load_sum := runnable_sum
> >> + * load_avg = se_weight(se) * runnable_avg
> >> + *
> >> + * runnable_load_sum := runnable_sum
> >> + * runnable_load_avg = se_runnable(se) * runnable_avg
> >> + *
> >> + * XXX collapse load_sum and runnable_load_sum
> >> + *
> >> + * cfq_rs:
> >> + *
> >> + * load_sum = \Sum se_weight(se) * se->avg.load_sum
> >> + * load_avg = \Sum se->avg.load_avg
> >> + *
> >> + * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
> >> + * runnable_load_avg = \Sum se->avg.runable_load_avg
> >> + */
> >> +
> >> +int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
> >> +{
> >> + if (entity_is_task(se))
> >> + se->runnable_weight = se->load.weight;
> >> +
> >> + if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
> >> + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
> >> + return 1;
> >> + }
> >> +
> >> + return 0;
> >> +}
> >> +
> >> +int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
> >> +{
> >> + if (entity_is_task(se))
> >> + se->runnable_weight = se->load.weight;
> >> +
> >> + if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
> >> + cfs_rq->curr == se)) {
> >> +
> >> + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
> >> + return 1;
> >> + }
> >> +
> >> + return 0;
> >> +}
> >> +
> >> +int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
> >> +{
> >> + if (___update_load_sum(now, cpu, &cfs_rq->avg,
> >> + scale_load_down(cfs_rq->load.weight),
> >> + scale_load_down(cfs_rq->runnable_weight),
> >> + cfs_rq->curr != NULL)) {
> >> +
> >> + ___update_load_avg(&cfs_rq->avg, 1, 1);
> >> + return 1;
> >> + }
> >> +
> >> + return 0;
> >> +}
> >> diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
> >> new file mode 100644
> >> index 0000000..c312d8c
> >> --- /dev/null
> >> +++ b/kernel/sched/pelt.h
> >> @@ -0,0 +1,17 @@
> >> +#ifdef CONFIG_SMP
> >> +
> >> +int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
> >> +int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
> >> +int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
> >> +
> >> +#else
> >> +
> >> +static inline int
> >> +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> >> +{
> >> + return 0;
> >> +}
> >> +
> >> +#endif
> >> +
> >> +
> >> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> >> index 45ab0bf..6fefef6 100644
> >> --- a/kernel/sched/sched.h
> >> +++ b/kernel/sched/sched.h
> >> @@ -528,6 +528,7 @@ struct rt_rq {
> >> unsigned long rt_nr_total;
> >> int overloaded;
> >> struct plist_head pushable_tasks;
> >> +
> >> #endif /* CONFIG_SMP */
> >> int rt_queued;
> >>
> >> @@ -602,7 +603,26 @@ struct dl_rq {
> >> u64 bw_ratio;
> >> };
> >>
> >> +#ifdef CONFIG_FAIR_GROUP_SCHED
> >> +/* An entity is a task if it doesn't "own" a runqueue */
> >> +#define entity_is_task(se) (!se->my_q)
> >> +#else
> >> +#define entity_is_task(se) 1
> >> +#endif
> >> +
> >> #ifdef CONFIG_SMP
> >> +/*
> >> + * XXX we want to get rid of these helpers and use the full load resolution.
> >> + */
> >> +static inline long se_weight(struct sched_entity *se)
> >> +{
> >> + return scale_load_down(se->load.weight);
> >> +}
> >> +
> >> +static inline long se_runnable(struct sched_entity *se)
> >> +{
> >> + return scale_load_down(se->runnable_weight);
> >> +}
> >>
> >> static inline bool sched_asym_prefer(int a, int b)
> >> {
> >> --
> >> 2.7.4
> >>