Re: [PATCH v3 1/3] sched/pelt: Move pelt related code in a dedicated file

From: Quentin Perret
Date: Mon Dec 11 2017 - 09:08:47 EST


Hi Vincent,

Although I agree that moving the PELT code in a dedicated file is
probably the cleanest way to achieve what you want, I was wondering if
you were able no measure any overhead due to moving the __update_load_avg_*()
functions in a different translation unit ? This is introducing function calls
in latency sensitive paths (wakeup for ex) so I'm just wondering what's
the impact in practice.

Thanks,
Quentin

On Wednesday 22 Nov 2017 at 15:35:53 (+0100), Vincent Guittot wrote:
> We want to track rt_rq's utilization as a part of the estimation of the
> whole rq's utilization. This is necessary because rt tasks can steal
> utilization to cfs tasks and make them lighter than they are.
> As we want to use the same load tracking mecanism for both and prevent
> useless dependency between cfs and rt code, pelt code is moved in a
> dedicated file.
>
> Signed-off-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
> ---
> kernel/sched/Makefile | 2 +-
> kernel/sched/fair.c | 308 +-------------------------------------------------
> kernel/sched/pelt.c | 308 ++++++++++++++++++++++++++++++++++++++++++++++++++
> kernel/sched/pelt.h | 17 +++
> kernel/sched/sched.h | 20 ++++
> 5 files changed, 347 insertions(+), 308 deletions(-)
> create mode 100644 kernel/sched/pelt.c
> create mode 100644 kernel/sched/pelt.h
>
> diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
> index e2f9d4f..5a6d1c1 100644
> --- a/kernel/sched/Makefile
> +++ b/kernel/sched/Makefile
> @@ -19,7 +19,7 @@ endif
> obj-y += core.o loadavg.o clock.o cputime.o
> obj-y += idle_task.o fair.o rt.o deadline.o
> obj-y += wait.o wait_bit.o swait.o completion.o idle.o
> -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
> +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
> obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
> obj-$(CONFIG_SCHEDSTATS) += stats.o
> obj-$(CONFIG_SCHED_DEBUG) += debug.o
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 0989676..b88550e 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -270,9 +270,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
> return cfs_rq->rq;
> }
>
> -/* An entity is a task if it doesn't "own" a runqueue */
> -#define entity_is_task(se) (!se->my_q)
> -
> static inline struct task_struct *task_of(struct sched_entity *se)
> {
> SCHED_WARN_ON(!entity_is_task(se));
> @@ -434,7 +431,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
> return container_of(cfs_rq, struct rq, cfs);
> }
>
> -#define entity_is_task(se) 1
>
> #define for_each_sched_entity(se) \
> for (; se; se = NULL)
> @@ -707,7 +703,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> }
>
> #ifdef CONFIG_SMP
> -
> +#include "pelt.h"
> #include "sched-pelt.h"
>
> static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
> @@ -2723,19 +2719,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
> } while (0)
>
> #ifdef CONFIG_SMP
> -/*
> - * XXX we want to get rid of these helpers and use the full load resolution.
> - */
> -static inline long se_weight(struct sched_entity *se)
> -{
> - return scale_load_down(se->load.weight);
> -}
> -
> -static inline long se_runnable(struct sched_entity *se)
> -{
> - return scale_load_down(se->runnable_weight);
> -}
> -
> static inline void
> enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> @@ -3038,289 +3021,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
> }
>
> #ifdef CONFIG_SMP
> -/*
> - * Approximate:
> - * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
> - */
> -static u64 decay_load(u64 val, u64 n)
> -{
> - unsigned int local_n;
> -
> - if (unlikely(n > LOAD_AVG_PERIOD * 63))
> - return 0;
> -
> - /* after bounds checking we can collapse to 32-bit */
> - local_n = n;
> -
> - /*
> - * As y^PERIOD = 1/2, we can combine
> - * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
> - * With a look-up table which covers y^n (n<PERIOD)
> - *
> - * To achieve constant time decay_load.
> - */
> - if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
> - val >>= local_n / LOAD_AVG_PERIOD;
> - local_n %= LOAD_AVG_PERIOD;
> - }
> -
> - val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
> - return val;
> -}
> -
> -static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
> -{
> - u32 c1, c2, c3 = d3; /* y^0 == 1 */
> -
> - /*
> - * c1 = d1 y^p
> - */
> - c1 = decay_load((u64)d1, periods);
> -
> - /*
> - * p-1
> - * c2 = 1024 \Sum y^n
> - * n=1
> - *
> - * inf inf
> - * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> - * n=0 n=p
> - */
> - c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
> -
> - return c1 + c2 + c3;
> -}
> -
> -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
> -
> -/*
> - * Accumulate the three separate parts of the sum; d1 the remainder
> - * of the last (incomplete) period, d2 the span of full periods and d3
> - * the remainder of the (incomplete) current period.
> - *
> - * d1 d2 d3
> - * ^ ^ ^
> - * | | |
> - * |<->|<----------------->|<--->|
> - * ... |---x---|------| ... |------|-----x (now)
> - *
> - * p-1
> - * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
> - * n=1
> - *
> - * = u y^p + (Step 1)
> - *
> - * p-1
> - * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
> - * n=1
> - */
> -static __always_inline u32
> -accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> - unsigned long load, unsigned long runnable, int running)
> -{
> - unsigned long scale_freq, scale_cpu;
> - u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
> - u64 periods;
> -
> - scale_freq = arch_scale_freq_capacity(NULL, cpu);
> - scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> -
> - delta += sa->period_contrib;
> - periods = delta / 1024; /* A period is 1024us (~1ms) */
> -
> - /*
> - * Step 1: decay old *_sum if we crossed period boundaries.
> - */
> - if (periods) {
> - sa->load_sum = decay_load(sa->load_sum, periods);
> - sa->runnable_load_sum =
> - decay_load(sa->runnable_load_sum, periods);
> - sa->util_sum = decay_load((u64)(sa->util_sum), periods);
> -
> - /*
> - * Step 2
> - */
> - delta %= 1024;
> - contrib = __accumulate_pelt_segments(periods,
> - 1024 - sa->period_contrib, delta);
> - }
> - sa->period_contrib = delta;
> -
> - contrib = cap_scale(contrib, scale_freq);
> - if (load)
> - sa->load_sum += load * contrib;
> - if (runnable)
> - sa->runnable_load_sum += runnable * contrib;
> - if (running)
> - sa->util_sum += contrib * scale_cpu;
> -
> - return periods;
> -}
> -
> -/*
> - * We can represent the historical contribution to runnable average as the
> - * coefficients of a geometric series. To do this we sub-divide our runnable
> - * history into segments of approximately 1ms (1024us); label the segment that
> - * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
> - *
> - * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
> - * p0 p1 p2
> - * (now) (~1ms ago) (~2ms ago)
> - *
> - * Let u_i denote the fraction of p_i that the entity was runnable.
> - *
> - * We then designate the fractions u_i as our co-efficients, yielding the
> - * following representation of historical load:
> - * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
> - *
> - * We choose y based on the with of a reasonably scheduling period, fixing:
> - * y^32 = 0.5
> - *
> - * This means that the contribution to load ~32ms ago (u_32) will be weighted
> - * approximately half as much as the contribution to load within the last ms
> - * (u_0).
> - *
> - * When a period "rolls over" and we have new u_0`, multiplying the previous
> - * sum again by y is sufficient to update:
> - * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
> - * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
> - */
> -static __always_inline int
> -___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
> - unsigned long load, unsigned long runnable, int running)
> -{
> - u64 delta;
> -
> - delta = now - sa->last_update_time;
> - /*
> - * This should only happen when time goes backwards, which it
> - * unfortunately does during sched clock init when we swap over to TSC.
> - */
> - if ((s64)delta < 0) {
> - sa->last_update_time = now;
> - return 0;
> - }
> -
> - /*
> - * Use 1024ns as the unit of measurement since it's a reasonable
> - * approximation of 1us and fast to compute.
> - */
> - delta >>= 10;
> - if (!delta)
> - return 0;
> -
> - sa->last_update_time += delta << 10;
> -
> - /*
> - * running is a subset of runnable (weight) so running can't be set if
> - * runnable is clear. But there are some corner cases where the current
> - * se has been already dequeued but cfs_rq->curr still points to it.
> - * This means that weight will be 0 but not running for a sched_entity
> - * but also for a cfs_rq if the latter becomes idle. As an example,
> - * this happens during idle_balance() which calls
> - * update_blocked_averages()
> - */
> - if (!load)
> - runnable = running = 0;
> -
> - /*
> - * Now we know we crossed measurement unit boundaries. The *_avg
> - * accrues by two steps:
> - *
> - * Step 1: accumulate *_sum since last_update_time. If we haven't
> - * crossed period boundaries, finish.
> - */
> - if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
> - return 0;
> -
> - return 1;
> -}
> -
> -static __always_inline void
> -___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
> -{
> - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
> -
> - /*
> - * Step 2: update *_avg.
> - */
> - sa->load_avg = div_u64(load * sa->load_sum, divider);
> - sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
> - sa->util_avg = sa->util_sum / divider;
> -}
> -
> -/*
> - * sched_entity:
> - *
> - * task:
> - * se_runnable() == se_weight()
> - *
> - * group: [ see update_cfs_group() ]
> - * se_weight() = tg->weight * grq->load_avg / tg->load_avg
> - * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
> - *
> - * load_sum := runnable_sum
> - * load_avg = se_weight(se) * runnable_avg
> - *
> - * runnable_load_sum := runnable_sum
> - * runnable_load_avg = se_runnable(se) * runnable_avg
> - *
> - * XXX collapse load_sum and runnable_load_sum
> - *
> - * cfq_rs:
> - *
> - * load_sum = \Sum se_weight(se) * se->avg.load_sum
> - * load_avg = \Sum se->avg.load_avg
> - *
> - * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
> - * runnable_load_avg = \Sum se->avg.runable_load_avg
> - */
> -
> -static int
> -__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
> -{
> - if (entity_is_task(se))
> - se->runnable_weight = se->load.weight;
> -
> - if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
> - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
> - return 1;
> - }
> -
> - return 0;
> -}
> -
> -static int
> -__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
> -{
> - if (entity_is_task(se))
> - se->runnable_weight = se->load.weight;
> -
> - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
> - cfs_rq->curr == se)) {
> -
> - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
> - return 1;
> - }
> -
> - return 0;
> -}
> -
> -static int
> -__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
> -{
> - if (___update_load_sum(now, cpu, &cfs_rq->avg,
> - scale_load_down(cfs_rq->load.weight),
> - scale_load_down(cfs_rq->runnable_weight),
> - cfs_rq->curr != NULL)) {
> -
> - ___update_load_avg(&cfs_rq->avg, 1, 1);
> - return 1;
> - }
> -
> - return 0;
> -}
> -
> #ifdef CONFIG_FAIR_GROUP_SCHED
> /**
> * update_tg_load_avg - update the tg's load avg
> @@ -3831,12 +3531,6 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
>
> #else /* CONFIG_SMP */
>
> -static inline int
> -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> -{
> - return 0;
> -}
> -
> #define UPDATE_TG 0x0
> #define SKIP_AGE_LOAD 0x0
> #define DO_ATTACH 0x0
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> new file mode 100644
> index 0000000..da6d84f
> --- /dev/null
> +++ b/kernel/sched/pelt.c
> @@ -0,0 +1,308 @@
> +/*
> + * Per Entity Load Tracking
> + *
> + * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@xxxxxxxxxx>
> + *
> + * Interactivity improvements by Mike Galbraith
> + * (C) 2007 Mike Galbraith <efault@xxxxxx>
> + *
> + * Various enhancements by Dmitry Adamushko.
> + * (C) 2007 Dmitry Adamushko <dmitry.adamushko@xxxxxxxxx>
> + *
> + * Group scheduling enhancements by Srivatsa Vaddagiri
> + * Copyright IBM Corporation, 2007
> + * Author: Srivatsa Vaddagiri <vatsa@xxxxxxxxxxxxxxxxxx>
> + *
> + * Scaled math optimizations by Thomas Gleixner
> + * Copyright (C) 2007, Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> + *
> + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
> + *
> + * Move PELT related code from fair.c into this pelt.c file
> + * Author: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
> + */
> +
> +#include <linux/sched.h>
> +#include "sched.h"
> +#include "sched-pelt.h"
> +
> +/*
> + * Approximate:
> + * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
> + */
> +static u64 decay_load(u64 val, u64 n)
> +{
> + unsigned int local_n;
> +
> + if (unlikely(n > LOAD_AVG_PERIOD * 63))
> + return 0;
> +
> + /* after bounds checking we can collapse to 32-bit */
> + local_n = n;
> +
> + /*
> + * As y^PERIOD = 1/2, we can combine
> + * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
> + * With a look-up table which covers y^n (n<PERIOD)
> + *
> + * To achieve constant time decay_load.
> + */
> + if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
> + val >>= local_n / LOAD_AVG_PERIOD;
> + local_n %= LOAD_AVG_PERIOD;
> + }
> +
> + val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
> + return val;
> +}
> +
> +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
> +{
> + u32 c1, c2, c3 = d3; /* y^0 == 1 */
> +
> + /*
> + * c1 = d1 y^p
> + */
> + c1 = decay_load((u64)d1, periods);
> +
> + /*
> + * p-1
> + * c2 = 1024 \Sum y^n
> + * n=1
> + *
> + * inf inf
> + * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> + * n=0 n=p
> + */
> + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
> +
> + return c1 + c2 + c3;
> +}
> +
> +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
> +
> +/*
> + * Accumulate the three separate parts of the sum; d1 the remainder
> + * of the last (incomplete) period, d2 the span of full periods and d3
> + * the remainder of the (incomplete) current period.
> + *
> + * d1 d2 d3
> + * ^ ^ ^
> + * | | |
> + * |<->|<----------------->|<--->|
> + * ... |---x---|------| ... |------|-----x (now)
> + *
> + * p-1
> + * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
> + * n=1
> + *
> + * = u y^p + (Step 1)
> + *
> + * p-1
> + * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
> + * n=1
> + */
> +static __always_inline u32
> +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> + unsigned long load, unsigned long runnable, int running)
> +{
> + unsigned long scale_freq, scale_cpu;
> + u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
> + u64 periods;
> +
> + scale_freq = arch_scale_freq_capacity(NULL, cpu);
> + scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> +
> + delta += sa->period_contrib;
> + periods = delta / 1024; /* A period is 1024us (~1ms) */
> +
> + /*
> + * Step 1: decay old *_sum if we crossed period boundaries.
> + */
> + if (periods) {
> + sa->load_sum = decay_load(sa->load_sum, periods);
> + sa->runnable_load_sum =
> + decay_load(sa->runnable_load_sum, periods);
> + sa->util_sum = decay_load((u64)(sa->util_sum), periods);
> +
> + /*
> + * Step 2
> + */
> + delta %= 1024;
> + contrib = __accumulate_pelt_segments(periods,
> + 1024 - sa->period_contrib, delta);
> + }
> + sa->period_contrib = delta;
> +
> + contrib = cap_scale(contrib, scale_freq);
> + if (load)
> + sa->load_sum += load * contrib;
> + if (runnable)
> + sa->runnable_load_sum += runnable * contrib;
> + if (running)
> + sa->util_sum += contrib * scale_cpu;
> +
> + return periods;
> +}
> +
> +/*
> + * We can represent the historical contribution to runnable average as the
> + * coefficients of a geometric series. To do this we sub-divide our runnable
> + * history into segments of approximately 1ms (1024us); label the segment that
> + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
> + *
> + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
> + * p0 p1 p2
> + * (now) (~1ms ago) (~2ms ago)
> + *
> + * Let u_i denote the fraction of p_i that the entity was runnable.
> + *
> + * We then designate the fractions u_i as our co-efficients, yielding the
> + * following representation of historical load:
> + * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
> + *
> + * We choose y based on the with of a reasonably scheduling period, fixing:
> + * y^32 = 0.5
> + *
> + * This means that the contribution to load ~32ms ago (u_32) will be weighted
> + * approximately half as much as the contribution to load within the last ms
> + * (u_0).
> + *
> + * When a period "rolls over" and we have new u_0`, multiplying the previous
> + * sum again by y is sufficient to update:
> + * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
> + * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
> + */
> +static __always_inline int
> +___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
> + unsigned long load, unsigned long runnable, int running)
> +{
> + u64 delta;
> +
> + delta = now - sa->last_update_time;
> + /*
> + * This should only happen when time goes backwards, which it
> + * unfortunately does during sched clock init when we swap over to TSC.
> + */
> + if ((s64)delta < 0) {
> + sa->last_update_time = now;
> + return 0;
> + }
> +
> + /*
> + * Use 1024ns as the unit of measurement since it's a reasonable
> + * approximation of 1us and fast to compute.
> + */
> + delta >>= 10;
> + if (!delta)
> + return 0;
> +
> + sa->last_update_time += delta << 10;
> +
> + /*
> + * running is a subset of runnable (weight) so running can't be set if
> + * runnable is clear. But there are some corner cases where the current
> + * se has been already dequeued but cfs_rq->curr still points to it.
> + * This means that weight will be 0 but not running for a sched_entity
> + * but also for a cfs_rq if the latter becomes idle. As an example,
> + * this happens during idle_balance() which calls
> + * update_blocked_averages()
> + */
> + if (!load)
> + runnable = running = 0;
> +
> + /*
> + * Now we know we crossed measurement unit boundaries. The *_avg
> + * accrues by two steps:
> + *
> + * Step 1: accumulate *_sum since last_update_time. If we haven't
> + * crossed period boundaries, finish.
> + */
> + if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
> + return 0;
> +
> + return 1;
> +}
> +
> +static __always_inline void
> +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
> +{
> + u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
> +
> + /*
> + * Step 2: update *_avg.
> + */
> + sa->load_avg = div_u64(load * sa->load_sum, divider);
> + sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
> + sa->util_avg = sa->util_sum / divider;
> +}
> +
> +/*
> + * sched_entity:
> + *
> + * task:
> + * se_runnable() == se_weight()
> + *
> + * group: [ see update_cfs_group() ]
> + * se_weight() = tg->weight * grq->load_avg / tg->load_avg
> + * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
> + *
> + * load_sum := runnable_sum
> + * load_avg = se_weight(se) * runnable_avg
> + *
> + * runnable_load_sum := runnable_sum
> + * runnable_load_avg = se_runnable(se) * runnable_avg
> + *
> + * XXX collapse load_sum and runnable_load_sum
> + *
> + * cfq_rs:
> + *
> + * load_sum = \Sum se_weight(se) * se->avg.load_sum
> + * load_avg = \Sum se->avg.load_avg
> + *
> + * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
> + * runnable_load_avg = \Sum se->avg.runable_load_avg
> + */
> +
> +int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
> +{
> + if (entity_is_task(se))
> + se->runnable_weight = se->load.weight;
> +
> + if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
> + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
> + return 1;
> + }
> +
> + return 0;
> +}
> +
> +int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> + if (entity_is_task(se))
> + se->runnable_weight = se->load.weight;
> +
> + if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
> + cfs_rq->curr == se)) {
> +
> + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
> + return 1;
> + }
> +
> + return 0;
> +}
> +
> +int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
> +{
> + if (___update_load_sum(now, cpu, &cfs_rq->avg,
> + scale_load_down(cfs_rq->load.weight),
> + scale_load_down(cfs_rq->runnable_weight),
> + cfs_rq->curr != NULL)) {
> +
> + ___update_load_avg(&cfs_rq->avg, 1, 1);
> + return 1;
> + }
> +
> + return 0;
> +}
> diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
> new file mode 100644
> index 0000000..c312d8c
> --- /dev/null
> +++ b/kernel/sched/pelt.h
> @@ -0,0 +1,17 @@
> +#ifdef CONFIG_SMP
> +
> +int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
> +int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
> +int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
> +
> +#else
> +
> +static inline int
> +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> +{
> + return 0;
> +}
> +
> +#endif
> +
> +
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 45ab0bf..6fefef6 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -528,6 +528,7 @@ struct rt_rq {
> unsigned long rt_nr_total;
> int overloaded;
> struct plist_head pushable_tasks;
> +
> #endif /* CONFIG_SMP */
> int rt_queued;
>
> @@ -602,7 +603,26 @@ struct dl_rq {
> u64 bw_ratio;
> };
>
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +/* An entity is a task if it doesn't "own" a runqueue */
> +#define entity_is_task(se) (!se->my_q)
> +#else
> +#define entity_is_task(se) 1
> +#endif
> +
> #ifdef CONFIG_SMP
> +/*
> + * XXX we want to get rid of these helpers and use the full load resolution.
> + */
> +static inline long se_weight(struct sched_entity *se)
> +{
> + return scale_load_down(se->load.weight);
> +}
> +
> +static inline long se_runnable(struct sched_entity *se)
> +{
> + return scale_load_down(se->runnable_weight);
> +}
>
> static inline bool sched_asym_prefer(int a, int b)
> {
> --
> 2.7.4
>