Re: [PATCH v2 1/2] sched/fair: Fix how load gets propagated from cfs_rq to its sched_entity

From: Peter Zijlstra
Date: Wed May 03 2017 - 14:00:47 EST




This is on my IVB-EP, 2 sockets, 10 cores / socket, 2 threads / core.

workload is constrained to 1 socket.

root@ivb-ep:~/bench/schbench# numactl -N 0 ./schbench -m 2 -t 10 -s 10000 -c 15000 -r 30
Latency percentiles (usec)
50.0000th: 21
75.0000th: 30
90.0000th: 38
95.0000th: 42
*99.0000th: 49
99.5000th: 53
99.9000th: 15024
min=0, max=15056


root@ivb-ep:~/bench/schbench# echo NO_FUDGE > /debug/sched_features; echo NO_FUDGE2 > /debug/sched_features ; mkdir /cgroup/ponies; echo $$ > /cgroup/ponies/tasks ; numactl -N 0 ./schbench -m 2 -t 10 -s 10000 -c 15000 -r 30 ; echo $$ > /cgroup/tasks ; rmdir /cgroup/ponies
Latency percentiles (usec)
50.0000th: 14
75.0000th: 19
90.0000th: 24
95.0000th: 28
*99.0000th: 57
99.5000th: 15024
99.9000th: 15024
min=0, max=15060


root@ivb-ep:~/bench/schbench# echo NO_FUDGE > /debug/sched_features; echo FUDGE2 > /debug/sched_features ; mkdir /cgroup/ponies; echo $$ > /cgroup/ponies/tasks ; numactl -N 0 ./schbench -m 2 -t 10 -s 10000 -c 15000 -r 30 ; echo $$ > /cgroup/tasks ; rmdir /cgroup/ponies
Latency percentiles (usec)
50.0000th: 14
75.0000th: 19
90.0000th: 24
95.0000th: 26
*99.0000th: 38
99.5000th: 49
99.9000th: 9648
min=0, max=15035


root@ivb-ep:~/bench/schbench# echo FUDGE > /debug/sched_features; echo FUDGE2 > /debug/sched_features ; mkdir /cgroup/ponies; echo $$ > /cgroup/ponies/tasks ; numactl -N 0 ./schbench -m 2 -t 10 -s 10000 -c 15000 -r 30 ; echo $$ > /cgroup/tasks ; rmdir /cgroup/ponies
Latency percentiles (usec)
50.0000th: 14
75.0000th: 19
90.0000th: 24
95.0000th: 27
*99.0000th: 3060
99.5000th: 7848
99.9000th: 15024
min=0, max=15041


root@ivb-ep:~/bench/schbench# echo 0 > /sys/module/fair/parameters/prop_type
root@ivb-ep:~/bench/schbench# echo FUDGE > /debug/sched_features; echo FUDGE2 > /debug/sched_features ; mkdir /cgroup/ponies; echo $$ > /cgroup/ponies/tasks ; numactl -N 0 ./schbench -m 2 -t 10 -s 10000 -c 15000 -r 30 ; echo $$ > /cgroup/tasks ; rmdir /cgroup/ponies
Latency percentiles (usec)
50.0000th: 14
75.0000th: 19
90.0000th: 24
95.0000th: 27
*99.0000th: 52
99.5000th: 4712
99.9000th: 14640
min=0, max=15033


Just FUDGE2 on its own seems to be the best on my system and is a change
that makes sense (and something Paul recently pointed out as well).

The implementation isn't particularly pretty or fast, but should
illustrate the idea.

Poking at the whole update_tg_cfs_load() thing only makes it worse after
that. And while I agree that that code is mind bending; it seems to work
OK-ish.

Tejun, Vincent, could you guys have a poke?

The thing is that if we assume se->avg.load_avg is correct, we should
already compute a correct cfs_rq->runnable_load_avg, we do all that
propagation right.

But because se->avg.load_avg is stuck in the 'past' because it's sum is
based on all its old weight, things don't quite work out. If we otoh
treat it as a runnable_sum and scale with weight, it seems to work out
fine.

Arguably sys_nice and all related crud should do the same, but nobody
really uses nice at any frequency, whereas we constantly change the
weight of our group entities.

---
kernel/sched/fair.c | 184 +++++++++++++++++++++++++++++++-----------------
kernel/sched/features.h | 2 +
2 files changed, 122 insertions(+), 64 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cd6c3f9..d6a33e6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -32,6 +32,7 @@
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
+#include <linux/moduleparam.h>

#include <trace/events/sched.h>

@@ -2632,16 +2633,39 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)

#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+
+enum shares_type {
+ shares_runnable,
+ shares_avg,
+ shares_weight,
+};
+
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, enum shares_type shares_type)
{
- long tg_weight, load, shares;
+ long tg_weight, tg_shares, load, shares;
+ struct task_group *tg = cfs_rq->tg;

- /*
- * This really should be: cfs_rq->avg.load_avg, but instead we use
- * cfs_rq->load.weight, which is its upper bound. This helps ramp up
- * the shares for small weight interactive tasks.
- */
- load = scale_load_down(cfs_rq->load.weight);
+ tg_shares = READ_ONCE(tg->shares);
+
+ switch (shares_type) {
+ case shares_runnable:
+ load = cfs_rq->runnable_load_avg;
+ break;
+
+ default:
+ case shares_avg:
+ load = cfs_rq->avg.load_avg;
+ break;
+
+ case shares_weight:
+ /*
+ * This really should be: cfs_rq->avg.load_avg, but instead we
+ * use cfs_rq->load.weight, which is its upper bound. This
+ * helps ramp up the shares for small weight interactive tasks.
+ */
+ load = scale_load_down(cfs_rq->load.weight);
+ break;
+ }

tg_weight = atomic_long_read(&tg->load_avg);

@@ -2665,23 +2689,33 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
* case no task is runnable on a CPU MIN_SHARES=2 should be returned
* instead of 0.
*/
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- if (shares > tg->shares)
- shares = tg->shares;
-
- return shares;
-}
-# else /* CONFIG_SMP */
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
- return tg->shares;
+ return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
# endif /* CONFIG_SMP */

+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(*ptr) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ res = var - val; \
+ if (res > var) \
+ res = 0; \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
+ if (se->load.weight == weight)
+ return;
+
if (se->on_rq) {
/* commit outstanding execution time */
if (cfs_rq->curr == se)
@@ -2689,10 +2723,40 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
account_entity_dequeue(cfs_rq, se);
}

+ if (sched_feat(FUDGE2)) {
+ unsigned long new_weight = max(scale_load_down(weight), 1UL);
+ unsigned long old_weight = max(scale_load_down(se->load.weight), 1UL);
+
+ sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+ sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+
+ if (se->on_rq) {
+ sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg);
+ sub_positive(&cfs_rq->runnable_load_sum, se->avg.load_sum);
+ }
+
+ se->avg.load_avg *= new_weight;
+ se->avg.load_sum *= new_weight;
+
+ se->avg.load_avg /= old_weight;
+ se->avg.load_sum /= old_weight;
+ }
+
update_load_set(&se->load, weight);

- if (se->on_rq)
+ if (se->on_rq) {
account_entity_enqueue(cfs_rq, se);
+ }
+
+ if (sched_feat(FUDGE2)) {
+ cfs_rq->avg.load_avg += se->avg.load_avg;
+ cfs_rq->avg.load_sum += se->avg.load_sum;
+
+ if (se->on_rq) {
+ cfs_rq->runnable_load_avg += se->avg.load_avg;
+ cfs_rq->runnable_load_sum += se->avg.load_sum;
+ }
+ }
}

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
@@ -2700,7 +2764,6 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
static void update_cfs_shares(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = group_cfs_rq(se);
- struct task_group *tg;
long shares;

if (!cfs_rq)
@@ -2709,13 +2772,14 @@ static void update_cfs_shares(struct sched_entity *se)
if (throttled_hierarchy(cfs_rq))
return;

- tg = cfs_rq->tg;
-
#ifndef CONFIG_SMP
- if (likely(se->load.weight == tg->shares))
+ shares = READ_ONCE(cfs_rq->tg->shares);
+
+ if (likely(se->load.weight == shares))
return;
+#else
+ shares = calc_cfs_shares(cfs_rq, shares_weight);
#endif
- shares = calc_cfs_shares(cfs_rq, tg);

reweight_entity(cfs_rq_of(se), se, shares);
}
@@ -3070,42 +3134,51 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
}

+static int prop_type = shares_avg;
+
+module_param(prop_type, int, 0644);
+
/* Take into account change of load of a child task group */
static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- long delta, load = gcfs_rq->avg.load_avg;
+ long delta, load;

/*
* If the load of group cfs_rq is null, the load of the
* sched_entity will also be null so we can skip the formula
*/
- if (load) {
- long tg_load;
+ if (!sched_feat(FUDGE)) {
+ load = gcfs_rq->avg.load_avg;
+ if (load) {
+ long tg_load;

- /* Get tg's load and ensure tg_load > 0 */
- tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+ /* Get tg's load and ensure tg_load > 0 */
+ tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;

- /* Ensure tg_load >= load and updated with current load*/
- tg_load -= gcfs_rq->tg_load_avg_contrib;
- tg_load += load;
+ /* Ensure tg_load >= load and updated with current load*/
+ tg_load -= gcfs_rq->tg_load_avg_contrib;
+ tg_load += load;

- /*
- * We need to compute a correction term in the case that the
- * task group is consuming more CPU than a task of equal
- * weight. A task with a weight equals to tg->shares will have
- * a load less or equal to scale_load_down(tg->shares).
- * Similarly, the sched_entities that represent the task group
- * at parent level, can't have a load higher than
- * scale_load_down(tg->shares). And the Sum of sched_entities'
- * load must be <= scale_load_down(tg->shares).
- */
- if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
- /* scale gcfs_rq's load into tg's shares*/
- load *= scale_load_down(gcfs_rq->tg->shares);
- load /= tg_load;
+ /*
+ * We need to compute a correction term in the case that the
+ * task group is consuming more CPU than a task of equal
+ * weight. A task with a weight equals to tg->shares will have
+ * a load less or equal to scale_load_down(tg->shares).
+ * Similarly, the sched_entities that represent the task group
+ * at parent level, can't have a load higher than
+ * scale_load_down(tg->shares). And the Sum of sched_entities'
+ * load must be <= scale_load_down(tg->shares).
+ */
+ if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
+ /* scale gcfs_rq's load into tg's shares*/
+ load *= scale_load_down(gcfs_rq->tg->shares);
+ load /= tg_load;
+ }
}
+ } else {
+ load = calc_cfs_shares(gcfs_rq, prop_type);
}

delta = load - se->avg.load_avg;
@@ -3236,23 +3309,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
}
}

-/*
- * Unsigned subtract and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define sub_positive(_ptr, _val) do { \
- typeof(_ptr) ptr = (_ptr); \
- typeof(*ptr) val = (_val); \
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
- res = var - val; \
- if (res > var) \
- res = 0; \
- WRITE_ONCE(*ptr, res); \
-} while (0)
-
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_task()
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index dc4d148..4c517b4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -80,3 +80,5 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)

+SCHED_FEAT(FUDGE, true)
+SCHED_FEAT(FUDGE2, true)