[PATCH] sched: fix incorrect wait time and wait count statistics

From: Joonwoo Park
Date: Sun Oct 25 2015 - 01:24:08 EST

Next message: Bendik Rønning Opstad: "Re: [PATCH RFC net-next 1/2] tcp: Add DPIFL thin stream detection mechanism"
Previous message: Linus Torvalds: "Linux 4.3-rc7"
Next in thread: Peter Zijlstra: "Re: [PATCH] sched: fix incorrect wait time and wait count statistics"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

At present the scheduler resets a task's wait start timestamp when the
task migrates to another rq. This misleads the scheduler itself into
reporting less wait time than actual, by omitting the time spent waiting
prior to migration, and a higher wait count than actual, by counting the
migration as a wait end event. Both errors can be seen via trace or
/proc/<pid>/sched with CONFIG_SCHEDSTATS=y.

Carry forward the migrating task's wait time accumulated prior to
migration, and don't count migration as a wait end event, to fix these
statistics errors.

Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: linux-kernel@xxxxxxxxxxxxxxx
Signed-off-by: Joonwoo Park <joonwoop@xxxxxxxxxxxxxx>
---
kernel/sched/core.c | 4 ++--
kernel/sched/fair.c | 41 ++++++++++++++++++++++++++++-------------
kernel/sched/sched.h | 2 ++
3 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcd214e..4f20895 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1069,7 +1069,7 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
{
lockdep_assert_held(&rq->lock);

- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_MIGRATING);
p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, new_cpu);
raw_spin_unlock(&rq->lock);
@@ -1079,7 +1079,7 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
raw_spin_lock(&rq->lock);
BUG_ON(task_cpu(p) != new_cpu);
p->on_rq = TASK_ON_RQ_QUEUED;
- enqueue_task(rq, p, 0);
+ enqueue_task(rq, p, ENQUEUE_MIGRATING);
check_preempt_curr(rq, p, 0);

return rq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9a5e60f..53ec4d4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -738,27 +738,41 @@ static void update_curr_fair(struct rq *rq)
}

static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ bool migrating)
{
- schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
+ schedstat_set(se->statistics.wait_start,
+ migrating &&
+ likely(rq_clock(rq_of(cfs_rq)) > se->statistics.wait_start) ?
+ rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start :
+ rq_clock(rq_of(cfs_rq)));
}

/*
* Task is being enqueued - update stats:
*/
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ bool migrating)
{
/*
* Are we enqueueing a waiting task? (for current tasks
* a dequeue/enqueue event is a NOP)
*/
if (se != cfs_rq->curr)
- update_stats_wait_start(cfs_rq, se);
+ update_stats_wait_start(cfs_rq, se, migrating);
}

static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ bool migrating)
{
+ if (migrating) {
+ schedstat_set(se->statistics.wait_start,
+ rq_clock(rq_of(cfs_rq)) -
+ se->statistics.wait_start);
+ return;
+ }
+
schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
@@ -774,14 +788,15 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
}

static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ bool migrating)
{
/*
* Mark the end of the wait period if dequeueing a
* waiting task:
*/
if (se != cfs_rq->curr)
- update_stats_wait_end(cfs_rq, se);
+ update_stats_wait_end(cfs_rq, se, migrating);
}

/*
@@ -2960,7 +2975,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
enqueue_sleeper(cfs_rq, se);
}

- update_stats_enqueue(cfs_rq, se);
+ update_stats_enqueue(cfs_rq, se, !!(flags & ENQUEUE_MIGRATING));
check_spread(cfs_rq, se);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
@@ -3028,7 +3043,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_curr(cfs_rq);
dequeue_entity_load_avg(cfs_rq, se);

- update_stats_dequeue(cfs_rq, se);
+ update_stats_dequeue(cfs_rq, se, !!(flags & DEQUEUE_MIGRATING));
if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
@@ -3114,7 +3129,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
- update_stats_wait_end(cfs_rq, se);
+ update_stats_wait_end(cfs_rq, se, false);
__dequeue_entity(cfs_rq, se);
update_load_avg(se, 1);
}
@@ -3212,7 +3227,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)

check_spread(cfs_rq, prev);
if (prev->on_rq) {
- update_stats_wait_start(cfs_rq, prev);
+ update_stats_wait_start(cfs_rq, prev, false);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
@@ -5656,7 +5671,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
{
lockdep_assert_held(&env->src_rq->lock);

- deactivate_task(env->src_rq, p, 0);
+ deactivate_task(env->src_rq, p, DEQUEUE_MIGRATING);
p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, env->dst_cpu);
}
@@ -5791,7 +5806,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)

BUG_ON(task_rq(p) != rq);
p->on_rq = TASK_ON_RQ_QUEUED;
- activate_task(rq, p, 0);
+ activate_task(rq, p, ENQUEUE_MIGRATING);
check_preempt_curr(rq, p, 0);
}

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6d2a119..c1ac118 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1165,8 +1165,10 @@ static const u32 prio_to_wmult[40] = {
#define ENQUEUE_WAKING 0
#endif
#define ENQUEUE_REPLENISH 8
+#define ENQUEUE_MIGRATING 16

#define DEQUEUE_SLEEP 1
+#define DEQUEUE_MIGRATING 2

#define RETRY_TASK ((void *)-1UL)

--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Bendik Rønning Opstad: "Re: [PATCH RFC net-next 1/2] tcp: Add DPIFL thin stream detection mechanism"
Previous message: Linus Torvalds: "Linux 4.3-rc7"
Next in thread: Peter Zijlstra: "Re: [PATCH] sched: fix incorrect wait time and wait count statistics"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]