Re: [PATCH -next] sched/cputime: Fix the time backward issue about /proc/stat

From: zhengzucheng
Date: Thu Sep 29 2022 - 22:44:07 EST



On 2022/9/28 20:11, Frederic Weisbecker wrote:
On Wed, Sep 28, 2022 at 10:12:33AM +0200, Peter Zijlstra wrote:
On Wed, Sep 28, 2022 at 11:34:02AM +0800, Zucheng Zheng wrote:
From: Zheng Zucheng <zhengzucheng@xxxxxxxxxx>

The cputime of cpuN read from /proc/stat can go backwards between two reads.
For example, the "user" field of cpu1 drops from 319 on a first read to 318
on a later read:
first:
cat /proc/stat | grep cpu1
cpu1 319 0 496 41665 0 0 0 0 0 0
again:
cat /proc/stat | grep cpu1
cpu1 318 0 497 41674 0 0 0 0 0 0

The values read from /proc/stat should be monotonically increasing; otherwise
userspace may compute incorrect CPU usage.
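
For reference, a minimal userspace sketch (not part of the patch; the file
name, polling interval and iteration count are arbitrary) that polls the
"user" field of cpu1 and reports any backward step:

/* monotonic_check.c: hypothetical reproducer, not part of the patch */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned long long prev = 0;

	for (int i = 0; i < 100000; i++) {
		FILE *f = fopen("/proc/stat", "r");
		char line[256];
		unsigned long long user;

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f)) {
			/* "cpu1" is listed before "cpu10".."cpu19", so the first match is cpu1 */
			if (sscanf(line, "cpu1 %llu", &user) != 1)
				continue;
			if (user < prev)
				printf("user went backwards: %llu -> %llu\n",
				       prev, user);
			prev = user;
			break;
		}
		fclose(f);
		usleep(10000);	/* sample every 10ms */
	}
	return 0;
}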

The root cause is that kcpustat_cpu_fetch_vtime() adds the current task's
not-yet-accounted vtime->utime + delta to the on-stack cpustat snapshot at
read time. If the CPU switches to another task between two reads, the amount
added on the second read may be smaller than the amount added on the first.

CPU0                                                  CPU1
First:
show_stat()
  ->kcpustat_cpu_fetch()
    ->kcpustat_cpu_fetch_vtime()
      ->cpustat[CPUTIME_USER] =
          kcpustat_cpu(cpu) + vtime->utime + delta    (rq->curr is task A)
                                                      A switches to B; A's
                                                      vtime->utime is less
                                                      than one tick
Then:
show_stat()
  ->kcpustat_cpu_fetch()
    ->kcpustat_cpu_fetch_vtime()
      ->cpustat[CPUTIME_USER] =
          kcpustat_cpu(cpu) + vtime->utime + delta    (rq->curr is task B)
You're still not explaining where the time gets lost. And the patch is a
horrible band-aid.

What I think you're saying, after staring at this for a while, is that:

vtime_task_switch_generic()
  __vtime_account_kernel(prev, vtime)
    vtime_account_{guest,system}(tsk, vtime)
      vtime->*time += get_vtime_delta()
      if (vtime->*time >= TICK_NSEC)
        account_*_time()
          account_system_index_time()
            task_group_account_field()
              __this_cpu_add(kernel_cpustat.cpustat[index], tmp); <---- here

is not folding time into kernel_cpustat when the task vtime isn't at
least a tick's worth. And then when we switch to another task, we leak
time.
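
For context, the accumulate-and-fold step in that chain looks roughly like
this (condensed from kernel/sched/cputime.c under
CONFIG_VIRT_CPU_ACCOUNTING_GEN; irq/steal handling trimmed):

static void vtime_account_system(struct task_struct *tsk,
				 struct vtime *vtime)
{
	/* accumulate the elapsed nanoseconds in the task's vtime... */
	vtime->stime += get_vtime_delta(vtime);
	/* ...but only fold them into kernel_cpustat once a full tick accrued */
	if (vtime->stime >= TICK_NSEC) {
		account_system_time(tsk, irq_count(), vtime->stime);
		vtime->stime = 0;
	}
}

If the task is switched out while that accumulator is still below TICK_NSEC,
the remainder stays in prev->vtime and never reaches kernel_cpustat, even
though a concurrent /proc/stat reader may already have reported it via
vtime_delta().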
Looks right. Last time the patch was posted I misunderstood the issue.

There's another problem here, vtime_task_switch_generic() should use a
single call to sched_clock() to compute the old vtime_delta and set the
new vtime->starttime, otherwise there's a time hole there as well.
Right, but does it really matter? It's just a few nanosecs ignored
between two tasks switching.
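
(For illustration of the point above: a hypothetical sketch of sampling
sched_clock() once per switch; the *_at() helpers below do not exist in the
current code and are only named here to show the idea.)

void vtime_task_switch_generic(struct task_struct *prev)
{
	u64 now = sched_clock();

	/* close prev's window: delta = now - prev->vtime.starttime */
	vtime_account_switch_out_at(prev, now);
	/* open current's window at exactly the same timestamp */
	vtime_init_switch_in_at(current, now);
}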

This is all quite the maze and it really wants cleaning up, not to be made
worse.

So I think you want to do two things:

- pull kernel_cpustat updates out of task_group_account_field()
and put them into vtime_task_switch_generic() to be purely
vtime->starttime based.
So you want to force the update on every context switch? We use that TICK_NSEC
threshold before updating in order to reduce overhead.

There are also user <-> kernel transitions involved.

How about handling that from the read side? (below untested)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 78a233d43757..f0f1af337e49 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -814,9 +814,9 @@ u64 task_gtime(struct task_struct *t)
do {
seq = read_seqcount_begin(&vtime->seqcount);
- gtime = t->gtime;
+ gtime = t->gtime + vtime->gtime;
if (vtime->state == VTIME_GUEST)
- gtime += vtime->gtime + vtime_delta(vtime);
+ gtime += vtime_delta(vtime);
} while (read_seqcount_retry(&vtime->seqcount, seq));
@@ -845,8 +845,8 @@ bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
ret = false;
seq = read_seqcount_begin(&vtime->seqcount);
- *utime = t->utime;
- *stime = t->stime;
+ *utime = t->utime + vtime->utime;
+ *stime = t->stime + vtime->stime;
/* Task is sleeping or idle, nothing to add */
if (vtime->state < VTIME_SYS)
@@ -860,9 +860,9 @@ bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
* add pending nohz time to the right place.
*/
if (vtime->state == VTIME_SYS)
- *stime += vtime->stime + delta;
+ *stime += delta;
else
- *utime += vtime->utime + delta;
+ *utime += delta;
} while (read_seqcount_retry(&vtime->seqcount, seq));
return ret;
@@ -896,11 +896,22 @@ static int vtime_state_fetch(struct vtime *vtime, int cpu)
static u64 kcpustat_user_vtime(struct vtime *vtime)
{
- if (vtime->state == VTIME_USER)
- return vtime->utime + vtime_delta(vtime);
- else if (vtime->state == VTIME_GUEST)
- return vtime->gtime + vtime_delta(vtime);
- return 0;
+ u64 delta = vtime->utime + vtime->gtime;
+
+ if (vtime->state == VTIME_USER || vtime->state == VTIME_GUEST)
+ delta += vtime_delta(vtime);
+
+ return delta;
+}
+
+static u64 kcpustat_guest_vtime(struct vtime *vtime)
+{
+ u64 delta = vtime->gtime;
+
+ if (vtime->state == VTIME_GUEST)
+ delta += vtime_delta(vtime);
+
+ return delta;
}
static int kcpustat_field_vtime(u64 *cpustat,
@@ -931,8 +942,9 @@ static int kcpustat_field_vtime(u64 *cpustat,
*/
switch (usage) {
case CPUTIME_SYSTEM:
+ *val += vtime->stime;
if (state == VTIME_SYS)
- *val += vtime->stime + vtime_delta(vtime);
+ *val += vtime_delta(vtime);
break;
case CPUTIME_USER:
if (task_nice(tsk) <= 0)
@@ -943,12 +955,12 @@ static int kcpustat_field_vtime(u64 *cpustat,
*val += kcpustat_user_vtime(vtime);
break;
case CPUTIME_GUEST:
- if (state == VTIME_GUEST && task_nice(tsk) <= 0)
- *val += vtime->gtime + vtime_delta(vtime);
+ if (task_nice(tsk) <= 0)
+ *val += kcpustat_guest_vtime(vtime);
break;
case CPUTIME_GUEST_NICE:
- if (state == VTIME_GUEST && task_nice(tsk) > 0)
- *val += vtime->gtime + vtime_delta(vtime);
+ if (task_nice(tsk) > 0)
+ *val += kcpustat_guest_vtime(vtime);
break;
default:
break;
@@ -1013,6 +1025,15 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
*dst = *src;
cpustat = dst->cpustat;
+ cpustat[CPUTIME_SYSTEM] += vtime->stime;
+ if (task_nice(tsk) > 0) {
+ cpustat[CPUTIME_NICE] += vtime->utime + vtime->gtime;
+ cpustat[CPUTIME_GUEST_NICE] += vtime->gtime;
+ } else {
+ cpustat[CPUTIME_USER] += vtime->utime + vtime->gtime;
+ cpustat[CPUTIME_GUEST] += vtime->gtime;
+ }
+
/* Task is sleeping, dead or idle, nothing to add */
if (state < VTIME_SYS)
continue;
@@ -1024,20 +1045,20 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
* add pending nohz time to the right place.
*/
if (state == VTIME_SYS) {
- cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
+ cpustat[CPUTIME_SYSTEM] += delta;
} else if (state == VTIME_USER) {
if (task_nice(tsk) > 0)
- cpustat[CPUTIME_NICE] += vtime->utime + delta;
+ cpustat[CPUTIME_NICE] += delta;
else
- cpustat[CPUTIME_USER] += vtime->utime + delta;
+ cpustat[CPUTIME_USER] += delta;
"delta" has the same problem as vtime->utime: it varies with the currently
running task, so switching between tasks may still cause the reported time
to go backwards.
} else {
WARN_ON_ONCE(state != VTIME_GUEST);
if (task_nice(tsk) > 0) {
- cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
- cpustat[CPUTIME_NICE] += vtime->gtime + delta;
+ cpustat[CPUTIME_GUEST_NICE] += delta;
+ cpustat[CPUTIME_NICE] += delta;
} else {
- cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
- cpustat[CPUTIME_USER] += vtime->gtime + delta;
+ cpustat[CPUTIME_GUEST] += delta;
+ cpustat[CPUTIME_USER] += delta;
}
}
} while (read_seqcount_retry(&vtime->seqcount, seq));