[PATCH] posix-cpu-timers: Clear posix_cputimers_work in copy_process

From: Michael Pratt
Date: Mon Nov 01 2021 - 17:06:48 EST


copy_process currently copies task_struct.posix_cputimers_work as-is. If a
timer interrupt arrives while handling clone and before dup_task_struct
completes then the child task will have:

1. posix_cputimers_work.scheduled = true
2. posix_cputimers_work.work queued.

copy_process clears task_struct.task_works, so (2) will have no effect and
posix_cpu_timers_work will never run (not to mention it doesn't make sense
for two tasks to share a common linked list).

Since posix_cpu_timers_work never runs, posix_cputimers_work.scheduled is
never cleared. Since scheduled is set, future timer interrupts will skip
scheduling work, with the ultimate result that the task will never receive
timer expirations.

Together, the complete flow is:

1. Task 1 calls clone(), enters kernel.
2. Timer interrupt fires, schedules task work on Task 1.
2a. task_struct.posix_cputimers_work.scheduled = true
2b. task_struct.posix_cputimers_work.work added to
task_struct.task_works.
3. dup_task_struct copies Task 1 to Task 2.
4. copy_process clears task_struct.task_works for Task 2.
5. Future timer interrupts on Task 2 see
task_struct.posix_cputimers_work.scheduled = true and skip scheduling
work.

Fix this by explicitly clearing contents of
task_struct.posix_cputimers_work in copy_process. This was never meant to
be shared or inherited across tasks in the first place.

Signed-off-by: Michael Pratt <mpratt@xxxxxxxxxx>
Reported-by: Rhys Hiltner <rhys@xxxxxxxxx>
Fixes: 1fb497dd0030 ("posix-cpu-timers: Provide mechanisms to defer timer handling to task_work")
Cc: <stable@xxxxxxxxxxxxxxx>
---
This issue was discovered while investigating a flaky test in the Go
language standard library, https://golang.org/issue/49065. After our testing
VMs upgraded from 5.4 to 5.10 kernels, several profiling tests started
failing ~1% of the time with threads not receiving their expected profiling
signals.

Bisection of problem by Rhys blamed b6b178e38f40 ("Merge tag
'timers-core-2020-08-14' of
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip"). This merge commit
introduced the broken commit 1fb497dd0030 ("posix-cpu-timers: Provide
mechanisms to defer timer handling to task_work") and its child
0099808553ad ("x86: Select POSIX_CPU_TIMERS_TASK_WORK"), which enables the
new codepath.

The C program below also reproduces the problem. Build with `gcc repro.c
-lrt -pthread -O2`.

The program starts a CPU timer on the main thread, which then spawns child
threads that create their own CPU timers and verify that they receive timer
signals. At HEAD and 0099808553ad this program fails with ~3-15 / 20000
threads not receiving signals.

Prior to 0099808553ad and with this patch, the program reports no failures.

// SPDX-License-Identifier: GPL-2.0
#include <pthread.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

/* Per-thread count of SIGPROF deliveries; incremented by signal_handler(). */
__thread uint64_t signaled;

/* Number of threads that finished their CPU-time loop with signaled == 0. */
_Atomic int threads_bad;

/*
 * SIGPROF handler: record the delivery in the receiving thread's
 * thread-local counter. None of the arguments are inspected.
 */
void signal_handler(int signo, siginfo_t *siginfo, void *uctx)
{
	(void)signo;
	(void)siginfo;
	(void)uctx;

	signaled += 1;
}

/*
 * Return the calling thread's kernel thread ID, obtained with the raw
 * SYS_gettid system call.
 */
int gettid(void)
{
	long tid = syscall(SYS_gettid);

	return (int)tid;
}

/*
 * Create a CLOCK_THREAD_CPUTIME_ID timer that signals SIGPROF to the
 * calling thread (SIGEV_THREAD_ID with the caller's tid) and arm it with
 * a 10ms initial expiry and a 10ms interval.
 *
 * Any failure is reported with perror() and terminates the process via
 * _exit(1).
 *
 * Returns the ID of the armed timer.
 */
timer_t setup_timer(void)
{
	const long period_ns = 10*1000*1000; /* 10ms */
	struct sigevent sev = {
		.sigev_signo = SIGPROF,
		.sigev_notify = SIGEV_THREAD_ID,
		._sigev_un = {
			._tid = gettid(),
		},
	};
	struct itimerspec spec = {
		.it_interval = { .tv_nsec = period_ns },
		.it_value = { .tv_nsec = period_ns },
	};
	timer_t timerid;

	if (timer_create(CLOCK_THREAD_CPUTIME_ID, &sev, &timerid) != 0) {
		perror("timer_create");
		_exit(1);
	}

	if (timer_settime(timerid, 0, &spec, NULL) != 0) {
		perror("timer_settime");
		_exit(1);
	}

	return timerid;
}

/*
 * Return the CPU time consumed so far by the calling thread, in
 * nanoseconds, as read from CLOCK_THREAD_CPUTIME_ID.
 *
 * If clock_gettime() fails, the error is reported with perror() and the
 * process is terminated via _exit(1).
 */
uint64_t thread_cpu_ns(void)
{
	struct timespec ts;
	int ret;

	ret = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
	if (ret != 0) {
		perror("clock_gettime");
		_exit(1);
	}

	/*
	 * Widen the seconds to 64 bits before scaling. 1000*1000*1000 is an
	 * int, so with a 32-bit time_t the product would be computed in
	 * 32-bit signed arithmetic and overflow after roughly two seconds
	 * of CPU time.
	 */
	return (uint64_t)ts.tv_sec * UINT64_C(1000000000) + (uint64_t)ts.tv_nsec;
}

/*
 * Thread entry point: arm a per-thread CPU timer, consume more than 50ms
 * of thread CPU time, then verify that at least one profiling signal was
 * counted. A thread that saw none is reported on stdout and counted in
 * threads_bad. The timer is deleted before returning.
 *
 * The argument is unused; the return value is always NULL.
 */
void *thread(void *arg)
{
	timer_t timerid = setup_timer();
	uint64_t start = thread_cpu_ns();

	(void)arg;

	/* Keep spinning until more than 50ms of thread CPU time has passed. */
	while (thread_cpu_ns() - start <= 50*1000*1000) {
		/* Busy loop */
		for (volatile int i = 0; i < 100000; i++)
			;
	}

	/*
	 * With a 10ms timer period and over 50ms of CPU time consumed, a
	 * working timer must have fired by now.
	 */
	if (signaled == 0) {
		printf("Thread %d received no profiling signals!\n", gettid());
		threads_bad++;
	}

	if (timer_delete(timerid) != 0) {
		perror("timer_delete");
		_exit(1);
	}

	return NULL;
}

/*
 * Reproducer driver.
 *
 * Installs the SIGPROF handler, unblocks SIGPROF and arms a CPU timer on
 * the main thread, so that timer interrupts can land while the main
 * thread is creating threads. It then spawns 20000 threads in batches of
 * 10; each one arms its own CPU timer and checks that it receives
 * profiling signals.
 *
 * Returns 0 when every thread received a signal, 1 when any thread did
 * not or when a setup/teardown call failed.
 */
int main(void)
{
	struct sigaction sa = {
		.sa_sigaction = &signal_handler,
		.sa_flags = SA_SIGINFO | SA_RESTART,
	};
	int ret;
	sigset_t set;
	timer_t timerid;
	int bad;
	int thread_count = 0;

	ret = sigaction(SIGPROF, &sa, NULL);
	if (ret != 0) {
		perror("sigaction");
		return 1;
	}

	sigemptyset(&set);
	sigaddset(&set, SIGPROF);
	ret = sigprocmask(SIG_UNBLOCK, &set, NULL);
	if (ret != 0) {
		perror("sigprocmask");
		return 1;
	}

	timerid = setup_timer();

	while (thread_count < 20000) {
		pthread_t threads[10];

		for (int i = 0; i < 10; i++) {
			ret = pthread_create(&threads[i], NULL, &thread, NULL);
			if (ret != 0) {
				/*
				 * pthread functions return the error number
				 * and leave errno alone, so perror() would
				 * print an unrelated message here.
				 */
				fprintf(stderr, "pthread_create: %s\n",
					strerror(ret));
				return 1;
			}
			thread_count++;
		}

		/* Busy loop */
		for (volatile int i = 0; i < 100000; i++)
			;

		for (int i = 0; i < 10; i++) {
			ret = pthread_join(threads[i], NULL);
			if (ret != 0) {
				fprintf(stderr, "pthread_join: %s\n",
					strerror(ret));
				return 1;
			}
		}

		if (thread_count % 100 == 0)
			printf("%d threads\n", thread_count);
	}

	/*
	 * Read the atomic counter once so the printed report and the exit
	 * status are derived from the same value.
	 */
	bad = threads_bad;
	printf("Bad threads %d / %d = %f%%\n", bad, thread_count,
	       100*((double)bad) / ((double)thread_count));

	/* Tear down the main thread's timer, mirroring thread(). */
	ret = timer_delete(timerid);
	if (ret != 0) {
		perror("timer_delete");
		return 1;
	}

	if (bad > 0)
		return 1;
	return 0;
}

include/linux/posix-timers.h | 2 ++
kernel/fork.c | 1 +
kernel/time/posix-cpu-timers.c | 19 +++++++++++++++++--
3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 00fef0064355..5bbcd280bfd2 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -184,8 +184,10 @@ static inline void posix_cputimers_group_init(struct posix_cputimers *pct,
#endif

#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
+void clear_posix_cputimers_work(struct task_struct *p);
void posix_cputimers_init_work(void);
#else
+static inline void clear_posix_cputimers_work(struct task_struct *p) { }
static inline void posix_cputimers_init_work(void) { }
#endif

diff --git a/kernel/fork.c b/kernel/fork.c
index 38681ad44c76..b1551c074b74 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2280,6 +2280,7 @@ static __latent_entropy struct task_struct *copy_process(
p->pdeath_signal = 0;
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
+ clear_posix_cputimers_work(p);

#ifdef CONFIG_KRETPROBES
p->kretprobe_instances.first = NULL;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 643d412ac623..96b4e7810426 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1158,14 +1158,29 @@ static void posix_cpu_timers_work(struct callback_head *work)
handle_posix_cpu_timers(current);
}

+/*
+ * Clear existing posix CPU timers task work.
+ */
+void clear_posix_cputimers_work(struct task_struct *p)
+{
+ /*
+ * A copied work entry from the old task is not meaningful, clear it.
+ * N.B. init_task_work will not do this.
+ */
+ memset(&p->posix_cputimers_work.work, 0,
+ sizeof(p->posix_cputimers_work.work));
+ init_task_work(&p->posix_cputimers_work.work,
+ posix_cpu_timers_work);
+ p->posix_cputimers_work.scheduled = false;
+}
+
/*
* Initialize posix CPU timers task work in init task. Out of line to
* keep the callback static and to avoid header recursion hell.
*/
void __init posix_cputimers_init_work(void)
{
- init_task_work(&current->posix_cputimers_work.work,
- posix_cpu_timers_work);
+ clear_posix_cputimers_work(current);
}

/*
--
2.33.1.1089.g2158813163f-goog