Fwd: Possible race in rt_mutex_adjust_prio_chain
From: Henry Wu
Date: Thu Jul 06 2023 - 22:09:22 EST
I forget to CC linux-kernel, sorry for duplicate mail.
Hi, Peter.
Peter Zijlstra <peterz@xxxxxxxxxxxxx> 于2023年7月6日周四 20:01写道:
>
> On Thu, Jul 06, 2023 at 02:08:20PM +0800, Henry Wu wrote:
> > Hi,
> >
> > I found that it's not safe to change waiter->prio after waiter
> > dequeued from mutex's waiter rbtree because it's still on owner's
> > pi_waiters rbtree. From my analysis, waiters on pi_waiters rbtree
> > should be protected by pi_lock of task which have pi_waiters and
> > corresponding rt_mutex's wait_lock, but pi_waiters is shared by many
> > locks owned by this task, so actually we serialize changes on
> > pi_waiters only by pi_lock.
> >
> > `rt_mutex_adjust_prio_chain' changes key of nodes of pi_waiters rbtree
> > without pi_lock and pi_waiters rbtree's invariant is violated. Maybe
> > we are enqueuing waiter on other cpu and pi_waiters rbtree will be
> > corrupted.
>
> Are you talking about [7];
>
> Where we do waiter_update_prio() while only
> holding [L] rtmutex->wait_lock.
>
> VS
>
> rt_mutex_adjust_prio() / task_top_pi_waiter() that accesses ->pi_waiters
> while holding [P] task->pi_lock.
>
> ?
>
> I'll go stare at that in more detail -- but I wanted to verify that's
> what you're talking about.
>
I refered step 7. I checked every call site of rt_mutex_adjust_prio()
and all of them are protected by the right pi_lock.
Imagine a scenario that we have two rt_mutex (M1, M2) and three
threads (A, B, C). Both rt_mutex are owned by thread A. B is blocked
when acquiring M1 and C is blocked when acquiring M2. We use
sched_setattr to change priority of B and C.
CPU0 CPU1
......... ..........
rt_mutex_adjust_prio_chain(C)
rt_mutex_adjust_prio_chain(B)
......
[7] update waiter->prio
(Now A's pi_waiters rbtree is
corrupted temporarily)
...... [11] enqueue
operation may select
insert position
according to corrupted
waiter node CPU0 created.
[11] Even though we fixed
corrupted waiter now, we
are not sure about pi_waiters's
sanity because other cpu may
create new invariant violation
based on our violation.
> > I attach a source file which can trigger this violation. I tested it
> > on Ubuntu 20.04 LTS with 5.4 kernel.
>
> Well, that's a horribly old kernel :-( Please double check on v6.4 and
> consult that code for the discussion above -- I'm really not too
> interested in debugging something ancient.
I revised my code to make it work on 6.4.2 and fixed one logic error.
I tested it on Fedora 38 with kernel 6.4.2-858.vanilla.fc38.x86_64.
You can find the new code in attachment.
$ sudo ./a.out
.........................
prio: -21
prio: -21
prio: -21
prio: -21
prio: -21
prio: -21
prio: -20
prio: -20
prio: -20
prio: -20
prio: -20
prio: -20
prio: -20
prio: -20
prio: -20
prio: -20
prio: -20
PID LWP TTY TIME CMD
4564 4564 pts/0 00:00:01 a.out
4564 4565 pts/0 00:00:00 waiter-0
4564 4566 pts/0 00:00:00 waiter-1
4564 4567 pts/0 00:00:00 waiter-2
4564 4568 pts/0 00:00:00 waiter-3
4564 4569 pts/0 00:00:00 waiter-4
4564 4570 pts/0 00:00:00 waiter-5
4564 4571 pts/0 00:00:00 waiter-6
4564 4572 pts/0 00:00:00 waiter-7
4564 4573 pts/0 00:00:00 waiter-8
4564 4574 pts/0 00:00:00 waiter-9
4564 4575 pts/0 00:00:00 waiter-10
4564 4576 pts/0 00:00:00 waiter-11
4564 4577 pts/0 00:00:00 waiter-12
4564 4578 pts/0 00:00:00 waiter-13
4564 4579 pts/0 00:00:00 waiter-14
4564 4580 pts/0 00:00:00 waiter-15
4564 4581 pts/0 00:00:00 waiter-16
4564 4582 pts/0 00:00:00 waiter-17
4564 4583 pts/0 00:00:00 waiter-18
4564 4584 pts/0 00:00:00 waiter-19
4564 4585 pts/0 00:00:00 changer-0
4564 4586 pts/0 00:00:00 changer-1
4564 4587 pts/0 00:00:00 changer-2
4564 4588 pts/0 00:00:00 changer-3
4564 4589 pts/0 00:00:00 changer-4
4564 4590 pts/0 00:00:00 changer-5
4564 4591 pts/0 00:00:00 changer-6
4564 4592 pts/0 00:00:00 changer-7
4564 4593 pts/0 00:00:00 changer-8
4564 4594 pts/0 00:00:00 changer-9
4564 4595 pts/0 00:00:00 changer-10
4564 4596 pts/0 00:00:00 changer-11
4564 4597 pts/0 00:00:00 changer-12
4564 4598 pts/0 00:00:00 changer-13
4564 4599 pts/0 00:00:00 changer-14
4564 4600 pts/0 00:00:00 changer-15
4564 4601 pts/0 00:00:00 changer-16
4564 4602 pts/0 00:00:00 changer-17
4564 4603 pts/0 00:00:00 changer-18
4564 4604 pts/0 00:00:00 changer-19
found race, hang...
$ sudo crash --no_module
.....................
crash> task -R prio,normal_prio,rt_priority,pi_waiters 4564
PID: 4564 TASK: ffff9b7c8480a8c0 CPU: 3 COMMAND: "a.out"
prio = 80,
normal_prio = 120,
rt_priority = 0,
pi_waiters = {
rb_root = {
rb_node = 0xffffb5bad2ddfcf8
},
rb_leftmost = 0xffffb5bad2da7d98
},
crash> print (struct rb_node *)0xffffb5bad2ddfcf8
$1 = (struct rb_node *) 0xffffb5bad2ddfcf8
crash> print *(struct rt_mutex_waiter *)((void *)$ - 24)
$2 = {
tree_entry = {
__rb_parent_color = 1,
rb_right = 0x0,
rb_left = 0x0
},
pi_tree_entry = {
__rb_parent_color = 1,
rb_right = 0xffffb5bad2df7d28,
rb_left = 0xffffb5bad2dafd68
},
task = 0xffff9b7c80388000,
lock = 0xffff9b7caa2cceb0,
wake_state = 3,
prio = 89,
deadline = 0,
ww_ctx = 0x0
}
crash> print $1->rb_left
$3 = (struct rb_node *) 0xffffb5bad2dafd68
crash> print *(struct rt_mutex_waiter *)((void *)$ - 24)
$4 = {
tree_entry = {
__rb_parent_color = 1,
rb_right = 0x0,
rb_left = 0x0
},
pi_tree_entry = {
__rb_parent_color = 18446662412739149049,
rb_right = 0xffffb5bad2defdb8,
rb_left = 0xffffb5bad2dbfd30
},
task = 0xffff9b7d2bca28c0,
lock = 0xffff9b7d004a2970,
wake_state = 3,
prio = 83,
deadline = 0,
ww_ctx = 0x0
}
crash> print $1->rb_left->rb_left
$5 = (struct rb_node *) 0xffffb5bad2dbfd30
crash> print *(struct rt_mutex_waiter *)((void *)$ - 24)
$6 = {
tree_entry = {
__rb_parent_color = 1,
rb_right = 0x0,
rb_left = 0x0
},
pi_tree_entry = {
__rb_parent_color = 18446662412738952552,
rb_right = 0xffffb5bad2d87d18,
rb_left = 0xffffb5bad2da7d98
},
task = 0xffff9b7cfd55a8c0,
lock = 0xffff9b7caa2cc6d0,
wake_state = 3,
prio = 79,
deadline = 0,
ww_ctx = 0x0
}
crash> print $1->rb_left->rb_left->rb_left
$7 = (struct rb_node *) 0xffffb5bad2da7d98
crash> print *(struct rt_mutex_waiter *)((void *)$ - 24)
$8 = {
tree_entry = {
__rb_parent_color = 1,
rb_right = 0x0,
rb_left = 0x0
},
pi_tree_entry = {
__rb_parent_color = 18446662412739018033,
rb_right = 0x0,
rb_left = 0x0
},
task = 0xffff9b7c80bd0000,
lock = 0xffff9b7caa2ccb50,
wake_state = 3,
prio = 80,
deadline = 0,
ww_ctx = 0x0
}
crash>
Key order invariant of pi_waiters had been violated by the last two
waiters above.
Thanks.
Henry
#define _GNU_SOURCE
#include <assert.h>
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <sched.h>
#include <errno.h>
#include <sys/syscall.h>
#define PCOUNT 20
#define EXPECTED_PRIO -21
// Copied from related header.
struct sched_attr {
uint32_t size;
uint32_t sched_policy;
uint64_t sched_flags;
/* SCHED_NORMAL, SCHED_BATCH */
int32_t sched_nice;
/* SCHED_FIFO, SCHED_RR */
uint32_t sched_priority;
/* SCHED_DEADLINE */
uint64_t sched_runtime;
uint64_t sched_deadline;
uint64_t sched_period;
/* Utilization hints */
uint32_t sched_util_min;
uint32_t sched_util_max;
};
static pthread_mutex_t mutexes[PCOUNT];
static void init_mutex(pthread_mutex_t *mutex) {
pthread_mutexattr_t attr;
int ret;
ret = pthread_mutexattr_init(&attr);
assert(!ret);
ret = pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
assert(!ret);
ret = pthread_mutex_init(mutex, &attr);
assert(!ret);
pthread_mutexattr_destroy(&attr);
}
enum thread_type {
WAITER,
PRIO_CHANGER,
};
static struct thread_spec {
enum thread_type type;
int name_index;
pthread_barrier_t *barrier;
// Used by waiter.
pthread_mutex_t *wait_mutex;
pid_t pid_save;
// Used by prio changer.
pid_t *waiter_pid;
uint32_t prio;
} thread_specs[PCOUNT * 2];
static pthread_t threads[PCOUNT * 2];
static int thread_count = 0;
static int barrier_wait(pthread_barrier_t *barrier) {
if (!barrier) {
return 0;
}
int ret = pthread_barrier_wait(barrier);
assert(!ret || ret == PTHREAD_BARRIER_SERIAL_THREAD);
return ret;
}
static int sched_getattr(pid_t pid, struct sched_attr *attr, int flags) {
return syscall(SYS_sched_getattr, pid, attr, sizeof(*attr), flags);
}
static int sched_setattr(pid_t pid, struct sched_attr *attr, int flags) {
return syscall(SYS_sched_setattr, pid, attr, flags);
}
static void *prio_change_loop(struct thread_spec *spec) {
struct sched_attr attr;
int ret;
for (;;) {
barrier_wait(spec->barrier);
ret = sched_getattr(*spec->waiter_pid, &attr, 0);
if (ret < 0) {
perror("sched_getattr");
assert(0);
}
attr.sched_policy = SCHED_RR;
attr.sched_priority = spec->prio;
ret = sched_setattr(*spec->waiter_pid, &attr, 0);
if (ret < 0) {
perror("sched_setattr");
exit(1);
}
barrier_wait(spec->barrier);
}
}
static void *thread(void *arg) {
struct thread_spec *spec = arg;
char name[64];
int ret;
snprintf(name, sizeof(name), "%s-%d",
spec->type == WAITER ? "waiter" : "changer",
spec->name_index);
ret = pthread_setname_np(pthread_self(), name);
assert(!ret);
switch (spec->type) {
case WAITER:
spec->pid_save = gettid();
barrier_wait(spec->barrier);
ret = pthread_mutex_lock(spec->wait_mutex);
assert(!ret);
break;
case PRIO_CHANGER:
prio_change_loop(spec);
break;
default:
assert(0);
break;
}
return NULL;
}
static void create_thread(struct thread_spec *spec) {
int ret;
ret = pthread_create(&threads[thread_count++], NULL, thread, spec);
assert(!ret);
}
static int print_prio() {
FILE *file = fopen("/proc/self/stat", "r");
assert(file);
char comm[64];
int tmp, prio, ret;
ret = fscanf(file, "%d %s %c %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d",
&tmp, comm, comm, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp,
&tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &prio);
assert(ret == 18);
printf("prio: %d\n", prio);
fclose(file);
return prio;
}
static void print_threads(pid_t process_pid) {
char cmd[128];
snprintf(cmd, sizeof(cmd), "ps -L -p %d", process_pid);
system(cmd);
}
int main(void) {
pid_t pid;
int ret;
pid = getpid();
printf("pid: %d\n", pid);
for (int i = 0; i < sizeof(mutexes) / sizeof(mutexes[0]); ++i) {
init_mutex(&mutexes[i]);
ret = pthread_mutex_lock(&mutexes[i]);
assert(!ret);
}
pthread_barrier_t barrier;
ret = pthread_barrier_init(&barrier, NULL, PCOUNT + 1);
assert(!ret);
for (int i = 0; i < PCOUNT; ++i) {
struct thread_spec *spec = &thread_specs[i];
spec->type = WAITER;
spec->name_index = i;
spec->wait_mutex = &mutexes[i];
spec->barrier = &barrier;
create_thread(spec);
}
// Wait for filling of pid_save.
barrier_wait(&barrier);
for (int i = 0; i < PCOUNT; ++i) {
struct thread_spec *spec = &thread_specs[i + PCOUNT];
spec->type = PRIO_CHANGER;
spec->name_index = i;
spec->prio = 99;
spec->barrier = &barrier;
spec->waiter_pid = &thread_specs[i].pid_save;
create_thread(spec);
}
print_prio();
print_threads(pid);
srandom(time(NULL));
int iter = 0;
for (;;) {
++iter;
for (int i = 0; i < PCOUNT; ++i) {
thread_specs[i + PCOUNT].prio = (iter % 2) ? i + 1 : PCOUNT - i;
}
for (int i = 0; i < PCOUNT; ++i) {
int pos = random() % PCOUNT;
int tmp = thread_specs[pos + PCOUNT].prio;
thread_specs[pos + PCOUNT].prio = thread_specs[PCOUNT].prio;
thread_specs[PCOUNT].prio = tmp;
}
barrier_wait(&barrier);
barrier_wait(&barrier);
//system("echo iteration > /sys/kernel/debug/tracing/trace_marker");
for (int try = 0; ; ++try) {
int prio = print_prio();
if (prio == EXPECTED_PRIO) {
// Try a new iteration.
break;
}
if (try >= 10) {
print_threads(pid);
printf("found race, hang...\n");
while (1) {
sleep(3600);
}
}
}
}
for (int i = 0; i < thread_count; ++i) {
pthread_join(threads[i], NULL);
}
return 0;
}