Possible race in rt_mutex_adjust_prio_chain

From: Henry Wu
Date: Thu Jul 06 2023 - 02:09:35 EST


Hi,

I found that it's not safe to change waiter->prio after waiter
dequeued from mutex's waiter rbtree because it's still on owner's
pi_waiters rbtree. From my analysis, waiters on pi_waiters rbtree
should be protected by pi_lock of task which have pi_waiters and
corresponding rt_mutex's wait_lock, but pi_waiters is shared by many
locks owned by this task, so actually we serialize changes on
pi_waiters only by pi_lock.

`rt_mutex_adjust_prio_chain' changes key of nodes of pi_waiters rbtree
without pi_lock and pi_waiters rbtree's invariant is violated. Maybe
we are enqueuing waiter on other cpu and pi_waiters rbtree will be
corrupted.

I attach a source file which can trigger this violation. I tested it
on Ubuntu 20.04 LTS with 5.4 kernel.

$ sudo ./a.out
pid: 39949
prio: 39
PID LWP TTY TIME CMD
39949 39949 pts/1 00:00:00 a.out
39949 39950 pts/1 00:00:00 waiter-0
39949 39951 pts/1 00:00:00 waiter-1
.............
prio: 20
prio: 20
prio: 20
prio: 20
prio: 20
prio: 20
prio: 20
.............
prio: 21
prio: 21
prio: 21
prio: 21
prio: 21
prio: 21
prio: 21
prio: 21
prio: 21
prio: 21
prio: 21
PID LWP TTY TIME CMD
39949 39949 pts/1 00:00:00 a.out
39949 39950 pts/1 00:00:00 waiter-0
39949 39951 pts/1 00:00:00 waiter-1
39949 39952 pts/1 00:00:00 waiter-2
39949 39953 pts/1 00:00:00 waiter-3
39949 39954 pts/1 00:00:00 waiter-4
39949 39955 pts/1 00:00:00 waiter-5
39949 39956 pts/1 00:00:00 waiter-6
39949 39957 pts/1 00:00:00 waiter-7
39949 39958 pts/1 00:00:00 waiter-8
39949 39959 pts/1 00:00:00 waiter-9
39949 39960 pts/1 00:00:00 waiter-10
39949 39961 pts/1 00:00:00 waiter-11
39949 39962 pts/1 00:00:00 waiter-12
39949 39963 pts/1 00:00:00 waiter-13
39949 39964 pts/1 00:00:00 waiter-14
39949 39965 pts/1 00:00:00 waiter-15
39949 39966 pts/1 00:00:00 waiter-16
39949 39967 pts/1 00:00:00 waiter-17
39949 39968 pts/1 00:00:00 waiter-18
39949 39969 pts/1 00:00:00 changer-0
39949 39970 pts/1 00:00:00 changer-1
39949 39971 pts/1 00:00:00 changer-2
39949 39972 pts/1 00:00:00 changer-3
39949 39973 pts/1 00:00:00 changer-4
39949 39974 pts/1 00:00:00 changer-5
39949 39975 pts/1 00:00:00 changer-6
39949 39976 pts/1 00:00:00 changer-7
39949 39977 pts/1 00:00:00 changer-8
39949 39978 pts/1 00:00:00 changer-9
39949 39979 pts/1 00:00:00 changer-10
39949 39980 pts/1 00:00:00 changer-11
39949 39981 pts/1 00:00:00 changer-12
39949 39982 pts/1 00:00:00 changer-13
39949 39983 pts/1 00:00:00 changer-14
39949 39984 pts/1 00:00:00 changer-15
39949 39985 pts/1 00:00:00 changer-16
39949 39986 pts/1 00:00:00 changer-17
39949 39987 pts/1 00:00:00 changer-18
found race, hang...

$ crash
crash> task -R prio,normal_prio,pi_waiters 39949
PID: 39949 TASK: ffff956a5c8b2f00 CPU: 2 COMMAND: "a.out"
prio = 121,
normal_prio = 139,
pi_waiters = {
rb_root = {
rb_node = 0xffffb24941f43d40
},
rb_leftmost = 0xffffb24941f8bd40
},

crash> print (struct rb_node *)0xffffb24941f43d40
$62 = (struct rb_node *) 0xffffb24941f43d40
crash> print *(struct rt_mutex_waiter *)((void *)$ - 24)
$63 = {
tree_entry = {
__rb_parent_color = 1,
rb_right = 0x0,
rb_left = 0x0
},
pi_tree_entry = {
__rb_parent_color = 1,
rb_right = 0xffffb24941f13d40,
rb_left = 0xffffb24941f4bd40
},
task = 0xffff956a6c008000,
lock = 0xffff956920b80970,
prio = 130,
deadline = 0
}
crash> print $62->rb_left
$64 = (struct rb_node *) 0xffffb24941f4bd40
crash> print *(struct rt_mutex_waiter *)((void *)$ - 24)
$65 = {
tree_entry = {
__rb_parent_color = 1,
rb_right = 0x0,
rb_left = 0x0
},
pi_tree_entry = {
__rb_parent_color = 18446658626441723201,
rb_right = 0xffffb24941f63d40,
rb_left = 0xffffb24941f5bd40
},
task = 0xffff956a594baf00,
lock = 0xffff956920b80190,
prio = 126,
deadline = 0
}
crash> print $62->rb_left->rb_left
$66 = (struct rb_node *) 0xffffb24941f5bd40
crash> print *(struct rt_mutex_waiter *)((void *)$ - 24)
$67 = {
tree_entry = {
__rb_parent_color = 1,
rb_right = 0x0,
rb_left = 0x0
},
pi_tree_entry = {
__rb_parent_color = 18446658626441755968,
rb_right = 0xffffb24941f73d40,
rb_left = 0xffffb24941f2bd40
},
task = 0xffff956a64edaf00,
lock = 0xffff956a8d481670,
prio = 123,
deadline = 0
}
crash> print $62->rb_left->rb_left->rb_left
$68 = (struct rb_node *) 0xffffb24941f2bd40
crash> print *(struct rt_mutex_waiter *)((void *)$ - 24)
$69 = {
tree_entry = {
__rb_parent_color = 1,
rb_right = 0x0,
rb_left = 0x0
},
pi_tree_entry = {
__rb_parent_color = 18446658626441821505,
rb_right = 0xffffb24940e47d40,
rb_left = 0xffffb24941f8bd40
},
task = 0xffff956a62ae0000,
lock = 0xffff956a76c5c250,
prio = 120,
deadline = 0
}
crash> print $62->rb_left->rb_left->rb_left->rb_left
$70 = (struct rb_node *) 0xffffb24941f8bd40
crash> print *(struct rt_mutex_waiter *)((void *)$ - 24)
$71 = {
tree_entry = {
__rb_parent_color = 1,
rb_right = 0x0,
rb_left = 0x0
},
pi_tree_entry = {
__rb_parent_color = 18446658626441624896,
rb_right = 0x0,
rb_left = 0x0
},
task = 0xffff956a597dc680,
lock = 0xffff956920b802b0,
prio = 121,
deadline = 0
}
crash>

>From the last two rt_mutex_waiter we see that leftmost waiter's prio
is 121 but it's parent is 120.

Sorry for the inconvenience if I made a mistake. Thanks!

Henry
#define _GNU_SOURCE

#include <assert.h>
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <sched.h>
#include <errno.h>
#include <sys/syscall.h>

#define PCOUNT 19

// Copied from related header.
struct sched_attr {
uint32_t size;

uint32_t sched_policy;
uint64_t sched_flags;

/* SCHED_NORMAL, SCHED_BATCH */
int32_t sched_nice;

/* SCHED_FIFO, SCHED_RR */
uint32_t sched_priority;

/* SCHED_DEADLINE */
uint64_t sched_runtime;
uint64_t sched_deadline;
uint64_t sched_period;

/* Utilization hints */
uint32_t sched_util_min;
uint32_t sched_util_max;
};

static pthread_mutex_t mutexes[PCOUNT];

static void init_mutex(pthread_mutex_t *mutex) {
pthread_mutexattr_t attr;
int ret;

ret = pthread_mutexattr_init(&attr);
assert(!ret);
ret = pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
assert(!ret);

ret = pthread_mutex_init(mutex, &attr);
assert(!ret);

pthread_mutexattr_destroy(&attr);
}

enum thread_type {
WAITER,
PRIO_CHANGER,
};

static struct thread_spec {
enum thread_type type;
int name_index;
int nice;
pthread_barrier_t *barrier;

// Used by waiter.
pthread_mutex_t *wait_mutex;
pid_t pid_save;

// Used by prio changer.
pid_t *waiter_pid;
} thread_specs[PCOUNT * 2];

static pthread_t threads[PCOUNT * 2];
static int thread_count = 0;

static int barrier_wait(pthread_barrier_t *barrier) {
if (!barrier) {
return 0;
}

int ret = pthread_barrier_wait(barrier);
assert(!ret || ret == PTHREAD_BARRIER_SERIAL_THREAD);

return ret;
}

static int sched_getattr(pid_t pid, struct sched_attr *attr, int flags) {
return syscall(SYS_sched_getattr, pid, attr, sizeof(*attr), flags);
}

static int sched_setattr(pid_t pid, struct sched_attr *attr, int flags) {
return syscall(SYS_sched_setattr, pid, attr, flags);
}

static void *prio_change_loop(struct thread_spec *spec) {
struct sched_attr attr;
int ret;

for (;;) {
barrier_wait(spec->barrier);

ret = sched_getattr(*spec->waiter_pid, &attr, 0);
if (ret < 0) {
perror("sched_getattr");
assert(0);
}

attr.sched_nice = spec->nice;

ret = sched_setattr(*spec->waiter_pid, &attr, 0);
if (ret < 0) {
perror("sched_setattr");
exit(1);
}

barrier_wait(spec->barrier);
}
}

static void *thread(void *arg) {
struct thread_spec *spec = arg;
char name[64];
int ret;

snprintf(name, sizeof(name), "%s-%d",
spec->type == WAITER ? "waiter" : "changer",
spec->name_index);
ret = pthread_setname_np(pthread_self(), name);
assert(!ret);

switch (spec->type) {
case WAITER:
spec->pid_save = gettid();

barrier_wait(spec->barrier);

ret = nice(spec->nice);
assert(ret >= 0);

ret = pthread_mutex_lock(spec->wait_mutex);
assert(!ret);
break;
case PRIO_CHANGER:
prio_change_loop(spec);
break;
default:
assert(0);
break;
}

return NULL;
}

static void create_thread(struct thread_spec *spec) {
int ret;

ret = pthread_create(&threads[thread_count++], NULL, thread, spec);
assert(!ret);
}

static int print_prio() {
FILE *file = fopen("/proc/self/stat", "r");
assert(file);

char comm[64];
int tmp, prio, ret;
ret = fscanf(file, "%d %s %c %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d",
&tmp, comm, comm, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp,
&tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &prio);
assert(ret == 18);

printf("prio: %d\n", prio);

fclose(file);

return prio;
}

static void print_threads(pid_t process_pid) {
char cmd[128];
snprintf(cmd, sizeof(cmd), "ps -L -p %d", process_pid);
system(cmd);
}

int main(void) {
pid_t pid;
int ret;

pid = getpid();
printf("pid: %d\n", pid);

for (int i = 0; i < sizeof(mutexes) / sizeof(mutexes[0]); ++i) {
init_mutex(&mutexes[i]);
ret = pthread_mutex_lock(&mutexes[i]);
assert(!ret);
}

pthread_barrier_t barrier;
ret = pthread_barrier_init(&barrier, NULL, PCOUNT + 1);
assert(!ret);

for (int i = 0; i < PCOUNT; ++i) {
struct thread_spec *spec = &thread_specs[i];

spec->type = WAITER;
spec->name_index = i;
spec->nice = 18;
spec->wait_mutex = &mutexes[i];
spec->barrier = &barrier;

create_thread(spec);
}

// Wait for filling of pid_save.
barrier_wait(&barrier);

for (int i = 0; i < PCOUNT; ++i) {
struct thread_spec *spec = &thread_specs[i + 19];

spec->type = PRIO_CHANGER;
spec->name_index = i;
spec->nice = 0;
spec->barrier = &barrier;
spec->waiter_pid = &thread_specs[i].pid_save;

create_thread(spec);
}

nice(19);

print_prio();
print_threads(pid);

//sleep(60);
//printf("launch!\n");

srandom(time(NULL));
int iter = 0;
for (;;) {
++iter;
for (int i = 0; i < PCOUNT; ++i) {
thread_specs[i + 19].nice = (iter % 2) ? i : 18 - i;
}

for (int i = 0; i < PCOUNT; ++i) {
int pos = random() % PCOUNT;
int tmp = thread_specs[pos + 19].nice;
thread_specs[pos + 19].nice = thread_specs[19].nice;
thread_specs[19].nice = tmp;
}

barrier_wait(&barrier);
barrier_wait(&barrier);

//system("echo iteration > /sys/kernel/debug/tracing/trace_marker");

int prio, unexpected_times = 0;
do {
prio = print_prio();
if (prio != 20) {
++unexpected_times;
if (unexpected_times > 10) {
print_threads(pid);
printf("found race, hang...\n");
while (1) {
sleep(3600);
}
}
}
} while (prio != 20);
}

for (int i = 0; i < thread_count; ++i) {
pthread_join(threads[i], NULL);
}

return 0;
}