[PATCH v2 1/2] sched/rtmutex/deadline: Fix a PI crash for deadline tasks

From: Xunlei Pang
Date: Wed Apr 06 2016 - 08:59:50 EST


A crash happened while I was playing with the deadline PI rtmutex.

BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
IP: [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
PGD 232a75067 PUD 230947067 PMD 0
Oops: 0000 [#1] SMP
CPU: 1 PID: 10994 Comm: a.out Not tainted

Call Trace:
[<ffffffff810cf8aa>] ? enqueue_task_dl+0x2a/0x320
[<ffffffff810b658c>] enqueue_task+0x2c/0x80
[<ffffffff810ba763>] activate_task+0x23/0x30
[<ffffffff810d0ab5>] pull_dl_task+0x1d5/0x260
[<ffffffff810d0be6>] pre_schedule_dl+0x16/0x20
[<ffffffff8164e783>] __schedule+0xd3/0x900
[<ffffffff8164efd9>] schedule+0x29/0x70
[<ffffffff8165035b>] __rt_mutex_slowlock+0x4b/0xc0
[<ffffffff81650501>] rt_mutex_slowlock+0xd1/0x190
[<ffffffff810eeb33>] rt_mutex_timed_lock+0x53/0x60
[<ffffffff810ecbfc>] futex_lock_pi.isra.18+0x28c/0x390
[<ffffffff810cfa15>] ? enqueue_task_dl+0x195/0x320
[<ffffffff810d0bac>] ? prio_changed_dl+0x6c/0x90
[<ffffffff810ed8b0>] do_futex+0x190/0x5b0
[<ffffffff810edd50>] SyS_futex+0x80/0x180
[<ffffffff8165a089>] system_call_fastpath+0x16/0x1b
RIP [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30

This is because rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi()
are only protected by pi_lock when operating on pi waiters, while
rt_mutex_get_top_task() will access them with the rq lock held but
without holding pi_lock.

In order to tackle it, we introduce a new pointer "pi_top_task"
in task_struct, and update it to be the top waiter task (this waiter
is updated under pi_lock) in rt_mutex_setprio(), which runs under
both pi_lock and the rq lock. We then ensure all its accessors run
under the rq lock (or pi_lock), which safely fixes the crash.

This patch originated from Peter Zijlstra, with several
tweaks and improvements by me.

Tested-by: Xunlei Pang <xlpang@xxxxxxxxxx>
Originally-From: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Signed-off-by: Xunlei Pang <xlpang@xxxxxxxxxx>
---
Without the patch, kernel crashes in seconds, after the patch
it can survive overnight.

include/linux/init_task.h | 3 +-
include/linux/sched.h | 1 +
include/linux/sched/deadline.h | 12 +++++++
include/linux/sched/rt.h | 21 ++----------
kernel/fork.c | 1 +
kernel/futex.c | 5 ++-
kernel/locking/rtmutex.c | 71 +++++++++++++++--------------------------
kernel/locking/rtmutex_common.h | 1 +
kernel/sched/core.c | 39 +++++++++++++++-------
kernel/sched/deadline.c | 2 +-
10 files changed, 75 insertions(+), 81 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index f2cb8d4..de834f3 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -162,7 +162,8 @@ extern struct task_group root_task_group;
#ifdef CONFIG_RT_MUTEXES
# define INIT_RT_MUTEXES(tsk) \
.pi_waiters = RB_ROOT, \
- .pi_waiters_leftmost = NULL,
+ .pi_waiters_leftmost = NULL, \
+ .pi_top_task = NULL,
#else
# define INIT_RT_MUTEXES(tsk)
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c617ea1..b4d9347 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1621,6 +1621,7 @@ struct task_struct {
/* PI waiters blocked on a rt_mutex held by this task */
struct rb_root pi_waiters;
struct rb_node *pi_waiters_leftmost;
+ struct task_struct *pi_top_task;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
#endif
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 9089a2a..9f46729 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -26,4 +26,16 @@ static inline bool dl_time_before(u64 a, u64 b)
return (s64)(a - b) < 0;
}

+#ifdef CONFIG_RT_MUTEXES
+static inline struct task_struct *get_pi_top_task(struct task_struct *p)
+{
+ return p->pi_top_task;
+}
+#else
+static inline struct task_struct *get_pi_top_task(struct task_struct *p)
+{
+ return NULL;
+}
+#endif
+
#endif /* _SCHED_DEADLINE_H */
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index a30b172..4e35e94 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -16,31 +16,14 @@ static inline int rt_task(struct task_struct *p)
}

#ifdef CONFIG_RT_MUTEXES
-extern int rt_mutex_getprio(struct task_struct *p);
-extern void rt_mutex_setprio(struct task_struct *p, int prio);
-extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
-extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
+extern void rt_mutex_setprio(struct task_struct *p,
+ struct task_struct *pi_top_task);
extern void rt_mutex_adjust_pi(struct task_struct *p);
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
{
return tsk->pi_blocked_on != NULL;
}
#else
-static inline int rt_mutex_getprio(struct task_struct *p)
-{
- return p->normal_prio;
-}
-
-static inline int rt_mutex_get_effective_prio(struct task_struct *task,
- int newprio)
-{
- return newprio;
-}
-
-static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
-{
- return NULL;
-}
# define rt_mutex_adjust_pi(p) do { } while (0)
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
{
diff --git a/kernel/fork.c b/kernel/fork.c
index 45a9048..3ad84c9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1209,6 +1209,7 @@ static void rt_mutex_init_task(struct task_struct *p)
p->pi_waiters = RB_ROOT;
p->pi_waiters_leftmost = NULL;
p->pi_blocked_on = NULL;
+ p->pi_top_task = NULL;
#endif
}

diff --git a/kernel/futex.c b/kernel/futex.c
index a5d2e74..e73145b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1326,9 +1326,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
* scheduled away before the wake up can take place.
*/
spin_unlock(&hb->lock);
- wake_up_q(&wake_q);
- if (deboost)
- rt_mutex_adjust_prio(current);
+
+ rt_mutex_postunlock(&wake_q, deboost);

return 0;
}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 5e4294c..6c3a806 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -257,53 +257,18 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
}

/*
- * Calculate task priority from the waiter tree priority
- *
- * Return task->normal_prio when the waiter tree is empty or when
- * the waiter is not allowed to do priority boosting
- */
-int rt_mutex_getprio(struct task_struct *task)
-{
- if (likely(!task_has_pi_waiters(task)))
- return task->normal_prio;
-
- return min(task_top_pi_waiter(task)->prio,
- task->normal_prio);
-}
-
-struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
-{
- if (likely(!task_has_pi_waiters(task)))
- return NULL;
-
- return task_top_pi_waiter(task)->task;
-}
-
-/*
- * Called by sched_setscheduler() to get the priority which will be
- * effective after the change.
- */
-int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
-{
- if (!task_has_pi_waiters(task))
- return newprio;
-
- if (task_top_pi_waiter(task)->task->prio <= newprio)
- return task_top_pi_waiter(task)->task->prio;
- return newprio;
-}
-
-/*
* Adjust the priority of a task, after its pi_waiters got modified.
*
* This can be both boosting and unboosting. task->pi_lock must be held.
*/
static void __rt_mutex_adjust_prio(struct task_struct *task)
{
- int prio = rt_mutex_getprio(task);
+ struct task_struct *pi_top_task = task;

- if (task->prio != prio || dl_prio(prio))
- rt_mutex_setprio(task, prio);
+ if (unlikely(task_has_pi_waiters(task)))
+ pi_top_task = task_top_pi_waiter(task)->task;
+
+ rt_mutex_setprio(task, pi_top_task);
}

/*
@@ -1403,14 +1368,30 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
} else {
bool deboost = slowfn(lock, &wake_q);

- wake_up_q(&wake_q);
-
- /* Undo pi boosting if necessary: */
- if (deboost)
- rt_mutex_adjust_prio(current);
+ rt_mutex_postunlock(&wake_q, deboost);
}
}

+/*
+ * Undo pi boosting (if necessary) and wake top waiter.
+ */
+void rt_mutex_postunlock(struct wake_q_head *wake_q, bool deboost)
+{
+ /*
+ * We should deboost before waking the high-prio task such that
+ * we don't run two tasks with the 'same' state. This however
+ * can lead to prio-inversion if we would get preempted after
+ * the deboost but before waking our high-prio task, hence the
+ * preempt_disable.
+ */
+ preempt_disable();
+ if (deboost)
+ rt_mutex_adjust_prio(current);
+
+ wake_up_q(wake_q);
+ preempt_enable();
+}
+
/**
* rt_mutex_lock - lock a rt_mutex
*
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 4f5f83c..93b0924 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -111,6 +111,7 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
struct wake_q_head *wqh);
+extern void rt_mutex_postunlock(struct wake_q_head *wake_q, bool deboost);
extern void rt_mutex_adjust_prio(struct task_struct *task);

#ifdef CONFIG_DEBUG_RT_MUTEXES
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0b21e7a..a533566 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3374,7 +3374,7 @@ EXPORT_SYMBOL(default_wake_function);
/*
* rt_mutex_setprio - set the current priority of a task
* @p: task
- * @prio: prio value (kernel-internal form)
+ * @pi_top_task: top waiter, donating state
*
* This function changes the 'effective' priority of a task. It does
* not touch ->normal_prio like __setscheduler().
@@ -3382,13 +3382,21 @@ EXPORT_SYMBOL(default_wake_function);
* Used by the rt_mutex code to implement priority inheritance
* logic. Call site only calls if the priority of the task changed.
*/
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_top_task)
{
- int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
- struct rq *rq;
+ int prio, oldprio, queued, running;
+ int queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
const struct sched_class *prev_class;
+ struct rq *rq;

- BUG_ON(prio > MAX_PRIO);
+ /*
+ * For FIFO/RR we simply donate prio; for DL things are
+ * more interesting.
+ */
+ /* XXX used to be waiter->prio, not waiter->task->prio */
+ prio = min(pi_top_task->prio, p->normal_prio);
+ if (p->prio == prio && !dl_prio(prio))
+ return;

rq = __task_rq_lock(p);

@@ -3424,6 +3432,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
put_prev_task(rq, p);

+ if (pi_top_task == p)
+ pi_top_task = NULL;
+ p->pi_top_task = pi_top_task;
+
/*
* Boosting condition are:
* 1. -rt task is running and holds mutex A
@@ -3434,9 +3446,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
* running task
*/
if (dl_prio(prio)) {
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ (pi_top_task &&
+ dl_entity_preempt(&pi_top_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
queue_flag |= ENQUEUE_REPLENISH;
} else
@@ -3709,10 +3721,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
* Keep a potential priority boosting if called from
* sched_setscheduler().
*/
- if (keep_boost)
- p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
- else
- p->prio = normal_prio(p);
+ p->prio = normal_prio(p);
+ if (keep_boost && get_pi_top_task(p))
+ p->prio = min(p->prio, get_pi_top_task(p)->prio);

if (dl_prio(p->prio))
p->sched_class = &dl_sched_class;
@@ -3999,7 +4010,11 @@ change:
* the runqueue. This will be done when the task deboost
* itself.
*/
- new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+ new_effective_prio = newprio;
+ if (get_pi_top_task(p))
+ new_effective_prio =
+ min(new_effective_prio, get_pi_top_task(p)->prio);
+
if (new_effective_prio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index c7a036f..e564c88 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -928,7 +928,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)

static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
+ struct task_struct *pi_task = get_pi_top_task(p);
struct sched_dl_entity *pi_se = &p->dl;

/*
--
1.8.3.1