[PATCH v3 5/6] sched/deadline/rtmutex: Fix unprotected PI access in enqueue_task_dl()

From: Xunlei Pang
Date: Thu Apr 14 2016 - 07:37:54 EST


We access @pi_task's data without any lock in enqueue_task_dl(), though
checked "dl_prio(pi_task->normal_prio)" condition, that's not enough.

"dl_period" and "dl_runtime" of "pi_task->dl" can change. For example,
if it changes to !deadline class, dl_runtime will be cleared to zero,
then we will hit an endless loop in replenish_dl_entity() below:
while (dl_se->runtime <= 0) {
dl_se->deadline += pi_se->dl_period;
dl_se->runtime += pi_se->dl_runtime;
}

or hit "BUG_ON(pi_se->dl_runtime <= 0)" earlier.

That's because without any lock of top waiter, there is no guarantee.

In order to solve it, we add some members in "rt_mutex_waiter" and use
this structure instead of task_struct as the one to be accessed by
enqueue_task_dl(), specifically added:

struct rt_mutex_waiter {
... ...

int prio;
+ /* Updated under waiter's pi_lock and rt_mutex lock */
+ u64 dl_runtime, dl_period;
+ /*
+ * Copied directly from above.
+ * Updated under owner's pi_lock, rq lock, and rt_mutex lock.
+ */
+ u64 dl_runtime_copy, dl_period_copy;
};

We must update "dl_runtime_copy" and "dl_period_copy" under rt_mutex
lock, because they are copied from rt_mutex_waiter's "dl_runtime" and
"dl_period" which are protected by the same rt_mutex lock. We update
the copy in rt_mutex_update_copy() introduced perviously, as it is
called by rt_mutex_setprio() which held owner's pi_lock, rq lock, and
rt_mutex lock.

"dl_runtime_copy" and "dl_period_copy" are updated under owner's pi_lock,
rq lock, and rt_mutex lock, plus the waiter was dependably blocked on
rtmutex, thus they can be safely accessed by enqueue_task_dl() which
held rq lock.

Note that, now we return a rt_mutex_waiter to enqueue_task_dl(), we add
a new "struct sched_dl_entity_fake" to fake as a real sched_dl_entity,
this is ok as long as we keep the "dl_runtime" and "dl_period" in it
the same order as that in sched_dl_entity. Also adjust the location of
"dl_period" in sched_dl_entity to make sched_dl_entity_fake smaller.

Signed-off-by: Xunlei Pang <xlpang@xxxxxxxxxx>
---
include/linux/rtmutex.h | 7 +++++++
include/linux/sched.h | 2 +-
include/linux/sched/deadline.h | 20 ++++++++++++++++++++
kernel/locking/rtmutex.c | 23 +++++++++++++++++++++++
kernel/sched/deadline.c | 10 +++++++---
5 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index f9bf40a..56e2aaf 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -58,6 +58,13 @@ struct rt_mutex_waiter {
struct rt_mutex *deadlock_lock;
#endif
int prio;
+ /* Updated under waiter's pi_lock and rt_mutex lock */
+ u64 dl_runtime, dl_period;
+ /*
+ * Copied directly from above.
+ * Updated under owner's pi_lock, rq lock, and rt_mutex lock.
+ */
+ u64 dl_runtime_copy, dl_period_copy;
};

struct hrtimer_sleeper;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8ad3522..960465c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1323,8 +1323,8 @@ struct sched_dl_entity {
* the next sched_setattr().
*/
u64 dl_runtime; /* maximum runtime for each instance */
- u64 dl_deadline; /* relative deadline of each instance */
u64 dl_period; /* separation of two instances (period) */
+ u64 dl_deadline; /* relative deadline of each instance */
u64 dl_bw; /* dl_runtime / dl_deadline */

/*
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index e8304d4..ca5bae5 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -2,6 +2,16 @@
#define _SCHED_DEADLINE_H

/*
+ * Used by enqueue_task_dl() for PI cases to disguise sched_dl_entity,
+ * thus must be the same order as the counterparts in sched_dl_entity.
+ */
+struct sched_dl_entity_fake {
+ struct rb_node rb_node;
+ u64 dl_runtime;
+ u64 dl_period;
+};
+
+/*
* SCHED_DEADLINE tasks has negative priorities, reflecting
* the fact that any of them has higher prio than RT and
* NORMAL/BATCH tasks.
@@ -28,4 +38,14 @@ static inline bool dl_time_before(u64 a, u64 b)

extern void rt_mutex_update_copy(struct task_struct *p);

+#ifdef CONFIG_RT_MUTEXES
+extern struct rt_mutex_waiter *rt_mutex_get_top_waiter(struct task_struct *p);
+#else
+static inline
+struct rt_mutex_waiter *rt_mutex_get_top_waiter(struct task_struct *p)
+{
+ return NULL;
+}
+#endif
+
#endif /* _SCHED_DEADLINE_H */
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 00c6560..4d14eee 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -280,6 +280,15 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
struct rt_mutex_waiter, pi_tree_entry)->task;
}

+struct rt_mutex_waiter *rt_mutex_get_top_waiter(struct task_struct *task)
+{
+ if (!task->pi_waiters_leftmost_copy)
+ return NULL;
+
+ return rb_entry(task->pi_waiters_leftmost_copy,
+ struct rt_mutex_waiter, pi_tree_entry);
+}
+
/*
* Called by sched_setscheduler() to get the priority which will be
* effective after the change.
@@ -299,7 +308,17 @@ int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
*/
void rt_mutex_update_copy(struct task_struct *p)
{
+ struct rt_mutex_waiter *top_waiter;
+
+ /* We must always update it, even if NULL */
p->pi_waiters_leftmost_copy = p->pi_waiters_leftmost;
+
+ if (!task_has_pi_waiters(p))
+ return;
+
+ top_waiter = task_top_pi_waiter(p);
+ top_waiter->dl_runtime_copy = top_waiter->dl_runtime;
+ top_waiter->dl_period_copy = top_waiter->dl_period;
}

/*
@@ -632,6 +651,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/* [7] Requeue the waiter in the lock waiter tree. */
rt_mutex_dequeue(lock, waiter);
waiter->prio = task->prio;
+ waiter->dl_runtime = dl_policy(task->policy) ? task->dl.dl_runtime : 0;
+ waiter->dl_period = dl_policy(task->policy) ? task->dl.dl_period : 0;
rt_mutex_enqueue(lock, waiter);

/* [8] Release the task */
@@ -902,6 +923,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
waiter->task = task;
waiter->lock = lock;
waiter->prio = task->prio;
+ waiter->dl_runtime = dl_policy(task->policy) ? task->dl.dl_runtime : 0;
+ waiter->dl_period = dl_policy(task->policy) ? task->dl.dl_period : 0;

/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index affd97e..224aa64 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -932,8 +932,9 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)

static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
+ struct rt_mutex_waiter *top_waiter = rt_mutex_get_top_waiter(p);
struct sched_dl_entity *pi_se = &p->dl;
+ struct sched_dl_entity_fake pi_se_fake;

/*
* Use the scheduling parameters of the top pi-waiter
@@ -941,8 +942,11 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* smaller than our one... OTW we keep our runtime and
* deadline.
*/
- if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
- pi_se = &pi_task->dl;
+ if (top_waiter && p->dl.dl_boosted && top_waiter->dl_runtime_copy) {
+ BUG_ON(top_waiter->dl_period_copy == 0);
+ pi_se_fake.dl_runtime = top_waiter->dl_runtime_copy;
+ pi_se_fake.dl_period = top_waiter->dl_period_copy;
+ pi_se = (struct sched_dl_entity *)&pi_se_fake;
} else if (!dl_prio(p->normal_prio)) {
/*
* Special case in which we have a !SCHED_DEADLINE task
--
1.8.3.1