Re: PI futexes + lock stealing woes

From: Peter Zijlstra
Date: Thu Dec 07 2017 - 09:27:03 EST


On Thu, Dec 07, 2017 at 11:45:16AM +0100, Peter Zijlstra wrote:
> diff --git a/kernel/futex.c b/kernel/futex.c
> index 76ed5921117a..8ad5221fbd84 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -2303,14 +2303,35 @@ static void unqueue_me_pi(struct futex_q *q)
> static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
> struct task_struct *newowner)
> {
> - u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
> struct futex_pi_state *pi_state = q->pi_state;
> u32 uval, uninitialized_var(curval), newval;
> struct task_struct *oldowner;
> + u32 newtid;
> int ret;
>
> raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
>
> + if (!newowner) {
> + if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
> + ret = 0;
> + goto out_unlock;
> + }
> +
> + newowner = rt_mutex_owner(&pi_state->pi_mutex);
> + if (WARN_ON_ONCE(!newowner)) {
> + /*
> + * We just attempted a trylock; since that failed there
> + * must be an owner, right?
> + */
> + ret = -EINVAL; /* XXX: placeholder, check return paths */
> + goto out_unlock;
> + }
> +
> + /* OK we have a newowner, fixup uval */
> + }
> +
> + newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
> +
> oldowner = pi_state->owner;
> /* Owner died? */
> if (!pi_state->owner)

OK, this is broken; all of this needs to be inside the retry: label,
because we drop the locks when we take a fault, and once the locks have
been dropped the rt_mutex owner can change underneath us.

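To make that concrete, here is a condensed sketch of the existing fault
path in fixup_pi_state_owner() (trimmed for illustration, not new code):
anything derived from the rt_mutex owner has to be (re)computed after the
retry: label, because handle_fault drops both locks:

retry:
	/* (re)compute newowner/newtid here, under wait_lock */

	if (get_futex_value_locked(&uval, uaddr))
		goto handle_fault;
	...

handle_fault:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	ret = fault_in_user_writeable(uaddr);

	spin_lock(q->lock_ptr);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * The locks were dropped; the rt_mutex owner (and
	 * pi_state->owner) may have changed meanwhile, so anything
	 * sampled before the retry: label is stale by now.
	 */
	...
	goto retry;
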
> @@ -2443,6 +2464,16 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
> goto out;
> }
>
> + /*
> + * If we didn't get the lock, check if anybody stole it from us.
> + * In that case, we need to fix up the uval to point to them
> + * instead of us, otherwise bad things happen.
> + */
> + if (q->pi_state->owner == current) {
> + ret = fixup_pi_state_owner(uaddr, q, NULL);
> + goto out;
> + }

And this had me re-read the comment just above, which states that the
unlocked access of ->owner is fine because only the rt_mutex owner
changes pi_state. Except, of course, this patch just changed that.

So now we need to double-check the state after we've taken wait_lock in
fixup_pi_state_owner().
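
That is, the shape becomes something like this (sketch, condensed from
the patch below):

	/* fixup_owner(), lockless, so only a hint: */
	if (q->pi_state->owner == current)
		ret = fixup_pi_state_owner(uaddr, q, NULL);

	/* fixup_pi_state_owner(), serialized by wait_lock: */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
	if (pi_state->owner != current) {
		/* somebody else already fixed things up for us */
		ret = 0;
		goto out_unlock;
	}

If the speculative read was stale, we find out here and bail.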


The below compiles and boots, but is otherwise untested. Could you give
it a spin?

---
kernel/futex.c | 83 +++++++++++++++++++++++++++++++++--------
kernel/locking/rtmutex.c | 26 +++++++++----
kernel/locking/rtmutex_common.h | 1 +
3 files changed, 87 insertions(+), 23 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 76ed5921117a..29ac5b64e7c7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2294,21 +2294,17 @@ static void unqueue_me_pi(struct futex_q *q)
spin_unlock(q->lock_ptr);
}

-/*
- * Fixup the pi_state owner with the new owner.
- *
- * Must be called with hash bucket lock held and mm->sem held for non
- * private futexes.
- */
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *newowner)
+ struct task_struct *argowner)
{
- u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
u32 uval, uninitialized_var(curval), newval;
- struct task_struct *oldowner;
+ struct task_struct *oldowner, *newowner;
+ u32 newtid;
int ret;

+ lockdep_assert_held(q->lock_ptr);
+
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

oldowner = pi_state->owner;
@@ -2317,11 +2313,17 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
newtid |= FUTEX_OWNER_DIED;

/*
- * We are here either because we stole the rtmutex from the
- * previous highest priority waiter or we are the highest priority
- * waiter but have failed to get the rtmutex the first time.
+ * We are here because either:
+ *
+ * - we stole the lock and pi_state->owner needs updating to reflect
+ * that (@argowner == current),
*
- * We have to replace the newowner TID in the user space variable.
+ * or:
+ *
+ * - someone stole our lock and we need to fix things to point to the
+ * new owner (@argowner == NULL).
+ *
+ * Either way, we have to replace the TID in the user space variable.
* This must be atomic as we have to preserve the owner died bit here.
*
* Note: We write the user space value _before_ changing the pi_state
@@ -2334,6 +2336,42 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* in the PID check in lookup_pi_state.
*/
retry:
+ if (!argowner) {
+ if (oldowner != current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ ret = 0;
+ goto out_unlock;
+ }
+
+ if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
+ /* We got the lock after all, nothing to fix. */
+ ret = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * Since we just failed the trylock, there must be an owner.
+ */
+ newowner = rt_mutex_owner(&pi_state->pi_mutex);
+ BUG_ON(!newowner);
+ } else {
+ WARN_ON_ONCE(argowner != current);
+ if (oldowner == current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ ret = 0;
+ goto out_unlock;
+ }
+ newowner = argowner;
+ }
+
+ newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+
if (get_futex_value_locked(&uval, uaddr))
goto handle_fault;

@@ -2434,15 +2472,28 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case:
*
- * We can safely read pi_state->owner without holding wait_lock
- * because we now own the rt_mutex, only the owner will attempt
- * to change it.
+ * Speculative pi_state->owner read (we don't hold wait_lock);
+ * since we own the lock, pi_state->owner == current is the
+ * stable state; anything else needs more attention.
*/
if (q->pi_state->owner != current)
ret = fixup_pi_state_owner(uaddr, q, current);
goto out;
}

+ /*
+ * If we didn't get the lock, check if anybody stole it from us. In
+ * that case, we need to fix up the uval to point to them instead of
+ * us, otherwise bad things happen. [10]
+ *
+ * Another speculative read; pi_state->owner == current is unstable
+ * but needs our attention.
+ */
+ if (q->pi_state->owner == current) {
+ ret = fixup_pi_state_owner(uaddr, q, NULL);
+ goto out;
+ }
+
/*
* Paranoia check. If we did not take the lock, then we should not be
* the owner of the rt_mutex.
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6f3dba6e4e9e..65cc0cb984e6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1290,6 +1290,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
return ret;
}

+static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
+{
+ int ret = try_to_take_rt_mutex(lock, current, NULL);
+
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters bit
+ * unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ return ret;
+}
+
/*
* Slow path try-lock function:
*/
@@ -1312,13 +1325,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
*/
raw_spin_lock_irqsave(&lock->wait_lock, flags);

- ret = try_to_take_rt_mutex(lock, current, NULL);
-
- /*
- * try_to_take_rt_mutex() sets the lock waiters bit
- * unconditionally. Clean this up.
- */
- fixup_rt_mutex_waiters(lock);
+ ret = __rt_mutex_slowtrylock(lock);

raw_spin_unlock_irqrestore(&lock->wait_lock, flags);

@@ -1505,6 +1512,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
return rt_mutex_slowtrylock(lock);
}

+int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
+{
+ return __rt_mutex_slowtrylock(lock);
+}
+
/**
* rt_mutex_timed_lock - lock a rt_mutex interruptible
* the timeout structure is provided
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 124e98ca0b17..68686b3ec3c1 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -148,6 +148,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter);

extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+extern int __rt_mutex_futex_trylock(struct rt_mutex *l);

extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
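
For completeness, the invariant all of this protects: userspace's PI
fast path trusts the TID bits in the futex word to name the real owner.
A minimal sketch of the userspace half of the protocol (glibc-style,
simplified; futex_word and pi_lock() are illustration only, error
handling omitted):

#include <stdatomic.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

static _Atomic uint32_t futex_word;	/* 0 == unlocked */

static void pi_lock(void)
{
	uint32_t zero = 0;
	uint32_t tid = (uint32_t)syscall(SYS_gettid);

	/* Uncontended fast path: CAS 0 -> TID, no syscall. */
	if (atomic_compare_exchange_strong(&futex_word, &zero, tid))
		return;

	/*
	 * Contended: the kernel queues us on the rt_mutex and from here
	 * on maintains the word itself: TID of the owner, plus
	 * FUTEX_WAITERS (and possibly FUTEX_OWNER_DIED).
	 */
	syscall(SYS_futex, &futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

If the kernel leaves a stale TID in the word (one naming a task that no
longer owns the rt_mutex), things like FUTEX_UNLOCK_PI, which compares
the caller's TID against the word, operate on the wrong task; that is
the "bad things happen" in the comment above.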