Re: [PATCH -v5 07/14] futex: Change locking rules

From: Sebastian Andrzej Siewior
Date: Tue Mar 07 2017 - 12:12:39 EST


From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Date: Sat, 4 Mar 2017 10:27:24 +0100
Subject: [PATCH] futex: Change locking rules

Currently futex-pi relies on hb->lock to serialize everything. Since
hb->lock is giving us problems (PI inversions among other things,
since on -rt hb lock itself is a rt_mutex), we want to break this up a
bit.

This patch reworks and documents the locking. Notably, it
consistently uses rt_mutex::wait_lock to serialize {uval, pi_state}.
This would allow us to do rt_mutex_unlock() (including deboost)
without holding hb->lock.

Nothing yet relies on the new locking rules.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
---
On 2017-03-07 14:22:14 [+0100], Thomas Gleixner wrote:
> Both 'return' statements leak &pi_state->pi_mutex.wait_lock ....

this has unlock in both 'return's.

kernel/futex.c | 161 +++++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 129 insertions(+), 32 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index af022919933a..eda79e6c4e45 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -973,6 +973,39 @@ void exit_pi_state_list(struct task_struct *curr)
*
* [10] There is no transient state which leaves owner and user space
* TID out of sync.
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ * hb -> futex_q, relation
+ * futex_q -> pi_state, relation
+ *
+ * (cannot be raw because hb can contain arbitrary amount
+ * of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ * {uval, pi_state}
+ *
+ * (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ * p->pi_state_list -> pi_state->list, relation
+ *
+ * pi_state->refcount:
+ *
+ * pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ * hb->lock
+ * pi_mutex->wait_lock
+ * p->pi_lock
+ *
*/

/*
@@ -980,10 +1013,12 @@ void exit_pi_state_list(struct task_struct *curr)
* the pi_state against the user space value. If correct, attach to
* it.
*/
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
struct futex_pi_state **ps)
{
pid_t pid = uval & FUTEX_TID_MASK;
+ int ret, uval2;

/*
* Userspace might have messed up non-PI and PI futexes [3]
@@ -991,9 +1026,34 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
if (unlikely(!pi_state))
return -EINVAL;

+ /*
+ * We get here with hb->lock held, and having found a
+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+ * which in turn means that futex_lock_pi() still has a reference on
+ * our pi_state.
+ */
WARN_ON(!atomic_read(&pi_state->refcount));

/*
+ * Now that we have a pi_state, we can acquire wait_lock
+ * and do the state validation.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Since {uval, pi_state} is serialized by wait_lock, and our current
+ * uval was read without holding it, it can have changed. Verify it
+ * still is what we expect it to be, otherwise retry the entire
+ * operation.
+ */
+ if (get_futex_value_locked(&uval2, uaddr))
+ goto out_efault;
+
+ if (uval != uval2)
+ goto out_eagain;
+
+ /*
* Handle the owner died case:
*/
if (uval & FUTEX_OWNER_DIED) {
@@ -1008,11 +1068,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* is not 0. Inconsistent state. [5]
*/
if (pid)
- return -EINVAL;
+ goto out_einval;
/*
* Take a ref on the state and return success. [4]
*/
- goto out_state;
+ goto out_attach;
}

/*
@@ -1024,14 +1084,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* Take a ref on the state and return success. [6]
*/
if (!pid)
- goto out_state;
+ goto out_attach;
} else {
/*
* If the owner died bit is not set, then the pi_state
* must have an owner. [7]
*/
if (!pi_state->owner)
- return -EINVAL;
+ goto out_einval;
}

/*
@@ -1040,11 +1100,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* user space TID. [9/10]
*/
if (pid != task_pid_vnr(pi_state->owner))
- return -EINVAL;
-out_state:
+ goto out_einval;
+
+out_attach:
atomic_inc(&pi_state->refcount);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
*ps = pi_state;
return 0;
+
+out_einval:
+ ret = -EINVAL;
+ goto out_error;
+
+out_eagain:
+ ret = -EAGAIN;
+ goto out_error;
+
+out_efault:
+ ret = -EFAULT;
+ goto out_error;
+
+out_error:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
}

/*
@@ -1095,6 +1173,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,

/*
* No existing pi state. First waiter. [2]
+ *
+ * This creates pi_state, we have hb->lock held, this means nothing can
+ * observe this state, wait_lock is irrelevant.
*/
pi_state = alloc_pi_state();

@@ -1119,7 +1200,8 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
return 0;
}

-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_hash_bucket *hb,
union futex_key *key, struct futex_pi_state **ps)
{
struct futex_q *top_waiter = futex_top_waiter(hb, key);
@@ -1129,7 +1211,7 @@ static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
* attach to the pi_state when the validation succeeds.
*/
if (top_waiter)
- return attach_to_pi_state(uval, top_waiter->pi_state, ps);
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

/*
* We are the first waiter - try to look up the owner based on
@@ -1148,7 +1230,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;

- /*If user space value changed, let the caller retry */
+ /* If user space value changed, let the caller retry */
return curval != uval ? -EAGAIN : 0;
}

@@ -1204,7 +1286,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
*/
top_waiter = futex_top_waiter(hb, key);
if (top_waiter)
- return attach_to_pi_state(uval, top_waiter->pi_state, ps);
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

/*
* No waiter and user TID is 0. We are here because the
@@ -1336,6 +1418,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter

if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
ret = -EFAULT;
+
} else if (curval != uval) {
/*
* If a unconditional UNLOCK_PI operation (user space did not
@@ -1348,6 +1431,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter
else
ret = -EINVAL;
}
+
if (ret) {
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
return ret;
@@ -1823,7 +1907,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
* If that call succeeds then we have pi_state and an
* initial refcount on it.
*/
- ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
+ ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
}

switch (ret) {
@@ -2122,10 +2206,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
{
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
- struct task_struct *oldowner = pi_state->owner;
u32 uval, uninitialized_var(curval), newval;
+ struct task_struct *oldowner;
int ret;

+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ oldowner = pi_state->owner;
/* Owner died? */
if (!pi_state->owner)
newtid |= FUTEX_OWNER_DIED;
@@ -2141,11 +2228,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* because we can fault here. Imagine swapped out pages or a fork
* that marked all the anonymous memory readonly for cow.
*
- * Modifying pi_state _before_ the user space value would
- * leave the pi_state in an inconsistent state when we fault
- * here, because we need to drop the hash bucket lock to
- * handle the fault. This might be observed in the PID check
- * in lookup_pi_state.
+ * Modifying pi_state _before_ the user space value would leave the
+ * pi_state in an inconsistent state when we fault here, because we
+ * need to drop the locks to handle the fault. This might be observed
+ * in the PID check in lookup_pi_state.
*/
retry:
if (get_futex_value_locked(&uval, uaddr))
@@ -2166,45 +2252,56 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* itself.
*/
if (pi_state->owner != NULL) {
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
+ raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ raw_spin_unlock(&pi_state->owner->pi_lock);
}

pi_state->owner = newowner;

- raw_spin_lock_irq(&newowner->pi_lock);
+ raw_spin_lock(&newowner->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &newowner->pi_state_list);
- raw_spin_unlock_irq(&newowner->pi_lock);
+ raw_spin_unlock(&newowner->pi_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+
return 0;

/*
- * To handle the page fault we need to drop the hash bucket
- * lock here. That gives the other task (either the highest priority
- * waiter itself or the task which stole the rtmutex) the
- * chance to try the fixup of the pi_state. So once we are
- * back from handling the fault we need to check the pi_state
- * after reacquiring the hash bucket lock and before trying to
- * do another fixup. When the fixup has been done already we
- * simply return.
+ * To handle the page fault we need to drop the locks here. That gives
+ * the other task (either the highest priority waiter itself or the
+ * task which stole the rtmutex) the chance to try the fixup of the
+ * pi_state. So once we are back from handling the fault we need to
+ * check the pi_state after reacquiring the locks and before trying to
+ * do another fixup. When the fixup has been done already we simply
+ * return.
+ *
+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+ * drop hb->lock since the caller owns the hb -> futex_q relation.
+ * Dropping the pi_mutex->wait_lock requires the state revalidate.
*/
handle_fault:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(q->lock_ptr);

ret = fault_in_user_writeable(uaddr);

spin_lock(q->lock_ptr);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

/*
* Check if someone else fixed it for us:
*/
- if (pi_state->owner != oldowner)
+ if (pi_state->owner != oldowner) {
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
return 0;
+ }

- if (ret)
+ if (ret) {
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
return ret;
+ }

goto retry;
}
--
2.11.0

Sebastian