Re: [PATCH v3 4/4] futex: Avoid taking hb lock if nothing to wakeup

From: Davidlohr Bueso
Date: Fri Dec 20 2013 - 20:36:18 EST


On Thu, 2013-12-19 at 15:14 -0800, Linus Torvalds wrote:
> On Thu, Dec 19, 2013 at 10:45 AM, Davidlohr Bueso <davidlohr@xxxxxx> wrote:
> >
> > - increment the counter at queue_lock() as we always end up calling
> > queue_me() which adds the element to the list. Upon any error,
> > queue_unlock() is called for housekeeping, for which we decrement
> > to mach the increment done in queue_lock().
> >
> > - decrement the counter at __unqueue_me() to reflect when an element is
> > removed from the queue for wakeup related purposes.
>
> I still hate this whole separate counter thing. It seems really annoying.
>
> If re-ordering things didn't work out, then why can't just the counter
> we *already* have in the spinlock itself work as the counter? Your
> counter update logic seems to basically match when you take the
> spinlock anyway.

So the following has passed all testing, just like the atomics variant.
Thoughts?

Thanks,
Davidlohr

diff --git a/kernel/futex.c b/kernel/futex.c
index fcc6850..c8c7ce5 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -73,19 +73,22 @@
* Basic futex operation and ordering guarantees:
*
* The waiter reads the futex value in user space and calls
- * futex_wait(). This function computes the hash bucket and acquires
- * the hash bucket lock. After that it reads the futex user space value
- * again and verifies that the data has not changed. If it has not
- * changed it enqueues itself into the hash bucket, releases the hash
+ * futex_wait(). It computes the hash bucket and acquires the hash
+ * bucket lock. After that it reads the futex user space value again
+ * and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash
* bucket lock and schedules.
*
* The waker side modifies the user space value of the futex and calls
- * futex_wake(). This functions computes the hash bucket and acquires
- * the hash bucket lock. Then it looks for waiters on that futex in the
- * hash bucket and wakes them.
+ * futex_wake(). It computes the hash bucket and acquires the hash
+ * bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
*
- * Note that the spin_lock serializes waiters and wakers, so that the
- * following scenario is avoided:
+ * In scenarios where wakeups are called and no tasks are blocked on a futex,
+ * taking the hb spinlock can be avoided and simply return. In order for this
+ * optimization to work, ordering guarantees must exist so that the waiter
+ * being added to the list is acknowledged when the list is concurrently being
+ * checked by the waker, avoiding scenarios like the following:
*
* CPU 0 CPU 1
* val = *futex;
@@ -106,24 +109,50 @@
* This would cause the waiter on CPU 0 to wait forever because it
* missed the transition of the user space value from val to newval
* and the waker did not find the waiter in the hash bucket queue.
- * The spinlock serializes that:
+ *
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
*
* CPU 0 CPU 1
* val = *futex;
* sys_futex(WAIT, futex, val);
* futex_wait(futex, val);
- * lock(hash_bucket(futex));
- * uval = *futex;
- * *futex = newval;
- * sys_futex(WAKE, futex);
- * futex_wake(futex);
- * lock(hash_bucket(futex));
+ *
+ * waiters++;
+ * mb(); (A) <-- paired with -.
+ * |
+ * lock(hash_bucket(futex)); |
+ * |
+ * uval = *futex; |
+ * | *futex = newval;
+ * | sys_futex(WAKE, futex);
+ * | futex_wake(futex);
+ * |
+ * `-------> mb(); (B)
* if (uval == val)
- * queue();
+ * queue();
* unlock(hash_bucket(futex));
- * schedule(); if (!queue_empty())
- * wake_waiters(futex);
- * unlock(hash_bucket(futex));
+ * schedule(); if (waiters)
+ * lock(hash_bucket(futex));
+ * wake_waiters(futex);
+ * unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read -- this
+ * is guaranteed by the head counter in the hb spinlock; and where (B)
+ * orders the write to futex and the waiters read.
+ *
+ * This yields the following case (where X:=waiters, Y:=futex):
+ *
+ * X = Y = 0
+ *
+ * w[X]=1 w[Y]=1
+ * MB MB
+ * r[Y]=y r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
+ * the guarantee that we cannot both miss the futex variable change and the
+ * enqueue.
*/

int __read_mostly futex_cmpxchg_enabled;
@@ -211,6 +240,35 @@ static unsigned long __read_mostly futex_hashsize;

static struct futex_hash_bucket *futex_queues;

+static inline void futex_get_mm(union futex_key *key)
+{
+ atomic_inc(&key->private.mm->mm_count);
+#ifdef CONFIG_SMP
+ /*
+ * Ensure futex_get_mm() implies a full barrier such that
+ * get_futex_key() implies a full barrier. This is relied upon
+ * as full barrier (B), see the ordering comment above.
+ */
+ smp_mb__after_atomic_inc();
+#endif
+}
+
+static inline bool hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ /*
+ * If the hash bucket is locked then we know the ticket counter
+ * is non-zero and thus there is at least one waiter in the queue.
+ */
+ if (spin_is_locked(&hb->lock))
+ return true;
+ smp_rmb(); /* Make sure we check the lock state first */
+ return !plist_head_empty(&hb->chain);
+#else
+ return true;
+#endif
+}
+
/*
* We hash on the keys returned from get_futex_key (see below).
*/
@@ -245,10 +303,10 @@ static void get_futex_key_refs(union futex_key *key)

switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
- ihold(key->shared.inode);
+ ihold(key->shared.inode); /* implies MB (B) */
break;
case FUT_OFF_MMSHARED:
- atomic_inc(&key->private.mm->mm_count);
+ futex_get_mm(key); /* implies MB (B) */
break;
}
}
@@ -322,7 +380,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
if (!fshared) {
key->private.mm = mm;
key->private.address = address;
- get_futex_key_refs(key);
+ get_futex_key_refs(key); /* implies MB (B) */
return 0;
}

@@ -1052,6 +1110,11 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
goto out;

hb = hash_futex(&key);
+
+ /* Make sure we really have tasks to wakeup */
+ if (!hb_waiters_pending(hb))
+ goto out_put_key;
+
spin_lock(&hb->lock);

plist_for_each_entry_safe(this, next, &hb->chain, list) {
@@ -1072,6 +1135,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
}

spin_unlock(&hb->lock);
+out_put_key:
put_futex_key(&key);
out:
return ret;
@@ -1535,7 +1599,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
hb = hash_futex(&q->key);
q->lock_ptr = &hb->lock;

- spin_lock(&hb->lock);
+ spin_lock(&hb->lock); /* implies MB (A) */
return hb;
}



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/