Hi,
By updating the futex value atomically with the kernel's inspection
of it when deciding whether to wait, we avoid the time window where the
futex has been set to the 'please wake me up' state but the thread has
not yet been queued onto the hash bucket. This has two effects:
- It avoids a futex syscall with the FUTEX_WAKE operation when there is
no thread to be woken yet.
- In the heavily contended case, it avoids waking an extra thread that is
only likely to make the contention problem worse.
Signed-off-by: Michel Lespinasse <walken@xxxxxxxxxx>
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 1e5a26d..c5e887d 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -20,6 +20,7 @@
#define FUTEX_WAKE_BITSET 10
#define FUTEX_WAIT_REQUEUE_PI 11
#define FUTEX_CMP_REQUEUE_PI 12
+#define FUTEX_SET_WAIT 13
#define FUTEX_PRIVATE_FLAG 128
#define FUTEX_CLOCK_REALTIME 256
@@ -39,6 +40,7 @@
FUTEX_PRIVATE_FLAG)
#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \
FUTEX_PRIVATE_FLAG)
+#define FUTEX_SET_WAIT_PRIVATE (FUTEX_SET_WAIT | FUTEX_PRIVATE_FLAG)
/*
* Support for robust futexes: the kernel cleans up held futexes at
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index a8cc4e1..a199606 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -25,6 +25,7 @@ struct restart_block {
struct {
u32 *uaddr;
u32 val;
+ u32 val2;
@@ -1722,52 +1723,61 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
*
* The basic logical guarantee of a futex is that it blocks ONLY
* if cond(var) is known to be true at the time of blocking, for
- * any cond. If we queued after testing *uaddr, that would open
- * a race condition where we could block indefinitely with
+ * any cond. If we locked the hash-bucket after testing *uaddr, that
+ * would open a race condition where we could block indefinitely with
* cond(var) false, which would violate the guarantee.
*
- * A consequence is that futex_wait() can return zero and absorb
- * a wakeup when *uaddr != val on entry to the syscall. This is
- * rare, but normal.
+ * On the other hand, we insert q and release the hash-bucket only
+ * after testing *uaddr. This guarantees that futex_wait() will NOT
+ * absorb a wakeup if *uaddr does not match the desired values
+ * while the syscall executes.
*/
retry:
q->key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
+ ret = get_futex_key(uaddr, fshared, &q->key,
+ (val == val2) ? VERIFY_READ : VERIFY_WRITE);
if (unlikely(ret != 0))
return ret;
retry_private:
*hb = queue_lock(q);
- ret = get_futex_value_locked(&uval, uaddr);
-
- if (ret) {
+ pagefault_disable();
+ if (unlikely(__copy_from_user_inatomic(&uval, uaddr, sizeof(u32)))) {
+ pagefault_enable();
queue_unlock(q, *hb);
-
ret = get_user(uval, uaddr);
+ fault_common:
if (ret)
goto out;
-
if (!fshared)
goto retry_private;
-
put_futex_key(fshared, &q->key);
goto retry;
}
-
- if (uval != val) {
- queue_unlock(q, *hb);
- ret = -EWOULDBLOCK;
+ if (val != val2 && uval == val) {
+ uval = futex_atomic_cmpxchg_inatomic(uaddr, val, val2);
+ if (unlikely(uval == -EFAULT)) {
+ pagefault_enable();
+ queue_unlock(q, *hb);
+ ret = fault_in_user_writeable(uaddr);
+ goto fault_common;
+ }
}
+ pagefault_enable();
+
+ if (uval == val || uval == val2)
+ return 0; /* success */