[PATCH] futex: Support smaller futexes of one byte or two byte size.

From: Malte Skarupke
Date: Fri Dec 20 2019 - 14:10:09 EST


This is option 1 out of 2. In both versions only the operations WAIT, WAKE,
REQUEUE and CMP_REQUEUE are supported. In this version WAIT and CMP_REQUEUE
take a size argument, WAKE and REQUEUE do not and work on futexes of all
sizes. The other option requires a size for those operations, too. See the
mailing list for discussions as to which option we should use.

None of these operations handle overlapping futexes. The mental model is
comparison+hashtable lookup and only an exact match on the address will find
the matching futex between WAIT and WAKE. An alternative would have been the
MONITOR/MWAIT mental model to handle overlapping futexes of different sizes,
but for mutexes and condition variables I just needed the normal futex
behavior. (except with smaller types)

There are two reasons for adding smaller futexes:
1. People often want smaller mutexes. Having mutexes that are only one byte in
size was the first reason WebKit gave for re-implementing futexes:
https://webkit.org/blog/6161/locking-in-webkit/

2. The C++ standard added futexes to the standard library in C++20 under the
name atomic_wait and atomic_notify:
http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p1135r5.html
The C++ version supports this for atomic variables of any size. The more sizes
we can support, the better the standard implementation can be on Linux.

I changed the location of two flags that used to be stored in the low bits of
the address, since the address is no longer aligned on four bytes. Luckily the
high bits were free as long as PAGE_SIZE is never more than 1 gigabyte.

The reason for only supporting 8 bit futexes and 16 bit futexes is that those
were easy to support with the existing interface. 64 bit futexes would require
bigger changes.

The reason for only supporting WAIT/WAKE and REQUEUE/CMP_REQUEUE is that those
are the only operations I need right now. (in fact the C++ standard only needs
WAIT/WAKE) The other operations are more complicated and will have to wait.

Signed-off-by: Malte Skarupke <malteskarupke@xxxxxx>
---
include/linux/futex.h | 10 ++-
include/uapi/linux/futex.h | 7 +-
kernel/futex.c | 155 +++++++++++++++++++++++++++++++------
3 files changed, 145 insertions(+), 27 deletions(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 5cc3fed27d4c..3655e026c914 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -16,8 +16,8 @@ struct task_struct;
* The key type depends on whether it's a shared or private mapping.
* Don't rearrange members without looking at hash_futex().
*
- * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
- * We use the two low order bits of offset to tell what is the kind of key :
+ * offset is the position within a page and is in the range [0, PAGE_SIZE).
+ * The high bits of offset indicate what kind of key this is :
* 00 : Private process futex (PTHREAD_PROCESS_PRIVATE)
* (no reference on an inode or mm)
* 01 : Shared futex (PTHREAD_PROCESS_SHARED)
@@ -26,8 +26,10 @@ struct task_struct;
* (but private mapping on an mm, and reference taken on it)
*/

-#define FUT_OFF_INODE 1 /* We set bit 0 if key has a reference on inode */
-#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */
+/* We set the next highest bit if key has a reference on inode */
+#define FUT_OFF_INODE PAGE_SIZE
+/* We set the bit after that if key has a reference on mm */
+#define FUT_OFF_MMSHARED (PAGE_SIZE << 1)

union futex_key {
struct {
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index a89eb0accd5e..1121603c5b64 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -24,7 +24,12 @@

#define FUTEX_PRIVATE_FLAG 128
#define FUTEX_CLOCK_REALTIME 256
-#define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
+#define FUTEX_SIZE_8_BITS 512
+#define FUTEX_SIZE_16_BITS 1024
+#define FUTEX_SIZE_32_BITS 0
+#define FUTEX_ALL_SIZE_BITS (FUTEX_SIZE_8_BITS | FUTEX_SIZE_16_BITS)
+#define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME | \
+ FUTEX_ALL_SIZE_BITS)

#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
diff --git a/kernel/futex.c b/kernel/futex.c
index 03c518e9747e..0e55295fa23e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -183,6 +183,9 @@ static int __read_mostly futex_cmpxchg_enabled;
#endif
#define FLAGS_CLOCKRT 0x02
#define FLAGS_HAS_TIMEOUT 0x04
+#define FLAGS_8_BITS 0x08
+#define FLAGS_16_BITS 0x10
+#define FLAGS_SIZE_BITS (FLAGS_8_BITS | FLAGS_16_BITS)

/*
* Priority Inheritance state:
@@ -473,7 +476,14 @@ static void drop_futex_key_refs(union futex_key *key)

enum futex_access {
FUTEX_READ,
- FUTEX_WRITE
+ FUTEX_WRITE,
+ /*
+ * for operations that only need the address and don't touch the
+ * memory, like FUTEX_WAKE or FUTEX_REQUEUE. (not FUTEX_CMP_REQUEUE)
+ * this will skip the size checks of the futex, allowing those
+ * operations to be used with futexes of any size.
+ */
+ FUTEX_NONE
};

/**
@@ -505,10 +515,20 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
return timeout;
}

+static inline int futex_size(int flags)
+{
+ if (flags & FLAGS_8_BITS)
+ return sizeof(u8);
+ else if (flags & FLAGS_16_BITS)
+ return sizeof(u16);
+ else
+ return sizeof(u32);
+}
+
/**
* get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex
- * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @flags: futex flags (FLAGS_SHARED, FLAGS_8_BITS etc.)
* @key: address where result is stored.
* @rw: mapping needs to be read/write (values: FUTEX_READ,
* FUTEX_WRITE)
@@ -524,23 +544,29 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
* lock_page() might sleep, the caller should not hold a spinlock.
*/
static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw)
+get_futex_key(u32 __user *uaddr, int flags, union futex_key *key, enum futex_access rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
struct page *page, *tail;
struct address_space *mapping;
- int err, ro = 0;
+ int err, fshared, size, ro = 0;
+
+ fshared = flags & FLAGS_SHARED;
+ if (rw == FUTEX_NONE)
+ size = 1;
+ else
+ size = futex_size(flags);

/*
* The futex address must be "naturally" aligned.
*/
key->both.offset = address % PAGE_SIZE;
- if (unlikely((address % sizeof(u32)) != 0))
+ if (unlikely((address & (size - 1)) != 0))
return -EINVAL;
address -= key->both.offset;

- if (unlikely(!access_ok(uaddr, sizeof(u32))))
+ if (unlikely(!access_ok(uaddr, size)))
return -EFAULT;

if (unlikely(should_fail_futex(fshared)))
@@ -570,7 +596,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a
* If write access is not required (eg. FUTEX_WAIT), try
* and get read-only access.
*/
- if (err == -EFAULT && rw == FUTEX_READ) {
+ if (err == -EFAULT && rw != FUTEX_WRITE) {
err = get_user_pages_fast(address, 1, 0, &page);
ro = 1;
}
@@ -792,6 +818,18 @@ static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
return ret;
}

+static inline int __get_sized_futex_value(u32 *dest, u32 __user *from, int size)
+{
+ switch (size) {
+ case 1:
+ return __get_user(*dest, (u8 __user *)from);
+ case 2:
+ return __get_user(*dest, (u16 __user *)from);
+ default:
+ return __get_user(*dest, from);
+ }
+}
+
static int get_futex_value_locked(u32 *dest, u32 __user *from)
{
int ret;
@@ -803,6 +841,26 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
return ret ? -EFAULT : 0;
}

+static int get_sized_futex_value_locked(u32 *dest, u32 __user *from, int size)
+{
+ int ret;
+
+ pagefault_disable();
+ ret = __get_sized_futex_value(dest, from, size);
+ pagefault_enable();
+
+ return ret ? -EFAULT : 0;
+}
+
+static int get_sized_futex_value(u32 *dest, u32 __user *from, int size)
+{
+ might_fault();
+ if (access_ok(from, size))
+ return __get_sized_futex_value(dest, from, size);
+ else
+ return -EFAULT;
+}
+

/*
* PI code:
@@ -1663,7 +1721,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!bitset)
return -EINVAL;

- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
+ ret = get_futex_key(uaddr, flags, &key, FUTEX_NONE);
if (unlikely(ret != 0))
goto out;

@@ -1762,10 +1820,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
DEFINE_WAKE_Q(wake_q);

retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+ ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+ ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
if (unlikely(ret != 0))
goto out_put_key1;

@@ -2047,11 +2105,12 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
}

retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+ ret = get_futex_key(uaddr1, flags, &key1,
+ cmpval ? FUTEX_READ : FUTEX_NONE);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
- requeue_pi ? FUTEX_WRITE : FUTEX_READ);
+ ret = get_futex_key(uaddr2, flags, &key2,
+ requeue_pi ? FUTEX_WRITE : FUTEX_NONE);
if (unlikely(ret != 0))
goto out_put_key1;

@@ -2073,14 +2132,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,

if (likely(cmpval != NULL)) {
u32 curval;
+ int size = futex_size(flags);

- ret = get_futex_value_locked(&curval, uaddr1);
+ ret = get_sized_futex_value_locked(&curval, uaddr1, size);

if (unlikely(ret)) {
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);

- ret = get_user(curval, uaddr1);
+ ret = get_sized_futex_value(&curval, uaddr1, size);
if (ret)
goto out_put_keys;

@@ -2727,7 +2787,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
struct futex_q *q, struct futex_hash_bucket **hb)
{
u32 uval;
- int ret;
+ int ret, size = futex_size(flags);

/*
* Access the page AFTER the hash-bucket is locked.
@@ -2748,19 +2808,19 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
* while the syscall executes.
*/
retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
+ ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ);
if (unlikely(ret != 0))
return ret;

retry_private:
*hb = queue_lock(q);

- ret = get_futex_value_locked(&uval, uaddr);
+ ret = get_sized_futex_value_locked(&uval, uaddr, size);

if (ret) {
queue_unlock(*hb);

- ret = get_user(uval, uaddr);
+ ret = get_sized_futex_value(&uval, uaddr, size);
if (ret)
goto out;

@@ -2893,7 +2953,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);

retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
+ ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
if (unlikely(ret != 0))
goto out;

@@ -3084,7 +3144,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
if ((uval & FUTEX_TID_MASK) != vpid)
return -EPERM;

- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
+ ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
if (ret)
return ret;

@@ -3321,7 +3381,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
rt_mutex_init_waiter(&rt_waiter);

- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+ ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
if (unlikely(ret != 0))
goto out;

@@ -3847,6 +3907,13 @@ void futex_exit_release(struct task_struct *tsk)
futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}

+int futex_op_to_size_flags(int op)
+{
+ int size_flags = op & FUTEX_ALL_SIZE_BITS;
+
+ return (size_flags / FUTEX_SIZE_8_BITS) * FLAGS_8_BITS;
+}
+
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
@@ -3862,6 +3929,50 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
cmd != FUTEX_WAIT_REQUEUE_PI)
return -ENOSYS;
}
+ if (op & FUTEX_ALL_SIZE_BITS) {
+ flags |= futex_op_to_size_flags(op);
+ if ((flags & FLAGS_SIZE_BITS) == FLAGS_SIZE_BITS) {
+ /*
+ * if both bits are set we could silently treat that as
+ * a 32 bit futex, but we may want to use this pattern
+ * in the future to indicate a 64 bit futex or an
+ * arbitrarily sized futex. so we don't want code
+ * relying on this yet.
+ */
+ return -EINVAL;
+ }
+ switch (cmd) {
+ case FUTEX_CMP_REQUEUE:
+ /*
+ * for cmp_requeue, only uaddr has to be of the size
+ * passed in the flags. uaddr2 can be of any size
+ */
+ case FUTEX_WAIT:
+ case FUTEX_WAIT_BITSET:
+ break;
+ case FUTEX_WAKE:
+ case FUTEX_WAKE_BITSET:
+ case FUTEX_REQUEUE:
+ /*
+ * these instructions work with sized futexes, but you
+ * don't need to pass the size. we could silently
+ * ignore the size argument, but the code won't verify
+ * that the correct size is used, so it's preferable
+ * to make that clear to the caller.
+ *
+ * for requeue the meaning would also be ambiguous: do
+ * both of them have to be the same size or not? they
+ * don't, and that's clearer when you just don't pass
+ * a size argument.
+ */
+ return -EINVAL;
+ default:
+ /*
+ * all other cases are not supported yet
+ */
+ return -EINVAL;
+ }
+ }

switch (cmd) {
case FUTEX_LOCK_PI:
--
2.17.1