[RFC PATCH 1/2] futex: rely on get_user_pages() for shared futexes

From: Peter Zijlstra
Date: Fri Apr 04 2008 - 15:39:01 EST


On the way to getting rid of the mmap_sem requirement for shared futexes,
start by relying on get_user_pages().

This requires that we take a reference on the page associated with the key,
and drop that reference (put_page()) when we're done with it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
---
include/linux/futex.h | 12 ++--
kernel/futex.c | 122 ++++++++++++++++++++------------------------------
2 files changed, 55 insertions(+), 79 deletions(-)

Index: linux-2.6/include/linux/futex.h
===================================================================
--- linux-2.6.orig/include/linux/futex.h
+++ linux-2.6/include/linux/futex.h
@@ -124,18 +124,14 @@ handle_futex_death(u32 __user *uaddr, st
* 00 : Private process futex (PTHREAD_PROCESS_PRIVATE)
* (no reference on an inode or mm)
* 01 : Shared futex (PTHREAD_PROCESS_SHARED)
- * mapped on a file (reference on the underlying inode)
- * 10 : Shared futex (PTHREAD_PROCESS_SHARED)
- * (but private mapping on an mm, and reference taken on it)
-*/
+ */

-#define FUT_OFF_INODE 1 /* We set bit 0 if key has a reference on inode */
-#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */
+#define FUT_OFF_PAGE 1

union futex_key {
struct {
unsigned long pgoff;
- struct inode *inode;
+ struct page *page;
int offset;
} shared;
struct {
@@ -150,6 +146,8 @@ union futex_key {
} both;
};

+#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } }
+
#ifdef CONFIG_FUTEX
extern void exit_robust_list(struct task_struct *curr);
extern void exit_pi_state_list(struct task_struct *curr);
Index: linux-2.6/kernel/futex.c
===================================================================
--- linux-2.6.orig/kernel/futex.c
+++ linux-2.6/kernel/futex.c
@@ -190,7 +190,6 @@ static int get_futex_key(u32 __user *uad
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
struct page *page;
int err;

@@ -202,6 +201,9 @@ static int get_futex_key(u32 __user *uad
return -EINVAL;
address -= key->both.offset;

+ if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
+ return -EFAULT;
+
/*
* PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs
@@ -210,67 +212,37 @@ static int get_futex_key(u32 __user *uad
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
- if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
- return -EFAULT;
key->private.mm = mm;
key->private.address = address;
return 0;
}
- /*
- * The futex is hashed differently depending on whether
- * it's in a shared or private mapping. So check vma first.
- */
- vma = find_extend_vma(mm, address);
- if (unlikely(!vma))
- return -EFAULT;

- /*
- * Permissions.
- */
- if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
- return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
+ err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
+ if (err < 0)
+ return err;
+
+ key->shared.page = page;
+ key->both.offset |= FUT_OFF_PAGE;

/*
- * Private mappings are handled in a simple way.
- *
- * NOTE: When userspace waits on a MAP_SHARED mapping, even if
- * it's a read-only handle, it's expected that futexes attach to
- * the object not the particular process. Therefore we use
- * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
- * mappings of _writable_ handles.
+ * doesn't really matter anyway, as we'll end up finding the
+ * same page again
*/
- if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
- key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
- key->private.mm = mm;
+ if (PageAnon(page))
key->private.address = address;
- return 0;
- }
+ else
+ key->shared.pgoff = page->index;

- /*
- * Linear file mappings are also simple.
- */
- key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
- key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
- if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
- key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
- + vma->vm_pgoff);
- return 0;
- }
+ return 0;
+}

- /*
- * We could walk the page table to read the non-linear
- * pte, and get the page index without fetching the page
- * from swap. But that's a lot of code to duplicate here
- * for a rare case, so we simply fetch the page.
- */
- err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
- if (err >= 0) {
- key->shared.pgoff =
- page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- put_page(page);
- return 0;
- }
- return err;
+static void put_futex_key(struct rw_semaphore *fshared, union futex_key *key)
+{
+ if (!key->both.ptr)
+ return;
+
+ if (key->both.offset & FUT_OFF_PAGE)
+ put_page(key->shared.page);
}

/*
@@ -280,16 +252,13 @@ static int get_futex_key(u32 __user *uad
*/
static void get_futex_key_refs(union futex_key *key)
{
- if (key->both.ptr == 0)
+ if (!key->both.ptr)
return;
- switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
- case FUT_OFF_INODE:
- atomic_inc(&key->shared.inode->i_count);
- break;
- case FUT_OFF_MMSHARED:
- atomic_inc(&key->private.mm->mm_count);
- break;
- }
+
+ if (key->both.offset & FUT_OFF_PAGE)
+ get_page(key->shared.page);
+ else
+ atomic_inc(&key->private.mm->mm_count);
}

/*
@@ -300,14 +269,11 @@ static void drop_futex_key_refs(union fu
{
if (!key->both.ptr)
return;
- switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
- case FUT_OFF_INODE:
- iput(key->shared.inode);
- break;
- case FUT_OFF_MMSHARED:
- mmdrop(key->private.mm);
- break;
- }
+
+ if (key->both.offset & FUT_OFF_PAGE)
+ put_page(key->shared.page);
+ else
+ mmdrop(key->private.mm);
}

static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
@@ -733,7 +699,7 @@ static int futex_wake(u32 __user *uaddr,
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
struct plist_head *head;
- union futex_key key;
+ union futex_key key = FUTEX_KEY_INIT;
int ret;

futex_lock_mm(fshared);
@@ -760,6 +726,7 @@ static int futex_wake(u32 __user *uaddr,

spin_unlock(&hb->lock);
out:
+ put_futex_key(fshared, &key);
futex_unlock_mm(fshared);
return ret;
}
@@ -773,7 +740,7 @@ futex_wake_op(u32 __user *uaddr1, struct
u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op)
{
- union futex_key key1, key2;
+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb1, *hb2;
struct plist_head *head;
struct futex_q *this, *next;
@@ -873,6 +840,8 @@ retry:
if (hb1 != hb2)
spin_unlock(&hb2->lock);
out:
+ put_futex_key(fshared, &key2);
+ put_futex_key(fshared, &key1);
futex_unlock_mm(fshared);

return ret;
@@ -886,7 +855,7 @@ static int futex_requeue(u32 __user *uad
u32 __user *uaddr2,
int nr_wake, int nr_requeue, u32 *cmpval)
{
- union futex_key key1, key2;
+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb1, *hb2;
struct plist_head *head1;
struct futex_q *this, *next;
@@ -978,6 +947,8 @@ out_unlock:
drop_futex_key_refs(&key1);

out:
+ put_futex_key(fshared, &key2);
+ put_futex_key(fshared, &key1);
futex_unlock_mm(fshared);
return ret;
}
@@ -1185,6 +1156,7 @@ static int futex_wait(u32 __user *uaddr,
retry:
futex_lock_mm(fshared);

+ q.key = FUTEX_KEY_INIT;
ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
@@ -1324,6 +1296,7 @@ static int futex_wait(u32 __user *uaddr,
queue_unlock(&q, hb);

out_release_sem:
+ put_futex_key(fshared, &q.key);
futex_unlock_mm(fshared);
return ret;
}
@@ -1373,6 +1346,7 @@ static int futex_lock_pi(u32 __user *uad
retry:
futex_lock_mm(fshared);

+ q.key = FUTEX_KEY_INIT;
ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
@@ -1587,6 +1561,7 @@ static int futex_lock_pi(u32 __user *uad
queue_unlock(&q, hb);

out_release_sem:
+ put_futex_key(fshared, &q.key);
futex_unlock_mm(fshared);
return ret;

@@ -1629,7 +1604,7 @@ static int futex_unlock_pi(u32 __user *u
struct futex_q *this, *next;
u32 uval;
struct plist_head *head;
- union futex_key key;
+ union futex_key key = FUTEX_KEY_INIT;
int ret, attempt = 0;

retry:
@@ -1702,6 +1677,7 @@ retry_unlocked:
out_unlock:
spin_unlock(&hb->lock);
out:
+ put_futex_key(fshared, &key);
futex_unlock_mm(fshared);

return ret;
@@ -1822,6 +1798,7 @@ static int futex_fd(u32 __user *uaddr, i

fshared = &current->mm->mmap_sem;
down_read(fshared);
+ q->key = FUTEX_KEY_INIT;
err = get_futex_key(uaddr, fshared, &q->key);

if (unlikely(err != 0)) {
@@ -1837,6 +1814,7 @@ static int futex_fd(u32 __user *uaddr, i
filp->private_data = q;

queue_me(q, ret, filp);
+ put_futex_key(fshared, &q->key);
up_read(fshared);

/* Now we map fd to filp, so userspace can access it */

--

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/