[PATCH v2 3/3] userfaultfd: use per-vma locks in userfaultfd operations

From: Lokesh Gidra
Date: Mon Jan 29 2024 - 14:36:45 EST


All userfaultfd operations, except write-protect, opportunistically use
per-vma locks to lock the vma that they operate on. If taking the
per-vma lock fails, we fall back to taking mmap_lock in read mode.

The write-protect operation requires mmap_lock, as it iterates over multiple vmas.
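
A rough sketch of the lock-or-fallback pattern described above
(illustrative only, not part of the diff below; example_pin_dst_vma() is
a made-up name, and the uffd-registration check that the real
find_and_pin_dst_vma() performs is omitted):

  /*
   * Illustrative only: try the per-vma read lock first, and fall back to
   * taking mmap_lock in read mode when that fails.  The caller releases
   * whichever lock was taken (see unpin_vma() in the diff).
   */
  static struct vm_area_struct *example_pin_dst_vma(struct mm_struct *mm,
                                                    unsigned long addr,
                                                    unsigned long len,
                                                    bool *mmap_locked)
  {
          /* Fast path: per-vma read lock on the vma containing addr. */
          struct vm_area_struct *vma = lock_vma_under_rcu(mm, addr);

          if (!vma) {
                  /* Slow path: take mmap_lock in read mode. */
                  mmap_read_lock(mm);
                  *mmap_locked = true;
                  vma = find_vma(mm, addr);
          }

          /* The whole range must lie within one existing vma. */
          if (vma && range_in_vma(vma, addr, addr + len))
                  return vma;

          /* Undo whichever lock was taken before reporting failure. */
          if (*mmap_locked) {
                  mmap_read_unlock(mm);
                  *mmap_locked = false;
          } else if (vma) {
                  vma_end_read(vma);
          }
          return NULL;
  }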

Signed-off-by: Lokesh Gidra <lokeshgidra@xxxxxxxxxx>
---
fs/userfaultfd.c | 13 +--
include/linux/userfaultfd_k.h | 5 +-
mm/userfaultfd.c | 175 +++++++++++++++++++++++-----------
3 files changed, 122 insertions(+), 71 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index c00a021bcce4..60dcfafdc11a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
return -EINVAL;

if (mmget_not_zero(mm)) {
- mmap_read_lock(mm);
-
- /* Re-check after taking map_changing_lock */
- down_read(&ctx->map_changing_lock);
- if (likely(!atomic_read(&ctx->mmap_changing)))
- ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
- uffdio_move.len, uffdio_move.mode);
- else
- ret = -EAGAIN;
- up_read(&ctx->map_changing_lock);
- mmap_read_unlock(mm);
+ ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
+ uffdio_move.len, uffdio_move.mode);
mmput(mm);
} else {
return -ESRCH;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 3210c3552976..05d59f74fc88 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma,
/* move_pages */
void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
-ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
- unsigned long dst_start, unsigned long src_start,
- unsigned long len, __u64 flags);
+ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+ unsigned long src_start, unsigned long len, __u64 flags);
int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
struct vm_area_struct *dst_vma,
struct vm_area_struct *src_vma,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 6e2ca04ab04d..d55bf18b80db 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -19,20 +19,39 @@
#include <asm/tlb.h>
#include "internal.h"

-static __always_inline
-struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
- unsigned long dst_start,
- unsigned long len)
+void unpin_vma(struct mm_struct *mm, struct vm_area_struct *vma, bool *mmap_locked)
+{
+ BUG_ON(!vma && !*mmap_locked);
+
+ if (*mmap_locked) {
+ mmap_read_unlock(mm);
+ *mmap_locked = false;
+ } else
+ vma_end_read(vma);
+}
+
+/*
+ * Search for VMA and make sure it is stable either by locking it or taking
+ * mmap_lock.
+ */
+struct vm_area_struct *find_and_pin_dst_vma(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len,
+ bool *mmap_locked)
{
+ struct vm_area_struct *dst_vma = lock_vma_under_rcu(dst_mm, dst_start);
+ if (!dst_vma) {
+ mmap_read_lock(dst_mm);
+ *mmap_locked = true;
+ dst_vma = find_vma(dst_mm, dst_start);
+ }
+
/*
* Make sure that the dst range is both valid and fully within a
* single existing vma.
*/
- struct vm_area_struct *dst_vma;
-
- dst_vma = find_vma(dst_mm, dst_start);
if (!range_in_vma(dst_vma, dst_start, dst_start + len))
- return NULL;
+ goto unpin;

/*
* Check the vma is registered in uffd, this is required to
@@ -40,9 +59,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
* time.
*/
if (!dst_vma->vm_userfaultfd_ctx.ctx)
- return NULL;
+ goto unpin;

return dst_vma;
+
+unpin:
+ unpin_vma(dst_mm, dst_vma, mmap_locked);
+ return NULL;
}

/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
@@ -350,7 +373,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
#ifdef CONFIG_HUGETLB_PAGE
/*
* mfill_atomic processing for HUGETLB vmas. Note that this routine is
- * called with mmap_lock held, it will release mmap_lock before returning.
+ * called with either the vma lock or mmap_lock held; it will release the
+ * lock before returning.
*/
static __always_inline ssize_t mfill_atomic_hugetlb(
struct userfaultfd_ctx *ctx,
@@ -358,7 +382,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- uffd_flags_t flags)
+ uffd_flags_t flags,
+ bool *mmap_locked)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -380,7 +405,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
*/
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
up_read(&ctx->map_changing_lock);
- mmap_read_unlock(dst_mm);
+ unpin_vma(dst_mm, dst_vma, mmap_locked);
return -EINVAL;
}

@@ -404,12 +429,25 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
*/
if (!dst_vma) {
err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, dst_start, len);
- if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
- goto out_unlock;
+ dst_vma = find_and_pin_dst_vma(dst_mm, dst_start,
+ len, mmap_locked);
+ if (!dst_vma)
+ goto out;
+ if (!is_vm_hugetlb_page(dst_vma))
+ goto out_unlock_vma;

err = -EINVAL;
if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
+ goto out_unlock_vma;
+
+ /*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ down_read(&ctx->map_changing_lock);
+ err = -EAGAIN;
+ if (atomic_read(&ctx->mmap_changing))
goto out_unlock;

vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -465,7 +503,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(

if (unlikely(err == -ENOENT)) {
up_read(&ctx->map_changing_lock);
- mmap_read_unlock(dst_mm);
+ unpin_vma(dst_mm, dst_vma, mmap_locked);
BUG_ON(!folio);

err = copy_folio_from_user(folio,
@@ -474,17 +512,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
err = -EFAULT;
goto out;
}
- mmap_read_lock(dst_mm);
- down_read(&ctx->map_changing_lock);
- /*
- * If memory mappings are changing because of non-cooperative
- * operation (e.g. mremap) running in parallel, bail out and
- * request the user to retry later
- */
- if (atomic_read(&ctx->mmap_changing)) {
- err = -EAGAIN;
- break;
- }

dst_vma = NULL;
goto retry;
@@ -505,7 +532,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(

out_unlock:
up_read(&ctx->map_changing_lock);
- mmap_read_unlock(dst_mm);
+out_unlock_vma:
+ unpin_vma(dst_mm, dst_vma, mmap_locked);
out:
if (folio)
folio_put(folio);
@@ -521,7 +549,8 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- uffd_flags_t flags);
+ uffd_flags_t flags,
+ bool *mmap_locked);
#endif /* CONFIG_HUGETLB_PAGE */

static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
@@ -581,6 +610,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
unsigned long src_addr, dst_addr;
long copied;
struct folio *folio;
+ bool mmap_locked = false;

/*
* Sanitize the command parameters:
@@ -597,7 +627,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
copied = 0;
folio = NULL;
retry:
- mmap_read_lock(dst_mm);
+ /*
+ * Make sure the vma is not shared, that the dst range is
+ * both valid and fully within a single existing vma.
+ */
+ err = -ENOENT;
+ dst_vma = find_and_pin_dst_vma(dst_mm, dst_start, len, &mmap_locked);
+ if (!dst_vma)
+ goto out;

/*
* If memory mappings are changing because of non-cooperative
@@ -609,15 +646,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
if (atomic_read(&ctx->mmap_changing))
goto out_unlock;

- /*
- * Make sure the vma is not shared, that the dst range is
- * both valid and fully within a single existing vma.
- */
- err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, dst_start, len);
- if (!dst_vma)
- goto out_unlock;
-
err = -EINVAL;
/*
* shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
@@ -638,8 +666,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
- return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
- src_start, len, flags);
+ return mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start,
+ len, flags, &mmap_locked);

if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
@@ -699,7 +727,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
void *kaddr;

up_read(&ctx->map_changing_lock);
- mmap_read_unlock(dst_mm);
+ unpin_vma(dst_mm, dst_vma, &mmap_locked);
+
BUG_ON(!folio);

kaddr = kmap_local_folio(folio, 0);
@@ -730,7 +759,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,

out_unlock:
up_read(&ctx->map_changing_lock);
- mmap_read_unlock(dst_mm);
+ unpin_vma(dst_mm, dst_vma, &mmap_locked);
out:
if (folio)
folio_put(folio);
@@ -1285,8 +1314,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
* @len: length of the virtual memory range
* @mode: flags from uffdio_move.mode
*
- * Must be called with mmap_lock held for read.
- *
* move_pages() remaps arbitrary anonymous pages atomically in zero
* copy. It only works on non shared anonymous pages because those can
* be relocated without generating non linear anon_vmas in the rmap
@@ -1353,15 +1380,16 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
* could be obtained. This is the only additional complexity added to
* the rmap code to provide this anonymous page remapping functionality.
*/
-ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
- unsigned long dst_start, unsigned long src_start,
- unsigned long len, __u64 mode)
+ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+ unsigned long src_start, unsigned long len, __u64 mode)
{
+ struct mm_struct *mm = ctx->mm;
struct vm_area_struct *src_vma, *dst_vma;
unsigned long src_addr, dst_addr;
pmd_t *src_pmd, *dst_pmd;
long err = -EINVAL;
ssize_t moved = 0;
+ bool mmap_locked = false;

/* Sanitize the command parameters. */
if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
@@ -1374,28 +1402,52 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
WARN_ON_ONCE(dst_start + len <= dst_start))
goto out;

+ dst_vma = NULL;
+ src_vma = lock_vma_under_rcu(mm, src_start);
+ if (src_vma) {
+ dst_vma = lock_vma_under_rcu(mm, dst_start);
+ if (!dst_vma)
+ vma_end_read(src_vma);
+ }
+
+ /* If we failed to lock both VMAs, fall back to mmap_lock */
+ if (!dst_vma) {
+ mmap_read_lock(mm);
+ mmap_locked = true;
+ src_vma = find_vma(mm, src_start);
+ if (!src_vma)
+ goto out_unlock_mmap;
+ dst_vma = find_vma(mm, dst_start);
+ if (!dst_vma)
+ goto out_unlock_mmap;
+ }
+
+ /* Re-check after taking map_changing_lock */
+ down_read(&ctx->map_changing_lock);
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ err = -EAGAIN;
+ goto out_unlock;
+ }
/*
* Make sure the vma is not shared, that the src and dst remap
* ranges are both valid and fully within a single existing
* vma.
*/
- src_vma = find_vma(mm, src_start);
- if (!src_vma || (src_vma->vm_flags & VM_SHARED))
- goto out;
+ if (src_vma->vm_flags & VM_SHARED)
+ goto out_unlock;
if (src_start < src_vma->vm_start ||
src_start + len > src_vma->vm_end)
- goto out;
+ goto out_unlock;

- dst_vma = find_vma(mm, dst_start);
- if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
- goto out;
+ if (dst_vma->vm_flags & VM_SHARED)
+ goto out_unlock;
if (dst_start < dst_vma->vm_start ||
dst_start + len > dst_vma->vm_end)
- goto out;
+ goto out_unlock;

err = validate_move_areas(ctx, src_vma, dst_vma);
if (err)
- goto out;
+ goto out_unlock;

for (src_addr = src_start, dst_addr = dst_start;
src_addr < src_start + len;) {
@@ -1512,6 +1564,15 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
moved += step_size;
}

+out_unlock:
+ up_read(&ctx->map_changing_lock);
+out_unlock_mmap:
+ if (mmap_locked)
+ mmap_read_unlock(mm);
+ else {
+ vma_end_read(dst_vma);
+ vma_end_read(src_vma);
+ }
out:
VM_WARN_ON(moved < 0);
VM_WARN_ON(err > 0);
--
2.43.0.429.g432eaa2c6b-goog