[RFC PATCH 09/13] fs/userfaultfd: use iov_iter for copy/zero

From: Nadav Amit
Date: Sat Nov 28 2020 - 19:51:21 EST


From: Nadav Amit <namit@xxxxxxxxxx>

Use iov_iter for the copy and zero ioctls. This is done in preparation
for supporting a write_iter() interface that would provide services
similar to UFFDIO_COPY/ZERO.

In the case of UFFDIO_ZEROPAGE, the iov_iter is not used for any
purpose other than providing the length of the range that is zeroed.

As the iterator now tracks the progress of the copy, the atomic copy
helpers no longer return -ENOENT to request a retry outside of
mmap_lock; instead, they return the number of bytes that were not
copied, and the fallback path resumes the copy from the matching
offset in the iterator.
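
For reference, the ioctl side wraps the single user source range in a
one-segment iterator before calling into mm. A minimal sketch of the
pattern (src and len stand in for uffdio_copy.src and uffdio_copy.len):

	struct iov_iter iter;
	struct iovec iov;
	int ret;

	/* Wrap the single user source range in a one-segment iterator. */
	ret = import_single_range(READ, (__force void __user *)src, len,
				  &iov, &iter);
	if (unlikely(ret))
		return ret;

	/* iov_iter_count(&iter) now reports the bytes left to copy. */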

Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
Cc: Peter Xu <peterx@xxxxxxxxxx>
Cc: Jens Axboe <axboe@xxxxxxxxx>
Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx>
Cc: io-uring@xxxxxxxxxxxxxxx
Cc: linux-fsdevel@xxxxxxxxxxxxxxx
Cc: linux-kernel@xxxxxxxxxxxxxxx
Cc: linux-mm@xxxxxxxxx
Signed-off-by: Nadav Amit <namit@xxxxxxxxxx>
---
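The user-visible ABI is unchanged: UFFDIO_COPY is still driven by
struct uffdio_copy. For reference, a short userspace sketch of the
existing usage (uffd_copy() is a hypothetical helper name):

	#include <linux/userfaultfd.h>
	#include <sys/ioctl.h>

	/* Hypothetical helper; the UFFDIO_COPY ABI itself is unchanged. */
	static long uffd_copy(int uffd, unsigned long dst,
			      unsigned long src, unsigned long len)
	{
		struct uffdio_copy copy = {
			.dst = dst,
			.src = src,
			.len = len,
			.mode = 0,
		};

		/* On success, .copy holds the number of bytes copied. */
		if (ioctl(uffd, UFFDIO_COPY, &copy) == -1)
			return -1;
		return copy.copy;
	}
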
fs/userfaultfd.c | 23 +++++++--
include/linux/hugetlb.h | 4 +-
include/linux/mm.h | 6 +--
include/linux/shmem_fs.h | 2 +-
include/linux/userfaultfd_k.h | 10 ++--
mm/hugetlb.c | 12 +++--
mm/memory.c | 38 ++++++-------
mm/shmem.c | 19 +++-----
mm/userfaultfd.c | 96 +++++++++++++++++------------------
9 files changed, 106 insertions(+), 104 deletions(-)
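
To make the new fallback convention concrete, here is the caller-side
pattern, abbreviated from __mcopy_atomic() below (a positive return
from the atomic helper is the number of bytes left uncopied, which are
then retried with page faults allowed):

	err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
			       iter, &page, zeropage, wp_copy);
	if (unlikely(err > 0)) {
		size_t bytes = err;
		size_t offset = PAGE_SIZE - bytes;

		mmap_read_unlock(dst_mm);
		/* Resume the tail of the page; may sleep and fault. */
		err = copy_page_from_iter(page, offset, bytes, iter);
		if (unlikely(err != bytes)) {
			err = -EFAULT;
			goto out;
		}
		goto retry;
	}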

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index db1a963f6ae2..7bbee2a00d37 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1914,6 +1914,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
struct uffdio_copy uffdio_copy;
struct uffdio_copy __user *user_uffdio_copy;
struct userfaultfd_wake_range range;
+ struct iov_iter iter;
+ struct iovec iov;

user_uffdio_copy = (struct uffdio_copy __user *) arg;

@@ -1940,10 +1942,15 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
goto out;
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
goto out;
+
+ ret = import_single_range(READ, (__force void __user *)uffdio_copy.src,
+ uffdio_copy.len, &iov, &iter);
+ if (unlikely(ret))
+ return ret;
+
if (mmget_not_zero(ctx->mm)) {
- ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
- uffdio_copy.len, &ctx->mmap_changing,
- uffdio_copy.mode);
+ ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, &iter,
+ &ctx->mmap_changing, uffdio_copy.mode);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1971,6 +1978,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
struct uffdio_zeropage uffdio_zeropage;
struct uffdio_zeropage __user *user_uffdio_zeropage;
struct userfaultfd_wake_range range;
+ struct iov_iter iter;
+ struct iovec iov;

user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;

@@ -1992,10 +2001,14 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
goto out;

+ ret = import_single_range(READ, (__force void __user *)0,
+ uffdio_zeropage.range.len, &iov, &iter);
+ if (unlikely(ret))
+ return ret;
+
if (mmget_not_zero(ctx->mm)) {
ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
- uffdio_zeropage.range.len,
- &ctx->mmap_changing);
+ &iter, &ctx->mmap_changing);
mmput(ctx->mm);
} else {
return -ESRCH;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ebca2ef02212..2f3452e0bb84 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -137,7 +137,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
- unsigned long src_addr,
+ struct iov_iter *iter,
struct page **pagep);
int hugetlb_reserve_pages(struct inode *inode, long from, long to,
struct vm_area_struct *vma,
@@ -312,7 +312,7 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
pte_t *dst_pte,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
- unsigned long src_addr,
+ struct iov_iter *iter,
struct page **pagep)
{
BUG();
diff --git a/include/linux/mm.h b/include/linux/mm.h
index db6ae4d3fb4e..1f183c441d89 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3085,10 +3085,10 @@ extern void copy_user_huge_page(struct page *dst, struct page *src,
unsigned long addr_hint,
struct vm_area_struct *vma,
unsigned int pages_per_huge_page);
-extern long copy_huge_page_from_user(struct page *dst_page,
- const void __user *usr_src,
+extern long copy_huge_page_from_iter(struct page *dst_page,
+ size_t offset, struct iov_iter *iter,
unsigned int pages_per_huge_page,
- bool allow_pagefault);
+ bool atomic);

/**
* vma_is_special_huge - Are transhuge page-table entries considered special?
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index a5a5d1d4d7b1..1973bb1c6a10 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -122,7 +122,7 @@ extern void shmem_uncharge(struct inode *inode, long pages);
extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
- unsigned long src_addr,
+ struct iov_iter *iter,
struct page **pagep);
extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index a8e5f3ea9bb2..b5c1be67b0d7 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -35,12 +35,10 @@ extern int sysctl_unprivileged_userfaultfd;
extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);

extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long src_start, unsigned long len,
- bool *mmap_changing, __u64 mode);
-extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
- unsigned long dst_start,
- unsigned long len,
- bool *mmap_changing);
+ struct iov_iter *iter, bool *mmap_changing,
+ __u64 mode);
+extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start,
+ struct iov_iter *iter, bool *mmap_changing);
extern int mwriteprotect_range(struct mm_struct *dst_mm,
unsigned long start, unsigned long len,
bool enable_wp, bool *mmap_changing);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 37f15c3c24dc..3aa779123dfe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4603,7 +4603,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
pte_t *dst_pte,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
- unsigned long src_addr,
+ struct iov_iter *iter,
struct page **pagep)
{
struct address_space *mapping;
@@ -4622,13 +4622,15 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
if (IS_ERR(page))
goto out;

- ret = copy_huge_page_from_user(page,
- (const void __user *) src_addr,
- pages_per_huge_page(h), false);
+ pagefault_disable();
+
+ ret = copy_huge_page_from_iter(page, 0, iter,
+ pages_per_huge_page(h), true);
+
+ pagefault_enable();

/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
- ret = -ENOENT;
*pagep = page;
/* don't free the page */
goto out;
diff --git a/mm/memory.c b/mm/memory.c
index c48f8df6e502..650f1ba19812 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5178,33 +5178,29 @@ void copy_user_huge_page(struct page *dst, struct page *src,
process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
}

-long copy_huge_page_from_user(struct page *dst_page,
- const void __user *usr_src,
- unsigned int pages_per_huge_page,
- bool allow_pagefault)
+long copy_huge_page_from_iter(struct page *dst_page,
+ size_t offset,
+ struct iov_iter *iter,
+ unsigned int pages_per_huge_page,
+ bool atomic)
{
- void *src = (void *)usr_src;
- void *page_kaddr;
unsigned long i, rc = 0;
- unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
+ unsigned long ret_val = pages_per_huge_page * PAGE_SIZE - offset;
+ unsigned long of_in_page = offset_in_page(offset);

- for (i = 0; i < pages_per_huge_page; i++) {
- if (allow_pagefault)
- page_kaddr = kmap(dst_page + i);
- else
- page_kaddr = kmap_atomic(dst_page + i);
- rc = copy_from_user(page_kaddr,
- (const void __user *)(src + i * PAGE_SIZE),
- PAGE_SIZE);
- if (allow_pagefault)
- kunmap(dst_page + i);
- else
- kunmap_atomic(page_kaddr);
+ for (i = offset / PAGE_SIZE; i < pages_per_huge_page; i++) {
+ size_t bytes = PAGE_SIZE - of_in_page;
+
+ rc = __copy_page_from_iter(dst_page + i, of_in_page,
+ bytes, iter, atomic);

- ret_val -= (PAGE_SIZE - rc);
- if (rc)
+ ret_val -= rc;
+
+ if (rc != bytes)
break;

+ of_in_page = 0;
+
cond_resched();
}
return ret_val;
diff --git a/mm/shmem.c b/mm/shmem.c
index 537c137698f8..77232cf8bd49 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2361,7 +2361,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
- unsigned long src_addr,
+ struct iov_iter *iter,
bool zeropage,
struct page **pagep)
{
@@ -2371,7 +2371,6 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
gfp_t gfp = mapping_gfp_mask(mapping);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
spinlock_t *ptl;
- void *page_kaddr;
struct page *page;
pte_t _dst_pte, *dst_pte;
int ret;
@@ -2387,18 +2386,14 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
goto out_unacct_blocks;

if (!zeropage) { /* mcopy_atomic */
- page_kaddr = kmap_atomic(page);
- ret = copy_from_user(page_kaddr,
- (const void __user *)src_addr,
- PAGE_SIZE);
- kunmap_atomic(page_kaddr);
+ ret = __copy_page_from_iter(page, 0, PAGE_SIZE, iter, true);

/* fallback to copy_from_user outside mmap_lock */
- if (unlikely(ret)) {
+ if (unlikely(ret != PAGE_SIZE)) {
*pagep = page;
shmem_inode_unacct_blocks(inode, 1);
/* don't free the page */
- return -ENOENT;
+ return PAGE_SIZE - ret;
}
} else { /* mfill_zeropage_atomic */
clear_highpage(page);
@@ -2484,11 +2479,11 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
- unsigned long src_addr,
+ struct iov_iter *iter,
struct page **pagep)
{
return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr, false, pagep);
+ dst_addr, iter, false, pagep);
}

int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
@@ -2499,7 +2494,7 @@ int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
struct page *page = NULL;

return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, 0, true, &page);
+ dst_addr, NULL, true, &page);
}

#ifdef CONFIG_TMPFS
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 9a3d451402d7..ee77fb229185 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -15,6 +15,7 @@
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
+#include <linux/uio.h>
#include <asm/tlbflush.h>
#include "internal.h"

@@ -48,17 +49,20 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
return dst_vma;
}

+/*
+ * mcopy_atomic_pte() - returns a negative error, or the number of bytes
+ * that were not copied (zero on success).
+ */
static int mcopy_atomic_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
- unsigned long src_addr,
+ struct iov_iter *iter,
struct page **pagep,
bool wp_copy)
{
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
- void *page_kaddr;
int ret;
struct page *page;
pgoff_t offset, max_off;
@@ -70,17 +74,13 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
if (!page)
goto out;

- page_kaddr = kmap_atomic(page);
- ret = copy_from_user(page_kaddr,
- (const void __user *) src_addr,
- PAGE_SIZE);
- kunmap_atomic(page_kaddr);
+ ret = __copy_page_from_iter(page, 0, PAGE_SIZE, iter, true);

/* fallback to copy_from_user outside mmap_lock */
- if (unlikely(ret)) {
- ret = -ENOENT;
+ if (unlikely(ret != PAGE_SIZE)) {
*pagep = page;
/* don't free the page */
+ ret = PAGE_SIZE - ret;
goto out;
}
} else {
@@ -205,7 +205,7 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
struct vm_area_struct *dst_vma,
unsigned long dst_start,
- unsigned long src_start,
+ struct iov_iter *iter,
unsigned long len,
bool zeropage)
{
@@ -213,7 +213,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
int vm_shared = dst_vma->vm_flags & VM_SHARED;
ssize_t err;
pte_t *dst_pte;
- unsigned long src_addr, dst_addr;
+ unsigned long dst_addr;
long copied;
struct page *page;
unsigned long vma_hpagesize;
@@ -232,7 +232,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
return -EINVAL;
}

- src_addr = src_start;
dst_addr = dst_start;
copied = 0;
page = NULL;
@@ -272,7 +271,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
goto out_unlock;
}

- while (src_addr < src_start + len) {
+ while (iov_iter_count(iter) > 0) {
pte_t dst_pteval;

BUG_ON(dst_addr >= dst_start + len);
@@ -306,7 +305,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
}

err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
- dst_addr, src_addr, &page);
+ dst_addr, iter, &page);

mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
@@ -314,14 +313,14 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,

cond_resched();

- if (unlikely(err == -ENOENT)) {
+ if (unlikely(err > 0)) {
+ size_t offset = vma_hpagesize - err;
+
mmap_read_unlock(dst_mm);
BUG_ON(!page);

- err = copy_huge_page_from_user(page,
- (const void __user *)src_addr,
- vma_hpagesize / PAGE_SIZE,
- true);
+ err = copy_huge_page_from_iter(page, offset, iter,
+ vma_hpagesize / PAGE_SIZE, false);
if (unlikely(err)) {
err = -EFAULT;
goto out;
@@ -330,12 +329,12 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,

dst_vma = NULL;
goto retry;
- } else
- BUG_ON(page);
+ } else {
+ BUG_ON(err != 0 && page);
+ }

if (!err) {
dst_addr += vma_hpagesize;
- src_addr += vma_hpagesize;
copied += vma_hpagesize;

if (fatal_signal_pending(current))
@@ -415,7 +414,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
- unsigned long src_addr,
+ struct iov_iter *iter,
struct page **page,
bool zeropage,
bool wp_copy)
@@ -435,7 +434,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
if (!(dst_vma->vm_flags & VM_SHARED)) {
if (!zeropage)
err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr, page,
+ dst_addr, iter, page,
wp_copy);
else
err = mfill_zeropage_pte(dst_mm, dst_pmd,
@@ -445,30 +444,34 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
if (!zeropage)
err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
dst_vma, dst_addr,
- src_addr, page);
+ iter, page);
else
err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
dst_vma, dst_addr);
}

+ if (zeropage && err >= 0)
+ iov_iter_advance(iter, PAGE_SIZE - err);
+
return err;
}

static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unsigned long dst_start,
- unsigned long src_start,
- unsigned long len,
+ struct iov_iter *iter,
bool zeropage,
bool *mmap_changing,
__u64 mode)
{
+ unsigned long len = iov_iter_count(iter);
struct vm_area_struct *dst_vma;
ssize_t err;
pmd_t *dst_pmd;
- unsigned long src_addr, dst_addr;
+ unsigned long dst_addr;
long copied;
struct page *page;
bool wp_copy;
+ unsigned long remaining = iov_iter_count(iter);

/*
* Sanitize the command parameters:
@@ -477,10 +480,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
BUG_ON(len & ~PAGE_MASK);

/* Does the address range wrap, or is the span zero-sized? */
- BUG_ON(src_start + len <= src_start);
BUG_ON(dst_start + len <= dst_start);

- src_addr = src_start;
dst_addr = dst_start;
copied = 0;
page = NULL;
@@ -527,7 +528,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
*/
if (is_vm_hugetlb_page(dst_vma))
return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
- src_start, len, zeropage);
+ iter, len, zeropage);

if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
@@ -542,10 +543,10 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unlikely(anon_vma_prepare(dst_vma)))
goto out_unlock;

- while (src_addr < src_start + len) {
+ while (remaining > 0) {
pmd_t dst_pmdval;

BUG_ON(dst_addr >= dst_start + len);

dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
if (unlikely(!dst_pmd)) {
@@ -577,31 +578,29 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
BUG_ON(pmd_trans_huge(*dst_pmd));

err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- src_addr, &page, zeropage, wp_copy);
+ iter, &page, zeropage, wp_copy);
cond_resched();

- if (unlikely(err == -ENOENT)) {
- void *page_kaddr;
+ if (unlikely(err > 0)) {
+ size_t bytes = err;
+ size_t offset = PAGE_SIZE - bytes;

mmap_read_unlock(dst_mm);
BUG_ON(!page);

- page_kaddr = kmap(page);
- err = copy_from_user(page_kaddr,
- (const void __user *) src_addr,
- PAGE_SIZE);
- kunmap(page);
- if (unlikely(err)) {
+ err = copy_page_from_iter(page, offset, bytes, iter);
+ if (unlikely(err != bytes)) {
err = -EFAULT;
goto out;
}
goto retry;
} else
- BUG_ON(page);
+ BUG_ON(err != 0 && page);
+
+ remaining = iov_iter_count(iter);

if (!err) {
dst_addr += PAGE_SIZE;
- src_addr += PAGE_SIZE;
copied += PAGE_SIZE;

if (fatal_signal_pending(current))
@@ -623,17 +622,16 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
}

ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long src_start, unsigned long len,
- bool *mmap_changing, __u64 mode)
+ struct iov_iter *iter, bool *mmap_changing, __u64 mode)
{
- return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
- mmap_changing, mode);
+ return __mcopy_atomic(dst_mm, dst_start, iter, false, mmap_changing,
+ mode);
}

ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, bool *mmap_changing)
+ struct iov_iter *iter, bool *mmap_changing)
{
- return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
+ return __mcopy_atomic(dst_mm, start, iter, true, mmap_changing, 0);
}

int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
--
2.25.1