Re: [PATCH v3 7/9] userfaultfd: add UFFDIO_CONTINUE ioctl

From: Lokesh Gidra
Date: Mon Feb 01 2021 - 17:42:56 EST


On Thu, Jan 28, 2021 at 2:48 PM Axel Rasmussen <axelrasmussen@xxxxxxxxxx> wrote:
>
> This ioctl is how userspace ought to resolve "minor" userfaults. The
> idea is, userspace is notified that a minor fault has occurred. It might
> change the contents of the page using its second non-UFFD mapping, or
> not. Then, it calls UFFDIO_CONTINUE to tell the kernel "I have ensured
> the page contents are correct, carry on setting up the mapping".
>
> Note that it doesn't make much sense to use UFFDIO_{COPY,ZEROPAGE} for
> MINOR registered VMAs. ZEROPAGE maps the VMA to the zero page; but in
> the minor fault case, we already have some pre-existing underlying page.
> Likewise, UFFDIO_COPY isn't useful if we have a second non-UFFD mapping.
> We'd just use memcpy() or similar instead.
>
> It turns out hugetlb_mcopy_atomic_pte() already does very close to what
> we want, if an existing page is provided via `struct page **pagep`. We
> already special-case the behavior a bit for the UFFDIO_ZEROPAGE case, so
> just extend that design: add an enum for the three modes of operation,
> and make the small adjustments needed for the MCOPY_ATOMIC_CONTINUE
> case. (Basically, look up the existing page, and avoid adding the
> existing page to the page cache or calling set_page_huge_active() on
> it.)
>
> Signed-off-by: Axel Rasmussen <axelrasmussen@xxxxxxxxxx>
> ---
> fs/userfaultfd.c | 67 +++++++++++++++++++++++++++++++
> include/linux/hugetlb.h | 3 ++
> include/linux/userfaultfd_k.h | 18 +++++++++
> include/uapi/linux/userfaultfd.h | 21 +++++++++-
> mm/hugetlb.c | 26 +++++++-----
> mm/userfaultfd.c | 69 +++++++++++++++++++++-----------
> 6 files changed, 170 insertions(+), 34 deletions(-)
>
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 968aca3e3ee9..80a3fca389b8 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1530,6 +1530,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
> ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
>
> + /* CONTINUE ioctl is only supported for MINOR ranges. */
> + if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
> + ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
> +
> /*
> * Now that we scanned all vmas we can already tell
> * userland which ioctls methods are guaranteed to
> @@ -1883,6 +1887,66 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
> return ret;
> }
>
> +static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
> +{
> + __s64 ret;
> + struct uffdio_continue uffdio_continue;
> + struct uffdio_continue __user *user_uffdio_continue;
> + struct userfaultfd_wake_range range;
> +
> + user_uffdio_continue = (struct uffdio_continue __user *)arg;
> +
> + ret = -EAGAIN;
> + if (READ_ONCE(ctx->mmap_changing))
> + goto out;
> +
> + ret = -EFAULT;
> + if (copy_from_user(&uffdio_continue, user_uffdio_continue,
> + /* don't copy the output fields */
> + sizeof(uffdio_continue) - (sizeof(__s64))))
> + goto out;
> +
> + ret = validate_range(ctx->mm, &uffdio_continue.range.start,
> + uffdio_continue.range.len);
> + if (ret)
> + goto out;
> +
> + ret = -EINVAL;
> + /* double check for wraparound just in case. */
> + if (uffdio_continue.range.start + uffdio_continue.range.len <=
> + uffdio_continue.range.start) {
> + goto out;
> + }
> + if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
> + goto out;
> +
> + if (mmget_not_zero(ctx->mm)) {
> + ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
> + uffdio_continue.range.len,
> + &ctx->mmap_changing);
> + mmput(ctx->mm);
> + } else {
> + return -ESRCH;
> + }
> +
> + if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
> + return -EFAULT;
> + if (ret < 0)
> + goto out;
> +
> + /* len == 0 would wake all */
> + BUG_ON(!ret);
> + range.len = ret;
> + if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
> + range.start = uffdio_continue.range.start;
> + wake_userfault(ctx, &range);
> + }
> + ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
> +
> +out:
> + return ret;
> +}
> +
> static inline unsigned int uffd_ctx_features(__u64 user_features)
> {
> /*
> @@ -1967,6 +2031,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
> case UFFDIO_WRITEPROTECT:
> ret = userfaultfd_writeprotect(ctx, arg);
> break;
> + case UFFDIO_CONTINUE:
> + ret = userfaultfd_continue(ctx, arg);
> + break;
> }
> return ret;
> }
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index f94a35296618..79e1f0155afa 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -135,11 +135,14 @@ void hugetlb_show_meminfo(void);
> unsigned long hugetlb_total_pages(void);
> vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> unsigned long address, unsigned int flags);
> +#ifdef CONFIG_USERFAULTFD
> int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
> struct vm_area_struct *dst_vma,
> unsigned long dst_addr,
> unsigned long src_addr,
> + enum mcopy_atomic_mode mode,
> struct page **pagep);
> +#endif
> int hugetlb_reserve_pages(struct inode *inode, long from, long to,
> struct vm_area_struct *vma,
> vm_flags_t vm_flags);
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index fb9abaeb4194..2fcb686211e8 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -37,6 +37,22 @@ extern int sysctl_unprivileged_userfaultfd;
>
> extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
>
> +/*
> + * The mode of operation for __mcopy_atomic and its helpers.
> + *
> + * This is almost an implementation detail (mcopy_atomic below doesn't take this
> + * as a parameter), but it's exposed here because memory-kind-specific
> + * implementations (e.g. hugetlbfs) need to know the mode of operation.
> + */
> +enum mcopy_atomic_mode {
> + /* A normal copy_from_user into the destination range. */
> + MCOPY_ATOMIC_NORMAL,
> + /* Don't copy; map the destination range to the zero page. */
> + MCOPY_ATOMIC_ZEROPAGE,
> + /* Just setup the dst_vma, without modifying the underlying page(s). */
> + MCOPY_ATOMIC_CONTINUE,
> +};
> +
> extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
> unsigned long src_start, unsigned long len,
> bool *mmap_changing, __u64 mode);
> @@ -44,6 +60,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
> unsigned long dst_start,
> unsigned long len,
> bool *mmap_changing);
> +extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
> + unsigned long len, bool *mmap_changing);
> extern int mwriteprotect_range(struct mm_struct *dst_mm,
> unsigned long start, unsigned long len,
> bool enable_wp, bool *mmap_changing);
> diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> index f24dd4fcbad9..bafbeb1a2624 100644
> --- a/include/uapi/linux/userfaultfd.h
> +++ b/include/uapi/linux/userfaultfd.h
> @@ -40,10 +40,12 @@
> ((__u64)1 << _UFFDIO_WAKE | \
> (__u64)1 << _UFFDIO_COPY | \
> (__u64)1 << _UFFDIO_ZEROPAGE | \
> - (__u64)1 << _UFFDIO_WRITEPROTECT)
> + (__u64)1 << _UFFDIO_WRITEPROTECT | \
> + (__u64)1 << _UFFDIO_CONTINUE)
> #define UFFD_API_RANGE_IOCTLS_BASIC \
> ((__u64)1 << _UFFDIO_WAKE | \
> - (__u64)1 << _UFFDIO_COPY)
> + (__u64)1 << _UFFDIO_COPY | \
> + (__u64)1 << _UFFDIO_CONTINUE)
>
> /*
> * Valid ioctl command number range with this API is from 0x00 to
> @@ -59,6 +61,7 @@
> #define _UFFDIO_COPY (0x03)
> #define _UFFDIO_ZEROPAGE (0x04)
> #define _UFFDIO_WRITEPROTECT (0x06)
> +#define _UFFDIO_CONTINUE (0x07)
> #define _UFFDIO_API (0x3F)
>
> /* userfaultfd ioctl ids */
> @@ -77,6 +80,8 @@
> struct uffdio_zeropage)
> #define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
> struct uffdio_writeprotect)
> +#define UFFDIO_CONTINUE _IOR(UFFDIO, _UFFDIO_CONTINUE, \
> + struct uffdio_continue)
>
> /* read() structure */
> struct uffd_msg {
> @@ -268,6 +273,18 @@ struct uffdio_writeprotect {
> __u64 mode;
> };
>
> +struct uffdio_continue {
> + struct uffdio_range range;
> +#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0)
> + __u64 mode;
> +
> + /*
> + * Fields below here are written by the ioctl and must be at the end:
> + * the copy_from_user will not read past here.
> + */
> + __s64 mapped;
> +};
> +
> /*
> * Flags for the userfaultfd(2) system call itself.
> */
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 6f9d8349f818..3d318ef3d180 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4647,6 +4647,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> return ret;
> }
>
> +#ifdef CONFIG_USERFAULTFD
> /*
> * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
> * modifications for huge pages.
> @@ -4656,6 +4657,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> struct vm_area_struct *dst_vma,
> unsigned long dst_addr,
> unsigned long src_addr,
> + enum mcopy_atomic_mode mode,
> struct page **pagep)
> {
> struct address_space *mapping;
> @@ -4668,7 +4670,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> int ret;
> struct page *page;
>
> - if (!*pagep) {
> + mapping = dst_vma->vm_file->f_mapping;
> + idx = vma_hugecache_offset(h, dst_vma, dst_addr);
> +
> + if (!*pagep && mode != MCOPY_ATOMIC_CONTINUE) {
> ret = -ENOMEM;
> page = alloc_huge_page(dst_vma, dst_addr, 0);
> if (IS_ERR(page))
> @@ -4685,6 +4690,12 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> /* don't free the page */
> goto out;
> }
> + } else if (mode == MCOPY_ATOMIC_CONTINUE) {
> + ret = -EFAULT;
> + page = find_lock_page(mapping, idx);
> + *pagep = NULL;
> + if (!page)
> + goto out;
> } else {
> page = *pagep;
> *pagep = NULL;
> @@ -4697,13 +4708,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> */
> __SetPageUptodate(page);
>
> - mapping = dst_vma->vm_file->f_mapping;
> - idx = vma_hugecache_offset(h, dst_vma, dst_addr);
> -
> - /*
> - * If shared, add to page cache
> - */
> - if (vm_shared) {
> + /* Add shared, newly allocated pages to the page cache. */
> + if (vm_shared && mode != MCOPY_ATOMIC_CONTINUE) {
> size = i_size_read(mapping->host) >> huge_page_shift(h);
> ret = -EFAULT;
> if (idx >= size)
> @@ -4763,7 +4769,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> update_mmu_cache(dst_vma, dst_addr, dst_pte);
>
> spin_unlock(ptl);
> - set_page_huge_active(page);
> + if (mode != MCOPY_ATOMIC_CONTINUE)
> + set_page_huge_active(page);
> if (vm_shared)
> unlock_page(page);
> ret = 0;
> @@ -4777,6 +4784,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> put_page(page);
> goto out;
> }
> +#endif
>
> long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> struct page **pages, struct vm_area_struct **vmas,
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index b2ce61c1b50d..a762b9cefaea 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -207,7 +207,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
> unsigned long dst_start,
> unsigned long src_start,
> unsigned long len,
> - bool zeropage)
> + enum mcopy_atomic_mode mode)
> {
> int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
> int vm_shared = dst_vma->vm_flags & VM_SHARED;
> @@ -227,7 +227,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
> * by THP. Since we can not reliably insert a zero page, this
> * feature is not supported.
> */
> - if (zeropage) {
> + if (mode == MCOPY_ATOMIC_ZEROPAGE) {
> mmap_read_unlock(dst_mm);
> return -EINVAL;
> }
> @@ -273,8 +273,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
> }
>
> while (src_addr < src_start + len) {
> - pte_t dst_pteval;
> -
> BUG_ON(dst_addr >= dst_start + len);
>
> /*
> @@ -297,16 +295,17 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
> goto out_unlock;
> }
>
> - err = -EEXIST;
> - dst_pteval = huge_ptep_get(dst_pte);
> - if (!huge_pte_none(dst_pteval)) {
> - mutex_unlock(&hugetlb_fault_mutex_table[hash]);
> - i_mmap_unlock_read(mapping);
> - goto out_unlock;
> + if (mode != MCOPY_ATOMIC_CONTINUE) {
> + if (!huge_pte_none(huge_ptep_get(dst_pte))) {
> + err = -EEXIST;
> + mutex_unlock(&hugetlb_fault_mutex_table[hash]);
> + i_mmap_unlock_read(mapping);
> + goto out_unlock;
> + }
> }
>
> err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
> - dst_addr, src_addr, &page);
> + dst_addr, src_addr, mode, &page);
>
> mutex_unlock(&hugetlb_fault_mutex_table[hash]);
> i_mmap_unlock_read(mapping);
> @@ -408,7 +407,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
> unsigned long dst_start,
> unsigned long src_start,
> unsigned long len,
> - bool zeropage);
> + enum mcopy_atomic_mode mode);
> #endif /* CONFIG_HUGETLB_PAGE */
>
> static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
> @@ -417,7 +416,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
> unsigned long dst_addr,
> unsigned long src_addr,
> struct page **page,
> - bool zeropage,
> + enum mcopy_atomic_mode mode,
> bool wp_copy)
> {
> ssize_t err;
> @@ -433,22 +432,38 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
> * and not in the radix tree.
> */
> if (!(dst_vma->vm_flags & VM_SHARED)) {
> - if (!zeropage)
> + switch (mode) {
> + case MCOPY_ATOMIC_NORMAL:
> err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
> dst_addr, src_addr, page,
> wp_copy);
> - else
> + break;
> + case MCOPY_ATOMIC_ZEROPAGE:
> err = mfill_zeropage_pte(dst_mm, dst_pmd,
> dst_vma, dst_addr);
> + break;
> + /* It only makes sense to CONTINUE for shared memory. */
> + case MCOPY_ATOMIC_CONTINUE:
> + err = -EINVAL;
> + break;
> + }
> } else {
> VM_WARN_ON_ONCE(wp_copy);
> - if (!zeropage)
> + switch (mode) {
> + case MCOPY_ATOMIC_NORMAL:
> err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
> dst_vma, dst_addr,
> src_addr, page);
> - else
> + break;
> + case MCOPY_ATOMIC_ZEROPAGE:
> err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
> dst_vma, dst_addr);
> + break;
> + case MCOPY_ATOMIC_CONTINUE:
> + /* FIXME: Add minor fault interception for shmem. */

I hope you plan to implement the non-hugepage case soon. Looking
forward to using the feature. :)
> + err = -EINVAL;
> + break;
> + }
> }
>
> return err;
> @@ -458,7 +473,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
> unsigned long dst_start,
> unsigned long src_start,
> unsigned long len,
> - bool zeropage,
> + enum mcopy_atomic_mode mcopy_mode,
> bool *mmap_changing,
> __u64 mode)
> {
> @@ -527,7 +542,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
> */
> if (is_vm_hugetlb_page(dst_vma))
> return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
> - src_start, len, zeropage);
> + src_start, len, mcopy_mode);
>
> if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> goto out_unlock;
> @@ -577,7 +592,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
> BUG_ON(pmd_trans_huge(*dst_pmd));
>
> err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
> - src_addr, &page, zeropage, wp_copy);
> + src_addr, &page, mcopy_mode, wp_copy);
> cond_resched();
>
> if (unlikely(err == -ENOENT)) {
> @@ -626,14 +641,22 @@ ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
> unsigned long src_start, unsigned long len,
> bool *mmap_changing, __u64 mode)
> {
> - return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
> - mmap_changing, mode);
> + return __mcopy_atomic(dst_mm, dst_start, src_start, len,
> + MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
> }
>
> ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
> unsigned long len, bool *mmap_changing)
> {
> - return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
> + return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
> + mmap_changing, 0);
> +}
> +
> +ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
> + unsigned long len, bool *mmap_changing)
> +{
> + return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
> + mmap_changing, 0);
> }
>
> int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
> --
> 2.30.0.365.g02bc693789-goog
>