Re: [PATCH v2 1/6] mm: userfaultfd: add new UFFDIO_POISON ioctl

From: Axel Rasmussen
Date: Wed Jul 05 2023 - 14:24:54 EST


On Wed, Jul 5, 2023 at 11:18 AM Jiaqi Yan <jiaqiyan@xxxxxxxxxx> wrote:
>
> On Thu, Jun 29, 2023 at 1:50 PM Axel Rasmussen <axelrasmussen@xxxxxxxxxx> wrote:
> >
> > The basic idea here is to "simulate" memory poisoning for VMs. A VM
> > running on some host might encounter a memory error, after which some
> > page(s) are poisoned (i.e., future accesses SIGBUS). They expect that
> > once poisoned, pages can never become "un-poisoned". So, when we live
> > migrate the VM, we need to preserve the poisoned status of these pages.
> >
> > When live migrating, we try to get the guest running on its new host as
> > quickly as possible. So, we start it running before all memory has been
> > copied, and before we're certain which pages should be poisoned or not.
> >
> > So the basic way to use this new feature is:
> >
> > - On the new host, the guest's memory is registered with userfaultfd, in
> > either MISSING or MINOR mode (doesn't really matter for this purpose).
> > - On any first access, we get a userfaultfd event. At this point we can
> > communicate with the old host to find out if the page was poisoned.
> > - If so, we can respond with a UFFDIO_POISON - this places a swap marker
> > so any future accesses will SIGBUS. Because the pte is now "present",
> > future accesses won't generate more userfaultfd events, they'll just
> > SIGBUS directly.
> >
> > UFFDIO_POISON does not handle unmapping previously-present PTEs. This
>
> A minor suggestion, would UFFDIO_HWPOISON be better? so that readers
> won't be confused with CONFIG_PAGE_POISONING (a feature to fill the
> pages with poison patterns after free).

I was a bit wary of using "HWPOISON" because this is more like
"simulated" poisoning, not "real" hardware poisoning. :/

But, maybe folks feel that distinction is less important than the one
you've raised here Jiaqi. I don't have the strongest opinion.

>
> > isn't needed, because during live migration we want to intercept
> > all accesses with userfaultfd (not just writes, so WP mode isn't useful
> > for this). So whether minor or missing mode is being used (or both), the
> > PTE won't be present in any case, so handling that case isn't needed.
> >
> > Why return VM_FAULT_HWPOISON instead of VM_FAULT_SIGBUS when one of
> > these markers is encountered? For "normal" userspace programs there
> > isn't a big difference, both yield a SIGBUS. The difference for KVM is
> > key though: VM_FAULT_HWPOISON will result in an MCE being injected into
> > the guest (which is the behavior we want). With VM_FAULT_SIGBUS, the
> > hypervisor would need to catch the SIGBUS and deal with the MCE
> > injection itself.
> >
> > Signed-off-by: Axel Rasmussen <axelrasmussen@xxxxxxxxxx>
> > ---
> > fs/userfaultfd.c | 63 ++++++++++++++++++++++++++++++++
> > include/linux/swapops.h | 3 +-
> > include/linux/userfaultfd_k.h | 4 ++
> > include/uapi/linux/userfaultfd.h | 25 +++++++++++--
> > mm/memory.c | 4 ++
> > mm/userfaultfd.c | 62 ++++++++++++++++++++++++++++++-
> > 6 files changed, 156 insertions(+), 5 deletions(-)
> >
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index 7cecd49e078b..c26a883399c9 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -1965,6 +1965,66 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
> > return ret;
> > }
> >
> > +static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
> > +{
> > + __s64 ret;
> > + struct uffdio_poison uffdio_poison;
> > + struct uffdio_poison __user *user_uffdio_poison;
> > + struct userfaultfd_wake_range range;
> > +
> > + user_uffdio_poison = (struct uffdio_poison __user *)arg;
> > +
> > + ret = -EAGAIN;
> > + if (atomic_read(&ctx->mmap_changing))
> > + goto out;
> > +
> > + ret = -EFAULT;
> > + if (copy_from_user(&uffdio_poison, user_uffdio_poison,
> > + /* don't copy the output fields */
> > + sizeof(uffdio_poison) - (sizeof(__s64))))
> > + goto out;
> > +
> > + ret = validate_range(ctx->mm, uffdio_poison.range.start,
> > + uffdio_poison.range.len);
> > + if (ret)
> > + goto out;
> > +
> > + ret = -EINVAL;
> > + /* double check for wraparound just in case. */
> > + if (uffdio_poison.range.start + uffdio_poison.range.len <=
> > + uffdio_poison.range.start) {
> > + goto out;
> > + }
> > + if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
> > + goto out;
> > +
> > + if (mmget_not_zero(ctx->mm)) {
> > + ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
> > + uffdio_poison.range.len,
> > + &ctx->mmap_changing, 0);
> > + mmput(ctx->mm);
> > + } else {
> > + return -ESRCH;
> > + }
> > +
> > + if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
> > + return -EFAULT;
> > + if (ret < 0)
> > + goto out;
> > +
> > + /* len == 0 would wake all */
> > + BUG_ON(!ret);
> > + range.len = ret;
> > + if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
> > + range.start = uffdio_poison.range.start;
> > + wake_userfault(ctx, &range);
> > + }
> > + ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;
> > +
> > +out:
> > + return ret;
> > +}
> > +
> > static inline unsigned int uffd_ctx_features(__u64 user_features)
> > {
> > /*
> > @@ -2066,6 +2126,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
> > case UFFDIO_CONTINUE:
> > ret = userfaultfd_continue(ctx, arg);
> > break;
> > + case UFFDIO_POISON:
> > + ret = userfaultfd_poison(ctx, arg);
> > + break;
> > }
> > return ret;
> > }
> > diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> > index 4c932cb45e0b..8259fee32421 100644
> > --- a/include/linux/swapops.h
> > +++ b/include/linux/swapops.h
> > @@ -394,7 +394,8 @@ typedef unsigned long pte_marker;
> >
> > #define PTE_MARKER_UFFD_WP BIT(0)
> > #define PTE_MARKER_SWAPIN_ERROR BIT(1)
> > -#define PTE_MARKER_MASK (BIT(2) - 1)
> > +#define PTE_MARKER_UFFD_POISON BIT(2)
> > +#define PTE_MARKER_MASK (BIT(3) - 1)
> >
> > static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
> > {
> > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> > index ac7b0c96d351..ac8c6854097c 100644
> > --- a/include/linux/userfaultfd_k.h
> > +++ b/include/linux/userfaultfd_k.h
> > @@ -46,6 +46,7 @@ enum mfill_atomic_mode {
> > MFILL_ATOMIC_COPY,
> > MFILL_ATOMIC_ZEROPAGE,
> > MFILL_ATOMIC_CONTINUE,
> > + MFILL_ATOMIC_POISON,
> > NR_MFILL_ATOMIC_MODES,
> > };
> >
> > @@ -83,6 +84,9 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
> > extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
> > unsigned long len, atomic_t *mmap_changing,
> > uffd_flags_t flags);
> > +extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
> > + unsigned long len, atomic_t *mmap_changing,
> > + uffd_flags_t flags);
> > extern int mwriteprotect_range(struct mm_struct *dst_mm,
> > unsigned long start, unsigned long len,
> > bool enable_wp, atomic_t *mmap_changing);
> > diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> > index 66dd4cd277bd..62151706c5a3 100644
> > --- a/include/uapi/linux/userfaultfd.h
> > +++ b/include/uapi/linux/userfaultfd.h
> > @@ -39,7 +39,8 @@
> > UFFD_FEATURE_MINOR_SHMEM | \
> > UFFD_FEATURE_EXACT_ADDRESS | \
> > UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \
> > - UFFD_FEATURE_WP_UNPOPULATED)
> > + UFFD_FEATURE_WP_UNPOPULATED | \
> > + UFFD_FEATURE_POISON)
> > #define UFFD_API_IOCTLS \
> > ((__u64)1 << _UFFDIO_REGISTER | \
> > (__u64)1 << _UFFDIO_UNREGISTER | \
> > @@ -49,12 +50,14 @@
> > (__u64)1 << _UFFDIO_COPY | \
> > (__u64)1 << _UFFDIO_ZEROPAGE | \
> > (__u64)1 << _UFFDIO_WRITEPROTECT | \
> > - (__u64)1 << _UFFDIO_CONTINUE)
> > + (__u64)1 << _UFFDIO_CONTINUE | \
> > + (__u64)1 << _UFFDIO_POISON)
> > #define UFFD_API_RANGE_IOCTLS_BASIC \
> > ((__u64)1 << _UFFDIO_WAKE | \
> > (__u64)1 << _UFFDIO_COPY | \
> > + (__u64)1 << _UFFDIO_WRITEPROTECT | \
> > (__u64)1 << _UFFDIO_CONTINUE | \
> > - (__u64)1 << _UFFDIO_WRITEPROTECT)
> > + (__u64)1 << _UFFDIO_POISON)
> >
> > /*
> > * Valid ioctl command number range with this API is from 0x00 to
> > @@ -71,6 +74,7 @@
> > #define _UFFDIO_ZEROPAGE (0x04)
> > #define _UFFDIO_WRITEPROTECT (0x06)
> > #define _UFFDIO_CONTINUE (0x07)
> > +#define _UFFDIO_POISON (0x08)
> > #define _UFFDIO_API (0x3F)
> >
> > /* userfaultfd ioctl ids */
> > @@ -91,6 +95,8 @@
> > struct uffdio_writeprotect)
> > #define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \
> > struct uffdio_continue)
> > +#define UFFDIO_POISON _IOWR(UFFDIO, _UFFDIO_POISON, \
> > + struct uffdio_poison)
> >
> > /* read() structure */
> > struct uffd_msg {
> > @@ -225,6 +231,7 @@ struct uffdio_api {
> > #define UFFD_FEATURE_EXACT_ADDRESS (1<<11)
> > #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12)
> > #define UFFD_FEATURE_WP_UNPOPULATED (1<<13)
> > +#define UFFD_FEATURE_POISON (1<<14)
> > __u64 features;
> >
> > __u64 ioctls;
> > @@ -321,6 +328,18 @@ struct uffdio_continue {
> > __s64 mapped;
> > };
> >
> > +struct uffdio_poison {
> > + struct uffdio_range range;
> > +#define UFFDIO_POISON_MODE_DONTWAKE ((__u64)1<<0)
> > + __u64 mode;
> > +
> > + /*
> > + * Fields below here are written by the ioctl and must be at the end:
> > + * the copy_from_user will not read past here.
> > + */
> > + __s64 updated;
> > +};
> > +
> > /*
> > * Flags for the userfaultfd(2) system call itself.
> > */
> > diff --git a/mm/memory.c b/mm/memory.c
> > index d8a9a770b1f1..7fbda39e060d 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -3692,6 +3692,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
> > if (WARN_ON_ONCE(!marker))
> > return VM_FAULT_SIGBUS;
> >
> > + /* Poison emulation explicitly requested for this PTE. */
> > + if (marker & PTE_MARKER_UFFD_POISON)
> > + return VM_FAULT_HWPOISON;
> > +
> > /* Higher priority than uffd-wp when data corrupted */
> > if (marker & PTE_MARKER_SWAPIN_ERROR)
> > return VM_FAULT_SIGBUS;
> > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > index a2bf37ee276d..87b62ca1e09e 100644
> > --- a/mm/userfaultfd.c
> > +++ b/mm/userfaultfd.c
> > @@ -286,6 +286,51 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
> > goto out;
> > }
> >
> > +/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
> > +static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
> > + struct vm_area_struct *dst_vma,
> > + unsigned long dst_addr,
> > + uffd_flags_t flags)
> > +{
> > + int ret;
> > + struct mm_struct *dst_mm = dst_vma->vm_mm;
> > + pte_t _dst_pte, *dst_pte;
> > + spinlock_t *ptl;
> > +
> > + _dst_pte = make_pte_marker(PTE_MARKER_UFFD_POISON);
> > + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
> > +
> > + if (vma_is_shmem(dst_vma)) {
> > + struct inode *inode;
> > + pgoff_t offset, max_off;
> > +
> > + /* serialize against truncate with the page table lock */
> > + inode = dst_vma->vm_file->f_inode;
> > + offset = linear_page_index(dst_vma, dst_addr);
> > + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
> > + ret = -EFAULT;
> > + if (unlikely(offset >= max_off))
> > + goto out_unlock;
> > + }
> > +
> > + ret = -EEXIST;
> > + /*
> > + * For now, we don't handle unmapping pages, so only support filling in
> > + * none PTEs, or replacing PTE markers.
> > + */
> > + if (!pte_none_mostly(*dst_pte))
> > + goto out_unlock;
> > +
> > + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
> > +
> > + /* No need to invalidate - it was non-present before */
> > + update_mmu_cache(dst_vma, dst_addr, dst_pte);
> > + ret = 0;
> > +out_unlock:
> > + pte_unmap_unlock(dst_pte, ptl);
> > + return ret;
> > +}
> > +
> > static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> > {
> > pgd_t *pgd;
> > @@ -336,8 +381,12 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > * supported by hugetlb. A PMD_SIZE huge pages may exist as used
> > * by THP. Since we can not reliably insert a zero page, this
> > * feature is not supported.
> > + *
> > + * PTE marker handling for hugetlb is a bit special, so for now
> > + * UFFDIO_POISON is not supported.
> > */
> > - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> > + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE) ||
> > + uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
> > mmap_read_unlock(dst_mm);
> > return -EINVAL;
> > }
> > @@ -481,6 +530,9 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
> > if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
> > return mfill_atomic_pte_continue(dst_pmd, dst_vma,
> > dst_addr, flags);
> > + } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
> > + return mfill_atomic_pte_poison(dst_pmd, dst_vma,
> > + dst_addr, flags);
> > }
> >
> > /*
> > @@ -702,6 +754,14 @@ ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
> > uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
> > }
> >
> > +ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
> > + unsigned long len, atomic_t *mmap_changing,
> > + uffd_flags_t flags)
> > +{
> > + return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
> > + uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
> > +}
> > +
> > long uffd_wp_range(struct vm_area_struct *dst_vma,
> > unsigned long start, unsigned long len, bool enable_wp)
> > {
> > --
> > 2.41.0.255.g8b1d071c50-goog
> >