Re: [PATCH 1/2] userfaultfd: handle zeropage moves by UFFDIO_MOVE

From: Suren Baghdasaryan
Date: Thu Jan 25 2024 - 20:26:04 EST


On Wed, Jan 24, 2024 at 4:13 PM Suren Baghdasaryan <surenb@xxxxxxxxxx> wrote:
>
> Current implementation of UFFDIO_MOVE fails to move zeropages and returns
> EBUSY when it encounters one. We can handle them by mapping a zeropage
> at the destination and clearing the mapping at the source. This is done
> both for ordinary and for huge zeropages.

I made a stupid mistake when formatting this patch and it says [PATCH
1/2] but it should be the only patch in the set. So, please do not
look for [2/2]. Sorry about the confusion.
Thanks,
Suren.

>
> Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>
> ---
> Applies cleanly over mm-unstable branch.
>
> mm/huge_memory.c | 105 +++++++++++++++++++++++++++--------------------
> mm/userfaultfd.c | 42 +++++++++++++++----
> 2 files changed, 96 insertions(+), 51 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index f40feb31b507..5dcc02c25e97 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2190,13 +2190,18 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
> }
>
> src_page = pmd_page(src_pmdval);
> - if (unlikely(!PageAnonExclusive(src_page))) {
> - spin_unlock(src_ptl);
> - return -EBUSY;
> - }
>
> - src_folio = page_folio(src_page);
> - folio_get(src_folio);
> + if (!is_huge_zero_pmd(src_pmdval)) {
> + if (unlikely(!PageAnonExclusive(src_page))) {
> + spin_unlock(src_ptl);
> + return -EBUSY;
> + }
> +
> + src_folio = page_folio(src_page);
> + folio_get(src_folio);
> + } else
> + src_folio = NULL;
> +
> spin_unlock(src_ptl);
>
> flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
> @@ -2204,19 +2209,22 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
> src_addr + HPAGE_PMD_SIZE);
> mmu_notifier_invalidate_range_start(&range);
>
> - folio_lock(src_folio);
> + if (src_folio) {
> + folio_lock(src_folio);
>
> - /*
> - * split_huge_page walks the anon_vma chain without the page
> - * lock. Serialize against it with the anon_vma lock, the page
> - * lock is not enough.
> - */
> - src_anon_vma = folio_get_anon_vma(src_folio);
> - if (!src_anon_vma) {
> - err = -EAGAIN;
> - goto unlock_folio;
> - }
> - anon_vma_lock_write(src_anon_vma);
> + /*
> + * split_huge_page walks the anon_vma chain without the page
> + * lock. Serialize against it with the anon_vma lock, the page
> + * lock is not enough.
> + */
> + src_anon_vma = folio_get_anon_vma(src_folio);
> + if (!src_anon_vma) {
> + err = -EAGAIN;
> + goto unlock_folio;
> + }
> + anon_vma_lock_write(src_anon_vma);
> + } else
> + src_anon_vma = NULL;
>
> dst_ptl = pmd_lockptr(mm, dst_pmd);
> double_pt_lock(src_ptl, dst_ptl);
> @@ -2225,45 +2233,54 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
> err = -EAGAIN;
> goto unlock_ptls;
> }
> - if (folio_maybe_dma_pinned(src_folio) ||
> - !PageAnonExclusive(&src_folio->page)) {
> - err = -EBUSY;
> - goto unlock_ptls;
> - }
> + if (src_folio) {
> + if (folio_maybe_dma_pinned(src_folio) ||
> + !PageAnonExclusive(&src_folio->page)) {
> + err = -EBUSY;
> + goto unlock_ptls;
> + }
>
> - if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
> - WARN_ON_ONCE(!folio_test_anon(src_folio))) {
> - err = -EBUSY;
> - goto unlock_ptls;
> - }
> + if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
> + WARN_ON_ONCE(!folio_test_anon(src_folio))) {
> + err = -EBUSY;
> + goto unlock_ptls;
> + }
>
> - folio_move_anon_rmap(src_folio, dst_vma);
> - WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
> + folio_move_anon_rmap(src_folio, dst_vma);
> + WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
>
> - src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
> - /* Folio got pinned from under us. Put it back and fail the move. */
> - if (folio_maybe_dma_pinned(src_folio)) {
> - set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
> - err = -EBUSY;
> - goto unlock_ptls;
> - }
> + src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
> + /* Folio got pinned from under us. Put it back and fail the move. */
> + if (folio_maybe_dma_pinned(src_folio)) {
> + set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
> + err = -EBUSY;
> + goto unlock_ptls;
> + }
>
> - _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
> - /* Follow mremap() behavior and treat the entry dirty after the move */
> - _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
> + _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
> + /* Follow mremap() behavior and treat the entry dirty after the move */
> + _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
> + } else {
> + src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
> + _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
> + }
> set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
>
> src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
> pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
> unlock_ptls:
> double_pt_unlock(src_ptl, dst_ptl);
> - anon_vma_unlock_write(src_anon_vma);
> - put_anon_vma(src_anon_vma);
> + if (src_anon_vma) {
> + anon_vma_unlock_write(src_anon_vma);
> + put_anon_vma(src_anon_vma);
> + }
> unlock_folio:
> /* unblock rmap walks */
> - folio_unlock(src_folio);
> + if (src_folio)
> + folio_unlock(src_folio);
> mmu_notifier_invalidate_range_end(&range);
> - folio_put(src_folio);
> + if (src_folio)
> + folio_put(src_folio);
> return err;
> }
> #endif /* CONFIG_USERFAULTFD */
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 3548b3e31a97..5fbf4da15c5c 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -959,6 +959,31 @@ static int move_swap_pte(struct mm_struct *mm,
> return 0;
> }
>
> +static int move_zeropage_pte(struct mm_struct *mm,
> + struct vm_area_struct *dst_vma,
> + struct vm_area_struct *src_vma,
> + unsigned long dst_addr, unsigned long src_addr,
> + pte_t *dst_pte, pte_t *src_pte,
> + pte_t orig_dst_pte, pte_t orig_src_pte,
> + spinlock_t *dst_ptl, spinlock_t *src_ptl)
> +{
> + pte_t zero_pte;
> +
> + double_pt_lock(dst_ptl, src_ptl);
> + if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
> + !pte_same(ptep_get(dst_pte), orig_dst_pte))
> + return -EAGAIN;
> +
> + zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
> + dst_vma->vm_page_prot));
> + ptep_clear_flush(src_vma, src_addr, src_pte);
> + set_pte_at(mm, dst_addr, dst_pte, zero_pte);
> + double_pt_unlock(dst_ptl, src_ptl);
> +
> + return 0;
> +}
> +
> +
> /*
> * The mmap_lock for reading is held by the caller. Just move the page
> * from src_pmd to dst_pmd if possible, and return true if succeeded
> @@ -1041,6 +1066,14 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
> }
>
> if (pte_present(orig_src_pte)) {
> + if (is_zero_pfn(pte_pfn(orig_src_pte))) {
> + err = move_zeropage_pte(mm, dst_vma, src_vma,
> + dst_addr, src_addr, dst_pte, src_pte,
> + orig_dst_pte, orig_src_pte,
> + dst_ptl, src_ptl);
> + goto out;
> + }
> +
> /*
> * Pin and lock both source folio and anon_vma. Since we are in
> * RCU read section, we can't block, so on contention have to
> @@ -1404,19 +1437,14 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
> err = -ENOENT;
> break;
> }
> - /* Avoid moving zeropages for now */
> - if (is_huge_zero_pmd(*src_pmd)) {
> - spin_unlock(ptl);
> - err = -EBUSY;
> - break;
> - }
>
> /* Check if we can move the pmd without splitting it. */
> if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
> !pmd_none(dst_pmdval)) {
> struct folio *folio = pfn_folio(pmd_pfn(*src_pmd));
>
> - if (!folio || !PageAnonExclusive(&folio->page)) {
> + if (!folio || (!is_huge_zero_page(&folio->page) &&
> + !PageAnonExclusive(&folio->page))) {
> spin_unlock(ptl);
> err = -EBUSY;
> break;
> --
> 2.43.0.429.g432eaa2c6b-goog
>