Re: [PATCH v2 2/3] userfaultfd: UFFDIO_REMAP uABI

From: Jann Horn
Date: Wed Sep 27 2023 - 08:47:58 EST


On Sat, Sep 23, 2023 at 3:31 AM Suren Baghdasaryan <surenb@xxxxxxxxxx> wrote:
> From: Andrea Arcangeli <aarcange@xxxxxxxxxx>
>
> This implements the uABI of UFFDIO_REMAP.
>
> Notably one mode bitflag is also forwarded (and in turn known) by the
> lowlevel remap_pages method.
>
> Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>
[...]
> +int remap_pages_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> + pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
> + struct vm_area_struct *dst_vma,
> + struct vm_area_struct *src_vma,
> + unsigned long dst_addr, unsigned long src_addr)
> +{
> + pmd_t _dst_pmd, src_pmdval;
> + struct page *src_page;
> + struct folio *src_folio;
> + struct anon_vma *src_anon_vma, *dst_anon_vma;
> + spinlock_t *src_ptl, *dst_ptl;
> + pgtable_t src_pgtable, dst_pgtable;
> + struct mmu_notifier_range range;
> + int err = 0;
> +
> + src_pmdval = *src_pmd;
> + src_ptl = pmd_lockptr(src_mm, src_pmd);
> +
> + BUG_ON(!spin_is_locked(src_ptl));
> + mmap_assert_locked(src_mm);
> + mmap_assert_locked(dst_mm);
> +
> + BUG_ON(!pmd_trans_huge(src_pmdval));
> + BUG_ON(!pmd_none(dst_pmdval));
> + BUG_ON(src_addr & ~HPAGE_PMD_MASK);
> + BUG_ON(dst_addr & ~HPAGE_PMD_MASK);
> +
> + src_page = pmd_page(src_pmdval);
> + if (unlikely(!PageAnonExclusive(src_page))) {
> + spin_unlock(src_ptl);
> + return -EBUSY;
> + }
> +
> + src_folio = page_folio(src_page);
> + folio_get(src_folio);
> + spin_unlock(src_ptl);
> +
> + /* preallocate dst_pgtable if needed */
> + if (dst_mm != src_mm) {
> + dst_pgtable = pte_alloc_one(dst_mm);
> + if (unlikely(!dst_pgtable)) {
> + err = -ENOMEM;
> + goto put_folio;
> + }
> + } else {
> + dst_pgtable = NULL;
> + }
> +
> + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_mm, src_addr,
> + src_addr + HPAGE_PMD_SIZE);
> + mmu_notifier_invalidate_range_start(&range);
> +
> + /* block all concurrent rmap walks */
> + folio_lock(src_folio);
> +
> + /*
> + * split_huge_page walks the anon_vma chain without the page
> + * lock. Serialize against it with the anon_vma lock, the page
> + * lock is not enough.
> + */
> + src_anon_vma = folio_get_anon_vma(src_folio);
> + if (!src_anon_vma) {
> + err = -EAGAIN;
> + goto unlock_folio;
> + }
> + anon_vma_lock_write(src_anon_vma);
> +
> + dst_ptl = pmd_lockptr(dst_mm, dst_pmd);
> + double_pt_lock(src_ptl, dst_ptl);
> + if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
> + !pmd_same(*dst_pmd, dst_pmdval) ||
> + folio_mapcount(src_folio) != 1)) {

I think this is also supposed to be PageAnonExclusive()?
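
Something like this, I think (untested sketch; keeps the pmd_same
checks and swaps the mapcount check for an AnonExclusive one, reusing
the src_page looked up earlier in the function):

	if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
		     !pmd_same(*dst_pmd, dst_pmdval) ||
		     !PageAnonExclusive(src_page))) {
		double_pt_unlock(src_ptl, dst_ptl);
		err = -EAGAIN;
		goto put_anon_vma;
	}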

> + double_pt_unlock(src_ptl, dst_ptl);
> + err = -EAGAIN;
> + goto put_anon_vma;
> + }
> +
> + BUG_ON(!folio_test_head(src_folio));
> + BUG_ON(!folio_test_anon(src_folio));
> +
> + dst_anon_vma = (void *)dst_vma->anon_vma + PAGE_MAPPING_ANON;
> + WRITE_ONCE(src_folio->mapping, (struct address_space *) dst_anon_vma);
> + WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
> +
> + src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
> + _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
> + _dst_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
> + set_pmd_at(dst_mm, dst_addr, dst_pmd, _dst_pmd);
> +
> + src_pgtable = pgtable_trans_huge_withdraw(src_mm, src_pmd);
> + if (dst_pgtable) {
> + pgtable_trans_huge_deposit(dst_mm, dst_pmd, dst_pgtable);
> + pte_free(src_mm, src_pgtable);
> + dst_pgtable = NULL;
> +
> + mm_inc_nr_ptes(dst_mm);
> + mm_dec_nr_ptes(src_mm);
> + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> + add_mm_counter(src_mm, MM_ANONPAGES, -HPAGE_PMD_NR);
> + } else {
> + pgtable_trans_huge_deposit(dst_mm, dst_pmd, src_pgtable);
> + }
> + double_pt_unlock(src_ptl, dst_ptl);
> +
> +put_anon_vma:
> + anon_vma_unlock_write(src_anon_vma);
> + put_anon_vma(src_anon_vma);
> +unlock_folio:
> + /* unblock rmap walks */
> + folio_unlock(src_folio);
> + mmu_notifier_invalidate_range_end(&range);
> + if (dst_pgtable)
> + pte_free(dst_mm, dst_pgtable);
> +put_folio:
> + folio_put(src_folio);
> +
> + return err;
> +}
> +#endif /* CONFIG_USERFAULTFD */
[...]
> +static int remap_anon_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> + struct vm_area_struct *dst_vma,
> + struct vm_area_struct *src_vma,
> + unsigned long dst_addr, unsigned long src_addr,
> + pte_t *dst_pte, pte_t *src_pte,
> + pte_t orig_dst_pte, pte_t orig_src_pte,
> + spinlock_t *dst_ptl, spinlock_t *src_ptl,
> + struct folio *src_folio)
> +{
> + struct anon_vma *dst_anon_vma;
> +
> + double_pt_lock(dst_ptl, src_ptl);
> +
> + if (!pte_same(*src_pte, orig_src_pte) ||
> + !pte_same(*dst_pte, orig_dst_pte) ||
> + folio_test_large(src_folio) ||
> + folio_estimated_sharers(src_folio) != 1) {
> + double_pt_unlock(dst_ptl, src_ptl);
> + return -EAGAIN;
> + }
> +
> + BUG_ON(!folio_test_anon(src_folio));
> +
> + dst_anon_vma = (void *)dst_vma->anon_vma + PAGE_MAPPING_ANON;
> + WRITE_ONCE(src_folio->mapping,
> + (struct address_space *) dst_anon_vma);
> + WRITE_ONCE(src_folio->index, linear_page_index(dst_vma,
> + dst_addr));
> +
> + orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
> + orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
> + orig_dst_pte = maybe_mkwrite(pte_mkdirty(orig_dst_pte),
> + dst_vma);

I think there's still a theoretical issue here that you could fix by
checking for the AnonExclusive flag, similar to the huge page case.

Consider the following scenario:

1. process P1 does a write fault in a private anonymous VMA, creating
and mapping a new anonymous page A1
2. process P1 forks and creates two children P2 and P3. afterwards, A1
is mapped in P1, P2 and P3 as a COW page, with mapcount 3.
3. process P1 removes its mapping of A1, dropping its mapcount to 2.
4. process P2 uses vmsplice() to grab a reference to A1 with get_user_pages()
5. process P2 removes its mapping of A1, dropping its mapcount to 1.

If at this point P3 does a write fault on its mapping of A1, it will
still trigger copy-on-write thanks to the AnonExclusive mechanism; and
this is necessary to avoid P3 mapping A1 as writable and writing data
into it that will become visible to P2, if P2 and P3 are in different
security contexts.

But if P3 instead moves its mapping of A1 to another address with
remap_anon_pte(), which only does a mapcount check, the
maybe_mkwrite() will directly make the mapping writable, circumventing
the AnonExclusive mechanism.
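
Concretely - an untested sketch, replacing (or supplementing) the
folio_estimated_sharers() check in remap_anon_pte() with an
AnonExclusive check on the head page:

	if (!pte_same(*src_pte, orig_src_pte) ||
	    !pte_same(*dst_pte, orig_dst_pte) ||
	    folio_test_large(src_folio) ||
	    !PageAnonExclusive(&src_folio->page)) {
		double_pt_unlock(dst_ptl, src_ptl);
		return -EAGAIN;
	}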

> + set_pte_at(dst_mm, dst_addr, dst_pte, orig_dst_pte);
> +
> + if (dst_mm != src_mm) {
> + inc_mm_counter(dst_mm, MM_ANONPAGES);
> + dec_mm_counter(src_mm, MM_ANONPAGES);
> + }
> +
> + double_pt_unlock(dst_ptl, src_ptl);
> +
> + return 0;
> +}
> +
> +static int remap_swap_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> + unsigned long dst_addr, unsigned long src_addr,
> + pte_t *dst_pte, pte_t *src_pte,
> + pte_t orig_dst_pte, pte_t orig_src_pte,
> + spinlock_t *dst_ptl, spinlock_t *src_ptl)
> +{
> + if (!pte_swp_exclusive(orig_src_pte))
> + return -EBUSY;
> +
> + double_pt_lock(dst_ptl, src_ptl);
> +
> + if (!pte_same(*src_pte, orig_src_pte) ||
> + !pte_same(*dst_pte, orig_dst_pte)) {
> + double_pt_unlock(dst_ptl, src_ptl);
> + return -EAGAIN;
> + }
> +
> + orig_src_pte = ptep_get_and_clear(src_mm, src_addr, src_pte);
> + set_pte_at(dst_mm, dst_addr, dst_pte, orig_src_pte);
> +
> + if (dst_mm != src_mm) {
> + inc_mm_counter(dst_mm, MM_ANONPAGES);
> + dec_mm_counter(src_mm, MM_ANONPAGES);

I think this is the wrong counter. Looking at zap_pte_range(), in the
"Genuine swap entry" case, we modify the MM_SWAPENTS counter, not
MM_ANONPAGES.
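
I.e. presumably something like:

	if (dst_mm != src_mm) {
		inc_mm_counter(dst_mm, MM_SWAPENTS);
		dec_mm_counter(src_mm, MM_SWAPENTS);
	}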

> + }
> +
> + double_pt_unlock(dst_ptl, src_ptl);
> +
> + return 0;
> +}
> +
> +/*
> + * The mmap_lock for reading is held by the caller. Just move the page
> + * from src_pmd to dst_pmd if possible, and return true if succeeded
> + * in moving the page.
> + */
> +static int remap_pages_pte(struct mm_struct *dst_mm,
> + struct mm_struct *src_mm,
> + pmd_t *dst_pmd,
> + pmd_t *src_pmd,
> + struct vm_area_struct *dst_vma,
> + struct vm_area_struct *src_vma,
> + unsigned long dst_addr,
> + unsigned long src_addr,
> + __u64 mode)
> +{
> + swp_entry_t entry;
> + pte_t orig_src_pte, orig_dst_pte;
> + spinlock_t *src_ptl, *dst_ptl;
> + pte_t *src_pte = NULL;
> + pte_t *dst_pte = NULL;
> +
> + struct folio *src_folio = NULL;
> + struct anon_vma *src_anon_vma = NULL;
> + struct mmu_notifier_range range;
> + int err = 0;
> +
> + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_mm,
> + src_addr, src_addr + PAGE_SIZE);
> + mmu_notifier_invalidate_range_start(&range);
> +retry:

This retry looks a bit dodgy. On this retry label, we restart almost
the entire operation, including re-reading the source PTE; the only
variables that carry state forward from the previous retry loop
iteration are src_folio and src_anon_vma.

> + dst_pte = pte_offset_map_nolock(dst_mm, dst_pmd, dst_addr, &dst_ptl);
> +
> + /* If an huge pmd materialized from under us fail */
> + if (unlikely(!dst_pte)) {
> + err = -EFAULT;
> + goto out;
> + }
[...]
> + spin_lock(dst_ptl);
> + orig_dst_pte = *dst_pte;
> + spin_unlock(dst_ptl);
> + if (!pte_none(orig_dst_pte)) {
> + err = -EEXIST;
> + goto out;
> + }
> +
> + spin_lock(src_ptl);
> + orig_src_pte = *src_pte;

Here we read an entirely new orig_src_pte value. Something like a
concurrent MADV_DONTNEED+pagefault could have made the PTE point to a
different page between loop iterations.

> + spin_unlock(src_ptl);

I think you have to insert something like the following here (with a
new pte_t local, say previous_src_pte, that remembers the source PTE
seen in the previous loop iteration):

	if (src_folio && !pte_same(orig_src_pte, previous_src_pte)) {
		err = -EAGAIN;
		goto out;
	}
	previous_src_pte = orig_src_pte;

Otherwise:

> + if (pte_none(orig_src_pte)) {
> + if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES))
> + err = -ENOENT;
> + else /* nothing to do to remap a hole */
> + err = 0;
> + goto out;
> + }
> +
> + if (pte_present(orig_src_pte)) {
> + /*
> + * Pin and lock both source folio and anon_vma. Since we are in
> + * RCU read section, we can't block, so on contention have to
> + * unmap the ptes, obtain the lock and retry.
> + */
> + if (!src_folio) {

If we already found a src_folio in the previous iteration (but the
trylock failed), we keep using the same src_folio without rechecking
if the current PTE still points to the same folio.
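
The previous_src_pte check suggested above would catch this; an
alternative (untested) sketch would be to recheck the folio directly
once orig_src_pte is known to be present:

	if (src_folio &&
	    vm_normal_folio(src_vma, src_addr, orig_src_pte) != src_folio) {
		err = -EAGAIN;
		goto out;
	}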

> + struct folio *folio;
> +
> + /*
> + * Pin the page while holding the lock to be sure the
> + * page isn't freed under us
> + */
> + spin_lock(src_ptl);
> + if (!pte_same(orig_src_pte, *src_pte)) {
> + spin_unlock(src_ptl);
> + err = -EAGAIN;
> + goto out;
> + }
> +
> + folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
> + if (!folio || !folio_test_anon(folio) ||
> + folio_test_large(folio) ||
> + folio_estimated_sharers(folio) != 1) {
> + spin_unlock(src_ptl);
> + err = -EBUSY;
> + goto out;
> + }
> +
> + folio_get(folio);
> + src_folio = folio;
> + spin_unlock(src_ptl);
> +
> + /* block all concurrent rmap walks */
> + if (!folio_trylock(src_folio)) {
> + pte_unmap(&orig_src_pte);
> + pte_unmap(&orig_dst_pte);
> + src_pte = dst_pte = NULL;
> + /* now we can block and wait */
> + folio_lock(src_folio);
> + goto retry;
> + }
> + }
> +
> + if (!src_anon_vma) {

(And here, if we already saw a src_anon_vma but the trylock failed,
we'll keep using that src_anon_vma.)

> + /*
> + * folio_referenced walks the anon_vma chain
> + * without the folio lock. Serialize against it with
> + * the anon_vma lock, the folio lock is not enough.
> + */
> + src_anon_vma = folio_get_anon_vma(src_folio);
> + if (!src_anon_vma) {
> + /* page was unmapped from under us */
> + err = -EAGAIN;
> + goto out;
> + }
> + if (!anon_vma_trylock_write(src_anon_vma)) {
> + pte_unmap(&orig_src_pte);
> + pte_unmap(&orig_dst_pte);
> + src_pte = dst_pte = NULL;
> + /* now we can block and wait */
> + anon_vma_lock_write(src_anon_vma);
> + goto retry;
> + }
> + }

So at this point we have:

- the current src_pte
- some referenced+locked src_folio that used to be mapped exclusively
at src_addr
- (the anon_vma associated with the src_folio)

> + err = remap_anon_pte(dst_mm, src_mm, dst_vma, src_vma,
> + dst_addr, src_addr, dst_pte, src_pte,
> + orig_dst_pte, orig_src_pte,
> + dst_ptl, src_ptl, src_folio);

And then this will, without touching folio mapcounts/refcounts, delete
the current PTE at src_addr, and create a PTE at dst_addr pointing to
the old src_folio, leading to incorrect refcounts/mapcounts?

> + } else {
[...]
> + }
> +
> +out:
> + if (src_anon_vma) {
> + anon_vma_unlock_write(src_anon_vma);
> + put_anon_vma(src_anon_vma);
> + }
> + if (src_folio) {
> + folio_unlock(src_folio);
> + folio_put(src_folio);
> + }
> + if (dst_pte)
> + pte_unmap(dst_pte);
> + if (src_pte)
> + pte_unmap(src_pte);
> + mmu_notifier_invalidate_range_end(&range);
> +
> + return err;
> +}
[...]
> +ssize_t remap_pages(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> + unsigned long dst_start, unsigned long src_start,
> + unsigned long len, __u64 mode)
> +{
> + struct vm_area_struct *src_vma, *dst_vma;
> + unsigned long src_addr, dst_addr;
> + pmd_t *src_pmd, *dst_pmd;
> + long err = -EINVAL;
> + ssize_t moved = 0;
> +
> + /*
> + * Sanitize the command parameters:
> + */
> + BUG_ON(src_start & ~PAGE_MASK);
> + BUG_ON(dst_start & ~PAGE_MASK);
> + BUG_ON(len & ~PAGE_MASK);
> +
> + /* Does the address range wrap, or is the span zero-sized? */
> + BUG_ON(src_start + len <= src_start);
> + BUG_ON(dst_start + len <= dst_start);
> +
> + /*
> + * Because these are read sempahores there's no risk of lock
> + * inversion.
> + */
> + mmap_read_lock(dst_mm);
> + if (dst_mm != src_mm)
> + mmap_read_lock(src_mm);
> +
> + /*
> + * Make sure the vma is not shared, that the src and dst remap
> + * ranges are both valid and fully within a single existing
> + * vma.
> + */
> + src_vma = find_vma(src_mm, src_start);
> + if (!src_vma || (src_vma->vm_flags & VM_SHARED))
> + goto out;
> + if (src_start < src_vma->vm_start ||
> + src_start + len > src_vma->vm_end)
> + goto out;
> +
> + dst_vma = find_vma(dst_mm, dst_start);
> + if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> + goto out;
> + if (dst_start < dst_vma->vm_start ||
> + dst_start + len > dst_vma->vm_end)
> + goto out;
> +
> + err = validate_remap_areas(src_vma, dst_vma);
> + if (err)
> + goto out;
> +
> + for (src_addr = src_start, dst_addr = dst_start;
> + src_addr < src_start + len;) {
> + spinlock_t *ptl;
> + pmd_t dst_pmdval;
> + unsigned long step_size;
> +
> + BUG_ON(dst_addr >= dst_start + len);
> + /*
> + * Below works because anonymous area would not have a
> + * transparent huge PUD. If file-backed support is added,
> + * that case would need to be handled here.
> + */
> + src_pmd = mm_find_pmd(src_mm, src_addr);
> + if (unlikely(!src_pmd)) {
> + if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES)) {
> + err = -ENOENT;
> + break;
> + }
> + src_pmd = mm_alloc_pmd(src_mm, src_addr);
> + if (unlikely(!src_pmd)) {
> + err = -ENOMEM;
> + break;
> + }
> + }
> + dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
> + if (unlikely(!dst_pmd)) {
> + err = -ENOMEM;
> + break;
> + }
> +
> + dst_pmdval = pmdp_get_lockless(dst_pmd);
> + /*
> + * If the dst_pmd is mapped as THP don't override it and just
> + * be strict. If dst_pmd changes into TPH after this check, the
> + * remap_pages_huge_pmd() will detect the change and retry
> + * while remap_pages_pte() will detect the change and fail.
> + */
> + if (unlikely(pmd_trans_huge(dst_pmdval))) {
> + err = -EEXIST;
> + break;
> + }
> +
> + ptl = pmd_trans_huge_lock(src_pmd, src_vma);
> + if (ptl && !pmd_trans_huge(*src_pmd)) {
> + spin_unlock(ptl);
> + ptl = NULL;
> + }

This still looks wrong - we do still have to split_huge_pmd()
somewhere so that remap_pages_pte() works.
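
Maybe (untested, just to illustrate where a split could go) something
along these lines, in the branch where the PMD lock is dropped again:

	ptl = pmd_trans_huge_lock(src_pmd, src_vma);
	if (ptl && !pmd_trans_huge(*src_pmd)) {
		spin_unlock(ptl);
		split_huge_pmd(src_vma, src_pmd, src_addr);
		ptl = NULL;
	}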

> + if (ptl) {
> + /*
> + * Check if we can move the pmd without
> + * splitting it. First check the address
> + * alignment to be the same in src/dst. These
> + * checks don't actually need the PT lock but
> + * it's good to do it here to optimize this
> + * block away at build time if
> + * CONFIG_TRANSPARENT_HUGEPAGE is not set.
> + */
> + if ((src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
> + src_start + len - src_addr < HPAGE_PMD_SIZE || !pmd_none(dst_pmdval)) {
> + spin_unlock(ptl);
> + split_huge_pmd(src_vma, src_pmd, src_addr);
> + continue;
> + }
> +
> + err = remap_pages_huge_pmd(dst_mm, src_mm,
> + dst_pmd, src_pmd,
> + dst_pmdval,
> + dst_vma, src_vma,
> + dst_addr, src_addr);
> + step_size = HPAGE_PMD_SIZE;
> + } else {
> + if (pmd_none(*src_pmd)) {
> + if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES)) {
> + err = -ENOENT;
> + break;
> + }
> + if (unlikely(__pte_alloc(src_mm, src_pmd))) {
> + err = -ENOMEM;
> + break;
> + }
> + }
> +
> + if (unlikely(pte_alloc(dst_mm, dst_pmd))) {
> + err = -ENOMEM;
> + break;
> + }
> +
> + err = remap_pages_pte(dst_mm, src_mm,
> + dst_pmd, src_pmd,
> + dst_vma, src_vma,
> + dst_addr, src_addr,
> + mode);
> + step_size = PAGE_SIZE;
> + }
> +
> + cond_resched();
> +
> + if (!err) {
> + dst_addr += step_size;
> + src_addr += step_size;
> + moved += step_size;
> + }
> +
> + if ((!err || err == -EAGAIN) &&
> + fatal_signal_pending(current))
> + err = -EINTR;
> +
> + if (err && err != -EAGAIN)
> + break;
> + }
> +
> +out:
> + mmap_read_unlock(dst_mm);
> + if (dst_mm != src_mm)
> + mmap_read_unlock(src_mm);
> + BUG_ON(moved < 0);
> + BUG_ON(err > 0);
> + BUG_ON(!moved && !err);
> + return moved ? moved : err;
> +}
> --
> 2.42.0.515.g380fc7ccd1-goog
>