Re: [PATCH 7/7] mm/gup: Retire follow_hugetlb_page()

From: Lorenzo Stoakes
Date: Sat Jun 17 2023 - 16:40:59 EST


On Tue, Jun 13, 2023 at 05:53:46PM -0400, Peter Xu wrote:
> Now __get_user_pages() should be well prepared to handle thp completely,
> as well as hugetlb gup requests, even without hugetlb's special path.
>
> Time to retire follow_hugetlb_page().

Nit, but there are a couple of leftover references to this function in
comments in fs/userfaultfd.c and mm/hugetlb.c.
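
(FWIW, a quick `git grep -n follow_hugetlb_page` should flag the remaining
comment references, if that helps.)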

>
> Tweak the comments in follow_page_mask() to reflect reality, by dropping
> the "follow_page()" description.
>
> Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>
> ---
> include/linux/hugetlb.h |  12 ---
> mm/gup.c                |  19 ----
> mm/hugetlb.c            | 223 ----------------------------------------
> 3 files changed, 254 deletions(-)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 0d6f389d98de..44e5836eed15 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -133,9 +133,6 @@ int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
> struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> unsigned long address, unsigned int flags,
> unsigned int *page_mask);
> -long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
> - struct page **, unsigned long *, unsigned long *,
> - long, unsigned int, int *);
> void unmap_hugepage_range(struct vm_area_struct *,
> unsigned long, unsigned long, struct page *,
> zap_flags_t);
> @@ -305,15 +302,6 @@ static inline struct page *hugetlb_follow_page_mask(
> BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/
> }
>
> -static inline long follow_hugetlb_page(struct mm_struct *mm,
> - struct vm_area_struct *vma, struct page **pages,
> - unsigned long *position, unsigned long *nr_pages,
> - long i, unsigned int flags, int *nonblocking)
> -{
> - BUG();
> - return 0;
> -}
> -
> static inline int copy_hugetlb_page_range(struct mm_struct *dst,
> struct mm_struct *src,
> struct vm_area_struct *dst_vma,
> diff --git a/mm/gup.c b/mm/gup.c
> index cdabc8ea783b..a65b80953b7a 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -789,9 +789,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
> * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
> * special hugetlb page table walking code. This eliminates the
> * need to check for hugetlb entries in the general walking code.
> - *
> - * hugetlb_follow_page_mask is only for follow_page() handling here.
> - * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
> */
> if (is_vm_hugetlb_page(vma))
> return hugetlb_follow_page_mask(vma, address, flags,
> @@ -1149,22 +1146,6 @@ static long __get_user_pages(struct mm_struct *mm,
> ret = check_vma_flags(vma, gup_flags);
> if (ret)
> goto out;
> -
> - if (is_vm_hugetlb_page(vma)) {
> - i = follow_hugetlb_page(mm, vma, pages,
> - &start, &nr_pages, i,
> - gup_flags, locked);
> - if (!*locked) {
> - /*
> - * We've got a VM_FAULT_RETRY
> - * and we've lost mmap_lock.
> - * We must stop here.
> - */
> - BUG_ON(gup_flags & FOLL_NOWAIT);
> - goto out;
> - }
> - continue;
> - }
> }
> retry:
> /*
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 31d8f18bc2e4..b7ff413ff68b 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6425,37 +6425,6 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
> }
> #endif /* CONFIG_USERFAULTFD */
>
> -static void record_subpages(struct page *page, struct vm_area_struct *vma,
> - int refs, struct page **pages)
> -{
> - int nr;
> -
> - for (nr = 0; nr < refs; nr++) {
> - if (likely(pages))
> - pages[nr] = nth_page(page, nr);
> - }
> -}
> -
> -static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma,
> - unsigned int flags, pte_t *pte,
> - bool *unshare)
> -{
> - pte_t pteval = huge_ptep_get(pte);
> -
> - *unshare = false;
> - if (is_swap_pte(pteval))
> - return true;
> - if (huge_pte_write(pteval))
> - return false;
> - if (flags & FOLL_WRITE)
> - return true;
> - if (gup_must_unshare(vma, flags, pte_page(pteval))) {
> - *unshare = true;
> - return true;
> - }
> - return false;
> -}
> -
> struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> unsigned long address, unsigned int flags,
> unsigned int *page_mask)
> @@ -6518,198 +6487,6 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> return page;
> }
>
> -long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> - struct page **pages, unsigned long *position,
> - unsigned long *nr_pages, long i, unsigned int flags,
> - int *locked)
> -{
> - unsigned long pfn_offset;
> - unsigned long vaddr = *position;
> - unsigned long remainder = *nr_pages;
> - struct hstate *h = hstate_vma(vma);
> - int err = -EFAULT, refs;
> -
> - while (vaddr < vma->vm_end && remainder) {
> - pte_t *pte;
> - spinlock_t *ptl = NULL;
> - bool unshare = false;
> - int absent;
> - struct page *page;
> -
> - /*
> - * If we have a pending SIGKILL, don't keep faulting pages and
> - * potentially allocating memory.
> - */
> - if (fatal_signal_pending(current)) {
> - remainder = 0;
> - break;
> - }
> -
> - hugetlb_vma_lock_read(vma);
> - /*
> - * Some archs (sparc64, sh*) have multiple pte_ts to
> - * each hugepage. We have to make sure we get the
> - * first, for the page indexing below to work.
> - *
> - * Note that page table lock is not held when pte is null.
> - */
> - pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
> - huge_page_size(h));
> - if (pte)
> - ptl = huge_pte_lock(h, mm, pte);
> - absent = !pte || huge_pte_none(huge_ptep_get(pte));
> -
> - /*
> - * When coredumping, it suits get_dump_page if we just return
> - * an error where there's an empty slot with no huge pagecache
> - * to back it. This way, we avoid allocating a hugepage, and
> - * the sparse dumpfile avoids allocating disk blocks, but its
> - * huge holes still show up with zeroes where they need to be.
> - */
> - if (absent && (flags & FOLL_DUMP) &&
> - !hugetlbfs_pagecache_present(h, vma, vaddr)) {
> - if (pte)
> - spin_unlock(ptl);
> - hugetlb_vma_unlock_read(vma);
> - remainder = 0;
> - break;
> - }
> -
> - /*
> - * We need call hugetlb_fault for both hugepages under migration
> - * (in which case hugetlb_fault waits for the migration,) and
> - * hwpoisoned hugepages (in which case we need to prevent the
> - * caller from accessing to them.) In order to do this, we use
> - * here is_swap_pte instead of is_hugetlb_entry_migration and
> - * is_hugetlb_entry_hwpoisoned. This is because it simply covers
> - * both cases, and because we can't follow correct pages
> - * directly from any kind of swap entries.
> - */
> - if (absent ||
> - __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) {
> - vm_fault_t ret;
> - unsigned int fault_flags = 0;
> -
> - if (pte)
> - spin_unlock(ptl);
> - hugetlb_vma_unlock_read(vma);
> -
> - if (flags & FOLL_WRITE)
> - fault_flags |= FAULT_FLAG_WRITE;
> - else if (unshare)
> - fault_flags |= FAULT_FLAG_UNSHARE;
> - if (locked) {
> - fault_flags |= FAULT_FLAG_ALLOW_RETRY |
> - FAULT_FLAG_KILLABLE;
> - if (flags & FOLL_INTERRUPTIBLE)
> - fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
> - }
> - if (flags & FOLL_NOWAIT)
> - fault_flags |= FAULT_FLAG_ALLOW_RETRY |
> - FAULT_FLAG_RETRY_NOWAIT;
> - if (flags & FOLL_TRIED) {
> - /*
> - * Note: FAULT_FLAG_ALLOW_RETRY and
> - * FAULT_FLAG_TRIED can co-exist
> - */
> - fault_flags |= FAULT_FLAG_TRIED;
> - }
> - ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
> - if (ret & VM_FAULT_ERROR) {
> - err = vm_fault_to_errno(ret, flags);
> - remainder = 0;
> - break;
> - }
> - if (ret & VM_FAULT_RETRY) {
> - if (locked &&
> - !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
> - *locked = 0;
> - *nr_pages = 0;
> - /*
> - * VM_FAULT_RETRY must not return an
> - * error, it will return zero
> - * instead.
> - *
> - * No need to update "position" as the
> - * caller will not check it after
> - * *nr_pages is set to 0.
> - */
> - return i;
> - }
> - continue;
> - }
> -
> - pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
> - page = pte_page(huge_ptep_get(pte));
> -
> - VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
> - !PageAnonExclusive(page), page);
> -
> - /*
> - * If subpage information not requested, update counters
> - * and skip the same_page loop below.
> - */
> - if (!pages && !pfn_offset &&
> - (vaddr + huge_page_size(h) < vma->vm_end) &&
> - (remainder >= pages_per_huge_page(h))) {
> - vaddr += huge_page_size(h);
> - remainder -= pages_per_huge_page(h);
> - i += pages_per_huge_page(h);
> - spin_unlock(ptl);
> - hugetlb_vma_unlock_read(vma);
> - continue;
> - }
> -
> - /* vaddr may not be aligned to PAGE_SIZE */
> - refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
> - (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
> -
> - if (pages)
> - record_subpages(nth_page(page, pfn_offset),
> - vma, refs,
> - likely(pages) ? pages + i : NULL);
> -
> - if (pages) {
> - /*
> - * try_grab_folio() should always succeed here,
> - * because: a) we hold the ptl lock, and b) we've just
> - * checked that the huge page is present in the page
> - * tables. If the huge page is present, then the tail
> - * pages must also be present. The ptl prevents the
> - * head page and tail pages from being rearranged in
> - * any way. As this is hugetlb, the pages will never
> - * be p2pdma or not longterm pinable. So this page
> - * must be available at this point, unless the page
> - * refcount overflowed:
> - */
> - if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
> - flags))) {
> - spin_unlock(ptl);
> - hugetlb_vma_unlock_read(vma);
> - remainder = 0;
> - err = -ENOMEM;
> - break;
> - }
> - }
> -
> - vaddr += (refs << PAGE_SHIFT);
> - remainder -= refs;
> - i += refs;
> -
> - spin_unlock(ptl);
> - hugetlb_vma_unlock_read(vma);
> - }
> - *nr_pages = remainder;
> - /*
> - * setting position is actually required only if remainder is
> - * not zero but it's faster not to add a "if (remainder)"
> - * branch.
> - */
> - *position = vaddr;
> -
> - return i ? i : err;
> -}
> -
> long hugetlb_change_protection(struct vm_area_struct *vma,
> unsigned long address, unsigned long end,
> pgprot_t newprot, unsigned long cp_flags)
> --
> 2.40.1
>

Absolutely wonderful to see such delightful code deletion :)

I haven't really dug into the huge page side of this in great detail, so I
can't give a meaningful tag, but I can certainly appreciate the code you're
removing here!