[PATCH 1/5] hugetlb: Convert hugetlb_fault() to use struct vm_fault

From: Vishal Moola (Oracle)
Date: Mon Mar 25 2024 - 18:35:01 EST


Now that hugetlb_fault() has a struct vm_fault available for fault
tracking, use it throughout. This cleans up the code by removing four
local variables (ptep, entry, ptl, and haddr), and prepares
hugetlb_fault() to take a struct vm_fault argument.

Signed-off-by: Vishal Moola (Oracle) <vishal.moola@xxxxxxxxx>
---
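Not part of the commit message: a rough before/after sketch of how the
removed locals map onto the existing struct vm_fault fields, assembled
from the hunks below as a reviewing aid. It is abbreviated (the
surrounding function body is omitted) and is not compilable on its own.

	/* Before: fault state lived in scattered locals. */
	pte_t *ptep, entry;
	spinlock_t *ptl;
	unsigned long haddr = address & huge_page_mask(h);

	/* After: the same state lives in the on-stack vm_fault. */
	struct vm_fault vmf = {
		.vma = vma,
		.address = address & huge_page_mask(h),	/* was haddr */
		.real_address = address,
		.flags = flags,
		.pgoff = vma_hugecache_offset(h, vma,
				address & huge_page_mask(h)),
	};

	vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h));	/* was ptep */
	vmf.orig_pte = huge_ptep_get(vmf.pte);					/* was entry */
	vmf.ptl = huge_pte_lock(h, mm, vmf.pte);				/* was ptl */
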
mm/hugetlb.c | 84 +++++++++++++++++++++++++---------------------------
1 file changed, 41 insertions(+), 43 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 688017ca0cc2..81e8ade53b64 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6330,8 +6330,6 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
- pte_t *ptep, entry;
- spinlock_t *ptl;
vm_fault_t ret;
u32 hash;
struct folio *folio = NULL;
@@ -6339,13 +6337,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
int need_wait_lock = 0;
- unsigned long haddr = address & huge_page_mask(h);
struct vm_fault vmf = {
.vma = vma,
- .address = haddr,
+ .address = address & huge_page_mask(h),
.real_address = address,
.flags = flags,
- .pgoff = vma_hugecache_offset(h, vma, haddr),
+ .pgoff = vma_hugecache_offset(h, vma,
+ address & huge_page_mask(h)),
/* TODO: Track hugetlb faults using vm_fault */
};

@@ -6365,22 +6363,22 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,

/*
* Acquire vma lock before calling huge_pte_alloc and hold
- * until finished with ptep. This prevents huge_pmd_unshare from
- * being called elsewhere and making the ptep no longer valid.
+ * until finished with vmf.pte. This prevents huge_pmd_unshare from
+ * being called elsewhere and making vmf.pte no longer valid.
*/
hugetlb_vma_lock_read(vma);
- ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
- if (!ptep) {
+ vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h));
+ if (!vmf.pte) {
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return VM_FAULT_OOM;
}

- entry = huge_ptep_get(ptep);
- if (huge_pte_none_mostly(entry)) {
- if (is_pte_marker(entry)) {
+ vmf.orig_pte = huge_ptep_get(vmf.pte);
+ if (huge_pte_none_mostly(vmf.orig_pte)) {
+ if (is_pte_marker(vmf.orig_pte)) {
pte_marker marker =
- pte_marker_get(pte_to_swp_entry(entry));
+ pte_marker_get(pte_to_swp_entry(vmf.orig_pte));

if (marker & PTE_MARKER_POISONED) {
ret = VM_FAULT_HWPOISON_LARGE;
@@ -6395,20 +6393,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* mutex internally, which make us return immediately.
*/
return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address,
- ptep, entry, flags, &vmf);
+ vmf.pte, vmf.orig_pte, flags, &vmf);
}

ret = 0;

/*
- * entry could be a migration/hwpoison entry at this point, so this
- * check prevents the kernel from going below assuming that we have
- * an active hugepage in pagecache. This goto expects the 2nd page
- * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
- * properly handle it.
+ * vmf.orig_pte could be a migration/hwpoison entry at this
+ * point, so this check prevents the kernel from going below assuming
+ * that we have an active hugepage in pagecache. This goto expects
+ * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned)
+ * check will properly handle it.
*/
- if (!pte_present(entry)) {
- if (unlikely(is_hugetlb_entry_migration(entry))) {
+ if (!pte_present(vmf.orig_pte)) {
+ if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) {
/*
* Release the hugetlb fault lock now, but retain
* the vma lock, because it is needed to guard the
@@ -6417,9 +6415,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* be released there.
*/
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- migration_entry_wait_huge(vma, ptep);
+ migration_entry_wait_huge(vma, vmf.pte);
return 0;
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte)))
ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
@@ -6433,13 +6431,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* determine if a reservation has been consumed.
*/
if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
- !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
- if (vma_needs_reservation(h, vma, haddr) < 0) {
+ !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) {
+ if (vma_needs_reservation(h, vma, vmf.address) < 0) {
ret = VM_FAULT_OOM;
goto out_mutex;
}
/* Just decrements count, does not deallocate */
- vma_end_reservation(h, vma, haddr);
+ vma_end_reservation(h, vma, vmf.address);

pagecache_folio = filemap_lock_hugetlb_folio(h, mapping,
vmf.pgoff);
@@ -6447,17 +6445,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pagecache_folio = NULL;
}

- ptl = huge_pte_lock(h, mm, ptep);
+ vmf.ptl = huge_pte_lock(h, mm, vmf.pte);

/* Check for a racing update before calling hugetlb_wp() */
- if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
+ if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(vmf.pte))))
goto out_ptl;

/* Handle userfault-wp first, before trying to lock more pages */
- if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
- (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(vmf.pte)) &&
+ (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {
if (!userfaultfd_wp_async(vma)) {
- spin_unlock(ptl);
+ spin_unlock(vmf.ptl);
if (pagecache_folio) {
folio_unlock(pagecache_folio);
folio_put(pagecache_folio);
@@ -6467,18 +6465,18 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return handle_userfault(&vmf, VM_UFFD_WP);
}

- entry = huge_pte_clear_uffd_wp(entry);
- set_huge_pte_at(mm, haddr, ptep, entry,
+ vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte);
+ set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte,
huge_page_size(hstate_vma(vma)));
/* Fallthrough to CoW */
}

/*
- * hugetlb_wp() requires page locks of pte_page(entry) and
+ * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and
* pagecache_folio, so here we need take the former one
* when folio != pagecache_folio or !pagecache_folio.
*/
- folio = page_folio(pte_page(entry));
+ folio = page_folio(pte_page(vmf.orig_pte));
if (folio != pagecache_folio)
if (!folio_trylock(folio)) {
need_wait_lock = 1;
@@ -6488,24 +6486,24 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
folio_get(folio);

if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
- if (!huge_pte_write(entry)) {
- ret = hugetlb_wp(mm, vma, address, ptep, flags,
- pagecache_folio, ptl, &vmf);
+ if (!huge_pte_write(vmf.orig_pte)) {
+ ret = hugetlb_wp(mm, vma, address, vmf.pte, flags,
+ pagecache_folio, vmf.ptl, &vmf);
goto out_put_page;
} else if (likely(flags & FAULT_FLAG_WRITE)) {
- entry = huge_pte_mkdirty(entry);
+ vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
}
}
- entry = pte_mkyoung(entry);
- if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
+ vmf.orig_pte = pte_mkyoung(vmf.orig_pte);
+ if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
flags & FAULT_FLAG_WRITE))
- update_mmu_cache(vma, haddr, ptep);
+ update_mmu_cache(vma, vmf.address, vmf.pte);
out_put_page:
if (folio != pagecache_folio)
folio_unlock(folio);
folio_put(folio);
out_ptl:
- spin_unlock(ptl);
+ spin_unlock(vmf.ptl);

if (pagecache_folio) {
folio_unlock(pagecache_folio);
--
2.43.0