[RFC PATCH v2 25/47] hugetlb: add HGM support for copy_hugetlb_page_range

From: James Houghton
Date: Fri Oct 21 2022 - 12:39:35 EST


Teach copy_hugetlb_page_range() about high-granularity mappings (HGM) so
that fork() works with them. The page table structure is copied such
that regions that are partially mapped in the parent remain partially
mapped, in the same way, in the new process.

A page's reference count and mapcount are incremented once for *each*
portion of it that is mapped in the page table. For example, if a 1G
page is mapped entirely at PMD granularity (512 PMD entries), its
reference count and mapcount will each be incremented by 512.

Signed-off-by: James Houghton <jthoughton@xxxxxxxxxx>
---
mm/hugetlb.c | 81 +++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 65 insertions(+), 16 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5783a8307a77..7d692907cbf3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4946,7 +4946,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *src_vma)
{
pte_t *src_pte, *dst_pte, entry;
- struct page *ptepage;
+ struct hugetlb_pte src_hpte, dst_hpte;
+ struct page *ptepage, *hpage;
unsigned long addr;
bool cow = is_cow_mapping(src_vma->vm_flags);
struct hstate *h = hstate_vma(src_vma);
@@ -4956,6 +4957,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
unsigned long last_addr_mask;
int ret = 0;

+ if (hugetlb_hgm_enabled(src_vma)) {
+ /*
+ * src_vma might have high-granularity PTEs, and dst_vma will
+ * need to copy those.
+ */
+ ret = enable_hugetlb_hgm(dst_vma);
+ if (ret)
+ return ret;
+ }
+
if (cow) {
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src,
src_vma->vm_start,
@@ -4967,18 +4978,22 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
/*
* For shared mappings the vma lock must be held before
* calling huge_pte_offset in the src vma. Otherwise, the
- * returned ptep could go away if part of a shared pmd and
- * another thread calls huge_pmd_unshare.
+ * returned ptep could go away if
+ * - part of a shared pmd and another thread calls
+ * huge_pmd_unshare, or
+ * - another thread collapses a high-granularity mapping.
*/
hugetlb_vma_lock_read(src_vma);
}

last_addr_mask = hugetlb_mask_last_page(h);
- for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
+ addr = src_vma->vm_start;
+ while (addr < src_vma->vm_end) {
spinlock_t *src_ptl, *dst_ptl;
+ unsigned long hpte_sz;
src_pte = huge_pte_offset(src, addr, sz);
if (!src_pte) {
- addr |= last_addr_mask;
+ addr = (addr | last_addr_mask) + sz;
continue;
}
dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
@@ -4987,6 +5002,26 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
break;
}

+ hugetlb_pte_populate(&src_hpte, src_pte, huge_page_shift(h),
+ hpage_size_to_level(huge_page_size(h)));
+ hugetlb_pte_populate(&dst_hpte, dst_pte, huge_page_shift(h),
+ hpage_size_to_level(huge_page_size(h)));
+
+ if (hugetlb_hgm_enabled(src_vma)) {
+ hugetlb_hgm_walk(src, src_vma, &src_hpte, addr,
+ PAGE_SIZE, /*stop_at_none=*/true);
+ ret = hugetlb_hgm_walk(dst, dst_vma, &dst_hpte, addr,
+ hugetlb_pte_size(&src_hpte),
+ /*stop_at_none=*/false);
+ if (ret)
+ break;
+
+ src_pte = src_hpte.ptep;
+ dst_pte = dst_hpte.ptep;
+ }
+
+ hpte_sz = hugetlb_pte_size(&src_hpte);
+
/*
* If the pagetables are shared don't copy or take references.
*
@@ -4996,12 +5031,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
* to reliably determine whether pte is shared.
*/
if (page_count(virt_to_page(dst_pte)) > 1) {
- addr |= last_addr_mask;
+ addr = (addr | last_addr_mask) + sz;
continue;
}

- dst_ptl = huge_pte_lock(h, dst, dst_pte);
- src_ptl = huge_pte_lockptr(huge_page_shift(h), src, src_pte);
+ dst_ptl = hugetlb_pte_lock(dst, &dst_hpte);
+ src_ptl = hugetlb_pte_lockptr(src, &src_hpte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
again:
@@ -5042,10 +5077,15 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
*/
if (userfaultfd_wp(dst_vma))
set_huge_pte_at(dst, addr, dst_pte, entry);
+ } else if (!hugetlb_pte_present_leaf(&src_hpte, entry)) {
+ /* Retry the walk. */
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ continue;
} else {
- entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
- get_page(ptepage);
+ hpage = compound_head(ptepage);
+ get_page(hpage);

/*
* Failing to duplicate the anon rmap is a rare case
@@ -5058,24 +5098,29 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
* sleep during the process.
*/
if (!PageAnon(ptepage)) {
- page_dup_file_rmap(ptepage, true);
- } else if (page_try_dup_anon_rmap(ptepage, true,
+ page_dup_file_rmap(hpage, true);
+ } else if (page_try_dup_anon_rmap(hpage, true,
src_vma)) {
pte_t src_pte_old = entry;
struct page *new;

+ if (hugetlb_hgm_enabled(src_vma)) {
+ ret = -EINVAL;
+ break;
+ }
+
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
/* Do not use reserve as it's private owned */
new = alloc_huge_page(dst_vma, addr, 1);
if (IS_ERR(new)) {
- put_page(ptepage);
+ put_page(hpage);
ret = PTR_ERR(new);
break;
}
- copy_user_huge_page(new, ptepage, addr, dst_vma,
+ copy_user_huge_page(new, hpage, addr, dst_vma,
npages);
- put_page(ptepage);
+ put_page(hpage);

/* Install the new huge page if src pte stable */
dst_ptl = huge_pte_lock(h, dst, dst_pte);
@@ -5093,6 +5138,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
hugetlb_install_page(dst_vma, dst_pte, addr, new);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
+ addr += hugetlb_pte_size(&src_hpte);
continue;
}

@@ -5109,10 +5155,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
}

set_huge_pte_at(dst, addr, dst_pte, entry);
- hugetlb_count_add(npages, dst);
+ hugetlb_count_add(
+ hugetlb_pte_size(&dst_hpte) / PAGE_SIZE,
+ dst);
}
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
+ addr += hugetlb_pte_size(&src_hpte);
}

if (cow) {
--
2.38.0.135.g90850a2211-goog