[RFC PATCH v1 1/7] hugetlb: add HugeTLB splitting functionality

From: Jiaqi Yan
Date: Thu Apr 27 2023 - 20:41:53 EST


The new function, hugetlb_split_to_shift, optimally splits the page
table to map a particular address at a particular granularity. This
is useful for punching a hole in the mapping and for mapping (and
unmapping) small sections of a HugeTLB page.

Splitting is for present leaf HugeTLB PTE only. None HugeTLB PTEs
and other non-present HugeTLB PTE types are not supported as they
are better left untouched:
* None PTEs
* Migration PTEs
* HWPOISON PTEs
* UFFD writeprotect PTEs

Signed-off-by: Jiaqi Yan <jiaqiyan@xxxxxxxxxx>
---
include/linux/hugetlb.h | 9 ++
mm/hugetlb.c | 249 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 258 insertions(+)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 742e7f2cb170..d44bf6a794e5 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1266,6 +1266,9 @@ int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
unsigned long end);
int hugetlb_collapse(struct mm_struct *mm, unsigned long start,
unsigned long end);
+int hugetlb_split_to_shift(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct hugetlb_pte *hpte, unsigned long addr,
+ unsigned int desired_shift);
#else
static inline bool hugetlb_hgm_enabled(struct vm_area_struct *vma)
{
@@ -1292,6 +1295,12 @@ int hugetlb_collapse(struct mm_struct *mm, unsigned long start,
{
return -EINVAL;
}
+/*
+ * Fallback for builds where the real hugetlb_split_to_shift() is compiled
+ * out (this is the #else branch of the block that declares it): splitting
+ * is not supported, so always fail with -EINVAL.
+ *
+ * Must be "static inline" like the other stubs in this branch; a plain
+ * definition in a header would be emitted in every translation unit that
+ * includes it. The signature mirrors the real prototype (non-const @hpte).
+ */
+static inline
+int hugetlb_split_to_shift(struct mm_struct *mm, struct vm_area_struct *vma,
+			   struct hugetlb_pte *hpte, unsigned long addr,
+			   unsigned int desired_shift)
+{
+	return -EINVAL;
+}
#endif

static inline
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index df4c17164abb..d3f3f1c2d293 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -8203,6 +8203,255 @@ int hugetlb_collapse(struct mm_struct *mm, unsigned long start,
return ret;
}

+/*
+ * Pick the HugeTLB PTE shift to use for the mapping that starts at @curr.
+ *
+ * Shifts are tried in the order for_each_hgm_shift() yields them for the
+ * VMA's hstate (presumably largest first -- confirm against the macro).
+ * The first shift that meets all of the following is chosen:
+ *  - @curr is aligned to the size the shift implies;
+ *  - [@curr, @curr + size) does not extend past @end;
+ *  - if that range contains @desired_addr, the shift is not larger than
+ *    @desired_shift.
+ *
+ * Returns 0 with the chosen shift stored in @shift_found, or -EINVAL when
+ * no shift qualifies (in which case @shift_found is left untouched).
+ */
+static int hugetlb_find_shift(struct vm_area_struct *vma,
+			      unsigned long curr,
+			      unsigned long end,
+			      unsigned long desired_addr,
+			      unsigned long desired_shift,
+			      unsigned int *shift_found)
+{
+	struct hstate *vma_hstate = hstate_vma(vma);
+	struct hstate *cursor;
+	unsigned int candidate;
+
+	for_each_hgm_shift(vma_hstate, cursor, candidate) {
+		unsigned long span = 1UL << candidate;
+		unsigned long span_end = curr + span;
+		bool holds_target;
+
+		/* Reject sizes that are misaligned at @curr or overshoot @end. */
+		if (!IS_ALIGNED(curr, span) || span_end > end)
+			continue;
+
+		/*
+		 * The piece that contains @desired_addr must not be mapped
+		 * any coarser than @desired_shift.
+		 */
+		holds_target = desired_addr >= curr && desired_addr < span_end;
+		if (holds_target && candidate > desired_shift)
+			continue;
+
+		*shift_found = candidate;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Split a present leaf HugeTLB PTE covering [@start, @end) so that the PTE
+ * that maps @addr ends up at @desired_shift.
+ *
+ * @old_entry is the value of the PTE being split and @orig_shift is the
+ * shift it was mapped at. The range is re-populated piece by piece: for each
+ * piece a shift is chosen with hugetlb_find_shift(), the page table is
+ * walked/allocated down to that size, and a new PTE carrying @old_entry's
+ * dirty, write and uffd-wp bits is installed for the matching subpage.
+ *
+ * Returns 0 on success. On failure returns the error from
+ * hugetlb_find_shift() or hugetlb_full_walk_alloc(); the rmap on the
+ * original unsplit page is re-added, but PTEs already installed by earlier
+ * loop iterations are left in place here.
+ */
+static int hugetlb_split_to_shift_present_leaf(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pte_t old_entry,
+ unsigned long start,
+ unsigned long end,
+ unsigned long addr,
+ unsigned int orig_shift,
+ unsigned int desired_shift)
+{
+ bool old_entry_dirty;
+ bool old_entry_write;
+ bool old_entry_uffd_wp;
+ pte_t new_entry;
+ unsigned long curr;
+ unsigned long sz;
+ unsigned int shift;
+ int ret = 0;
+ struct hugetlb_pte new_hpte;
+ struct page *subpage = NULL;
+ struct folio *folio = page_folio(compound_head(pte_page(old_entry)));
+ struct hstate *h = hstate_vma(vma);
+ spinlock_t *ptl;
+
+ /* Unmap original unsplit hugepage per huge_ptep_get_and_clear. */
+ hugetlb_remove_rmap(folio_page(folio, 0), orig_shift, h, vma);
+
+ /* Snapshot the bits of the old entry that every new PTE must inherit. */
+ old_entry_dirty = huge_pte_dirty(old_entry);
+ old_entry_write = huge_pte_write(old_entry);
+ old_entry_uffd_wp = huge_pte_uffd_wp(old_entry);
+
+ /* sz is set inside the body on every iteration before the increment. */
+ for (curr = start; curr < end; curr += sz) {
+ ret = hugetlb_find_shift(vma, curr, end, addr,
+ desired_shift, &shift);
+
+ /* Unable to find a shift that works */
+ if (WARN_ON(ret))
+ goto abort;
+
+ /*
+ * Do HGM full walk and allocate new page table structures
+ * to continue to walk to the level we want.
+ */
+ sz = 1UL << shift;
+ ret = hugetlb_full_walk_alloc(&new_hpte, vma, curr, sz);
+ if (WARN_ON(ret))
+ goto abort;
+
+ BUG_ON(hugetlb_pte_size(&new_hpte) > sz);
+ /*
+ * When hugetlb_pte_size(new_hpte) is smaller than sz, increment
+ * curr by hugetlb_pte_size(new_hpte) to avoid skipping over
+ * some PTEs.
+ */
+ if (hugetlb_pte_size(&new_hpte) < sz)
+ sz = hugetlb_pte_size(&new_hpte);
+
+ subpage = hugetlb_find_subpage(h, folio, curr);
+ /*
+ * Creating a new (finer granularity) PT entry and
+ * populate it with old_entry's bits.
+ *
+ * NOTE(review): the entry and the rmap below are created with
+ * "shift" even when sz was clamped to the (smaller) size of
+ * new_hpte just above -- confirm whether new_hpte's own shift
+ * should be used in that case.
+ * NOTE(review): the write bit is applied twice (the
+ * make_huge_pte() argument and the old_entry_write branch);
+ * looks redundant but harmless -- confirm.
+ */
+ new_entry = make_huge_pte(vma, subpage,
+ huge_pte_write(old_entry), shift);
+ if (old_entry_dirty)
+ new_entry = huge_pte_mkdirty(new_entry);
+ if (old_entry_write)
+ new_entry = huge_pte_mkwrite(new_entry);
+ if (old_entry_uffd_wp)
+ new_entry = huge_pte_mkuffd_wp(new_entry);
+ ptl = hugetlb_pte_lock(&new_hpte);
+ set_huge_pte_at(mm, curr, new_hpte.ptep, new_entry);
+ spin_unlock(ptl);
+ /* Increment ref/mapcount per set_huge_pte_at(). */
+ hugetlb_add_file_rmap(subpage, shift, h, vma);
+ folio_get(folio);
+ }
+ /*
+ * This refcount decrement is for the huge_ptep_get_and_clear
+ * on the hpte BEFORE splitting, for the same reason as
+ * hugetlb_remove_rmap(), but we cannot do it at that time.
+ * Now that splitting succeeded, the refcount can be decremented.
+ */
+ folio_put(folio);
+ return 0;
+abort:
+ /*
+ * Restore mapcount on the unsplit hugepage. No need to restore
+ * refcount as we won't folio_put() until splitting succeeded.
+ */
+ hugetlb_add_file_rmap(folio_page(folio, 0), orig_shift, h, vma);
+ return ret;
+}
+
+/*
+ * Given a particular address @addr, split the HugeTLB PTE that currently
+ * maps it so that, for the given @addr, the PTE that maps it is at
+ * @desired_shift. The splitting is always done optimally.
+ *
+ * Example: given a HugeTLB 1G page mapped from VA 0 to 1G, if caller calls
+ * this API with addr=0 and desired_shift=PAGE_SHIFT, we will change the page
+ * table as follows:
+ * 1. The original PUD will be split into 512 2M PMDs first
+ * 2. The 1st PMD will further be split into 512 4K PTEs
+ *
+ * @hpte describes the PTE to split; @hpte->ptep must be non-NULL and the
+ * range it covers must be aligned to the size implied by @desired_shift.
+ *
+ * Callers are required to hold locks on the file mapping within vma
+ * (asserted via i_mmap_assert_write_locked()).
+ *
+ * Returns:
+ *   0           on success, or if @hpte is already at @desired_shift
+ *   -EAGAIN     the PTE is none-mostly, or present but not a leaf
+ *   -EBUSY      the PTE is a migration entry
+ *   -EHWPOISON  the PTE is a HWPOISON entry
+ *   -EINVAL     the PTE is some other non-present entry
+ *   otherwise the error returned by hugetlb_split_to_shift_present_leaf()
+ *
+ * On every failure path the original entry is written back to @hpte->ptep.
+ */
+int hugetlb_split_to_shift(struct mm_struct *mm, struct vm_area_struct *vma,
+			   struct hugetlb_pte *hpte, unsigned long addr,
+			   unsigned int desired_shift)
+{
+	unsigned long start, end;
+	unsigned long desired_sz = 1UL << desired_shift;
+	int ret;
+	pte_t old_entry;
+	struct mmu_gather tlb;
+	struct mmu_notifier_range range;
+	spinlock_t *ptl;
+
+	BUG_ON(!hpte->ptep);
+
+	start = addr & hugetlb_pte_mask(hpte);
+	end = start + hugetlb_pte_size(hpte);
+	BUG_ON(!IS_ALIGNED(start, desired_sz));
+	BUG_ON(!IS_ALIGNED(end, desired_sz));
+	BUG_ON(addr < start || end <= addr);
+
+	/* Already mapped at the requested granularity: nothing to do. */
+	if (hpte->shift == desired_shift)
+		return 0;
+
+	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
+	/*
+	 * Start the notifier range BEFORE taking the PTE spinlock:
+	 * mmu_notifier_invalidate_range_start() may sleep, so it must not
+	 * be called with the spinlock held.
+	 */
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start, end);
+	mmu_notifier_invalidate_range_start(&range);
+
+	/*
+	 * Non none-mostly hugetlb PTEs must be present leaf-level PTE,
+	 * i.e. not split before.
+	 *
+	 * NOTE(review): this BUG_ON looks like it would also fire for the
+	 * migration / HWPOISON entries that are handled gracefully further
+	 * down -- confirm against hugetlb_pte_present_leaf().
+	 */
+	ptl = hugetlb_pte_lock(hpte);
+	BUG_ON(!huge_pte_none_mostly(huge_ptep_get(hpte->ptep)) &&
+	       !hugetlb_pte_present_leaf(hpte, huge_ptep_get(hpte->ptep)));
+
+	/*
+	 * Get and clear the PTE. We will allocate new page table structures
+	 * when walking the page table.
+	 */
+	old_entry = huge_ptep_get_and_clear(mm, start, hpte->ptep);
+	spin_unlock(ptl);
+
+	/*
+	 * From now on, any failure exit needs to go through "skip" to
+	 * put old_entry back. If any form of hugetlb_split_to_shift_xxx
+	 * is invoked, it also needs to go through "abort" to get rid of
+	 * the allocated PTEs created before splitting fails.
+	 */
+
+	if (unlikely(huge_pte_none_mostly(old_entry))) {
+		ret = -EAGAIN;
+		goto skip;
+	}
+	if (unlikely(!pte_present(old_entry))) {
+		if (is_hugetlb_entry_migration(old_entry)) {
+			ret = -EBUSY;
+		} else if (is_hugetlb_entry_hwpoisoned(old_entry)) {
+			ret = -EHWPOISON;
+		} else {
+			WARN_ONCE(1, "Unexpected case of non-present HugeTLB PTE\n");
+			ret = -EINVAL;
+		}
+		goto skip;
+	}
+
+	if (!hugetlb_pte_present_leaf(hpte, old_entry)) {
+		WARN_ONCE(1, "HugeTLB present PTE is not leaf\n");
+		ret = -EAGAIN;
+		goto skip;
+	}
+	/* From now on old_entry is present leaf entry. */
+	ret = hugetlb_split_to_shift_present_leaf(mm, vma, old_entry,
+						  start, end, addr,
+						  hpte->shift,
+						  desired_shift);
+	if (ret)
+		goto abort;
+
+	/* Splitting done, new page table entries successfully setup. */
+	mmu_notifier_invalidate_range_end(&range);
+	return 0;
+abort:
+	/* Splitting failed, restoring to the original page table state. */
+	tlb_gather_mmu(&tlb, mm);
+	/* Decrement mapcount for all the split PTEs. */
+	__unmap_hugepage_range(&tlb, vma, start, end, NULL, ZAP_FLAG_DROP_MARKER);
+	/*
+	 * Free any newly allocated page table entries.
+	 * Ok if no new entries allocated at all.
+	 */
+	hugetlb_free_pgd_range(&tlb, start, end, start, end);
+	/* Decrement refcount for all the split PTEs. */
+	tlb_finish_mmu(&tlb);
+skip:
+	/* Restore the old entry. */
+	ptl = hugetlb_pte_lock(hpte);
+	set_huge_pte_at(mm, start, hpte->ptep, old_entry);
+	spin_unlock(ptl);
+	mmu_notifier_invalidate_range_end(&range);
+	return ret;
+}
+
#endif /* CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING */

/*
--
2.40.1.495.gc816e09b53d-goog