[PATCH v2] hugetlb: unshare some PMDs when splitting VMAs

From: James Houghton
Date: Wed Jan 04 2023 - 18:26:04 EST


PMD sharing can only be done in PUD_SIZE-aligned pieces of VMAs;
however, it is possible that HugeTLB VMAs are split without unsharing
the PMDs first.

Without this fix, it is possible to hit the uffd-wp-related WARN_ON_ONCE
in hugetlb_change_protection [1]. The key there is that
hugetlb_unshare_all_pmds will not attempt to unshare PMDs in
non-PUD_SIZE-aligned sections of the VMA.

It might seem ideal to unshare in hugetlb_vm_op_open, but we need to
unshare in both the new and old VMAs, so unsharing in
hugetlb_vm_op_split seems natural.

[1]: https://lore.kernel.org/linux-mm/CADrL8HVeOkj0QH5VZZbRzybNE8CG-tEGFshnA+bG9nMgcWtBSg@xxxxxxxxxxxxxx/

Fixes: 6dfeaff93be1 ("hugetlb/userfaultfd: unshare all pmds for hugetlbfs when register wp")
Signed-off-by: James Houghton <jthoughton@xxxxxxxxxx>
---
mm/hugetlb.c | 44 +++++++++++++++++++++++++++++++++++---------
1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b39b74e0591a..b6976da0fa4d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -94,6 +94,8 @@ static int hugetlb_acct_memory(struct hstate *h, long delta);
static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end);

static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
@@ -4828,6 +4830,25 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
{
if (addr & ~(huge_page_mask(hstate_vma(vma))))
return -EINVAL;
+
+ /*
+ * PMD sharing is only possible for PUD_SIZE-aligned address ranges
+ * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
+ * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+ */
+ if (addr & ~PUD_MASK) {
+ /*
+ * hugetlb_vm_op_split is called right before we attempt to
+ * split the VMA. We will need to unshare PMDs in the old and
+ * new VMAs, so let's unshare before we split.
+ */
+ unsigned long floor = addr & PUD_MASK;
+ unsigned long ceil = floor + PUD_SIZE;
+
+ if (floor >= vma->vm_start && ceil <= vma->vm_end)
+ hugetlb_unshare_pmds(vma, floor, ceil);
+ }
+
return 0;
}

@@ -7313,26 +7334,21 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
}
}

-/*
- * This function will unconditionally remove all the shared pmd pgtable entries
- * within the specific vma for a hugetlbfs memory range.
- */
-void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
- unsigned long address, start, end;
+ unsigned long address;
spinlock_t *ptl;
pte_t *ptep;

if (!(vma->vm_flags & VM_MAYSHARE))
return;

- start = ALIGN(vma->vm_start, PUD_SIZE);
- end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
-
if (start >= end)
return;

@@ -7364,6 +7380,16 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
mmu_notifier_invalidate_range_end(&range);
}

+/*
+ * This function will unconditionally remove all the shared pmd pgtable entries
+ * within the specific vma for a hugetlbfs memory range.
+ */
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+{
+ hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+}
+
#ifdef CONFIG_CMA
static bool cma_reserve_called __initdata;

--
2.39.0.314.g84b9a713c41-goog