[PATCH] thp: Simplify splitting PMD-mapped huge zero page

From: Kirill A. Shutemov
Date: Fri Mar 27 2020 - 13:03:56 EST


Splitting a PMD-mapped huge zero page can be simplified a lot: we can
just unmap it and fall back to PTE handling.
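
With this change the huge zero page takes the same early-unmap path as
file and DAX mappings in __split_huge_pmd_locked(): the PMD is cleared
and flushed with MMU notifier invalidation, the deposited page table is
zapped where the architecture requires an explicit deposit, and we
return without populating PTEs. A later access to the range faults at
PTE level, where the normal anonymous fault path takes over (the zero
page for reads, a freshly allocated page for writes).

A rough sketch of the resulting flow, simplified and not a verbatim
excerpt of the function:

	if (!vma_is_anonymous(vma) || is_huge_zero_pmd(*pmd)) {
		/* Unmap the huge page and notify secondary MMUs. */
		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
		if (arch_needs_pgtable_deposit())
			zap_deposited_table(mm, pmd);
		/*
		 * Zero page (or DAX): no page reference or dirty state
		 * to transfer, so we are done here.
		 */
		if (vma_is_dax(vma) || is_huge_zero_pmd(*pmd))
			return;
		/* ... file mappings: dirty/refcount handling as before ... */
	}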

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
---
mm/huge_memory.c | 57 ++++--------------------------------------------
1 file changed, 4 insertions(+), 53 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 42407e16bd80..ef6a6bcb291f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2114,40 +2114,6 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

-static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
- unsigned long haddr, pmd_t *pmd)
-{
- struct mm_struct *mm = vma->vm_mm;
- pgtable_t pgtable;
- pmd_t _pmd;
- int i;
-
- /*
- * Leave pmd empty until pte is filled note that it is fine to delay
- * notification until mmu_notifier_invalidate_range_end() as we are
- * replacing a zero pmd write protected page with a zero pte write
- * protected page.
- *
- * See Documentation/vm/mmu_notifier.rst
- */
- pmdp_huge_clear_flush(vma, haddr, pmd);
-
- pgtable = pgtable_trans_huge_withdraw(mm, pmd);
- pmd_populate(mm, &_pmd, pgtable);
-
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t *pte, entry;
- entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
- entry = pte_mkspecial(entry);
- pte = pte_offset_map(&_pmd, haddr);
- VM_BUG_ON(!pte_none(*pte));
- set_pte_at(mm, haddr, pte, entry);
- pte_unmap(pte);
- }
- smp_wmb(); /* make pte visible before pmd */
- pmd_populate(mm, pmd, pgtable);
-}
-
static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long haddr, bool freeze)
{
@@ -2167,7 +2133,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,

count_vm_event(THP_SPLIT_PMD);

- if (!vma_is_anonymous(vma)) {
+ if (!vma_is_anonymous(vma) || is_huge_zero_pmd(*pmd)) {
_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/*
* We are going to unmap this huge page. So
@@ -2175,7 +2141,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
*/
if (arch_needs_pgtable_deposit())
zap_deposited_table(mm, pmd);
- if (vma_is_dax(vma))
+ if (vma_is_dax(vma) || is_huge_zero_pmd(*pmd))
return;
page = pmd_page(_pmd);
if (!PageDirty(page) && pmd_dirty(_pmd))
@@ -2186,17 +2152,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
put_page(page);
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
- } else if (is_huge_zero_pmd(*pmd)) {
- /*
- * FIXME: Do we want to invalidate secondary mmu by calling
- * mmu_notifier_invalidate_range() see comments below inside
- * __split_huge_pmd() ?
- *
- * We are going from a zero huge page write protected to zero
- * small page also write protected so it does not seems useful
- * to invalidate secondary mmu at this time.
- */
- return __split_huge_zero_page_pmd(vma, haddr, pmd);
}

/*
@@ -2339,13 +2294,9 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
spin_unlock(ptl);
/*
* No need to double call mmu_notifier->invalidate_range() callback.
- * They are 3 cases to consider inside __split_huge_pmd_locked():
+ * There are 2 cases to consider inside __split_huge_pmd_locked():
* 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious
- * 2) __split_huge_zero_page_pmd() read only zero page and any write
- * fault will trigger a flush_notify before pointing to a new page
- * (it is fine if the secondary mmu keeps pointing to the old zero
- * page in the meantime)
- * 3) Split a huge pmd into pte pointing to the same page. No need
+ * 2) Split a huge pmd into pte pointing to the same page. No need
* to invalidate secondary tlb entry they are all still valid.
* any further changes to individual pte will notify. So no need
* to call mmu_notifier->invalidate_range()
--
2.26.0