[RFC PATCH 13/31] mm: thp: 1GB THP copy on write implementation.

From: Zi Yan
Date: Fri Feb 15 2019 - 17:11:02 EST


From: Zi Yan <ziy@xxxxxxxxxx>

COW on 1GB THPs falls back to 2MB THPs if a new 1GB THP cannot be allocated.
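
When a write fault hits a write-protected 1GB THP, the page is reused in place
if it is exclusively mapped; otherwise a fresh 1GB THP is allocated and the
data is copied. If no 1GB page can be allocated, do_huge_pud_wp_page_fallback()
copies the data into 2MB THPs mapped by PMD entries, and if even that fails the
PUD mapping is split with split_huge_pud(). copy_huge_pud() now also copies
anonymous 1GB THP mappings at fork time, write-protecting the parent's mapping
and depositing a pre-allocated PMD page table with the copy.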

Signed-off-by: Zi Yan <ziy@xxxxxxxxxx>
---
arch/x86/include/asm/pgalloc.h | 9 +
include/linux/huge_mm.h | 5 +
mm/huge_memory.c | 319 ++++++++++++++++++++++++++++++++-
mm/memory.c | 2 +-
4 files changed, 331 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 6e29ad9b9d7f..ebcb022f6bb9 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -110,6 +110,15 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,

#define pmd_pgtable(pmd) pmd_page(pmd)

+static inline void pud_populate_with_pgtable(struct mm_struct *mm, pud_t *pud,
+ struct page *pte)
+{
+ unsigned long pfn = page_to_pfn(pte);
+
+ paravirt_alloc_pmd(mm, pfn);
+ set_pud(pud, __pud(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
+}
+
#if CONFIG_PGTABLE_LEVELS > 2
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index c6272e6ffc35..02419fa91e12 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -19,6 +19,7 @@ extern int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
extern void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud);
extern int do_huge_pud_anonymous_page(struct vm_fault *vmf);
+extern int do_huge_pud_wp_page(struct vm_fault *vmf, pud_t orig_pud);
#else
static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
@@ -27,6 +28,10 @@ extern int do_huge_pud_anonymous_page(struct vm_fault *vmf)
{
return VM_FAULT_FALLBACK;
}
+static inline int do_huge_pud_wp_page(struct vm_fault *vmf, pud_t orig_pud)
+{
+ return VM_FAULT_FALLBACK;
+}
#endif

extern vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cad4ef01f607..0a006592f3fe 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1284,7 +1284,12 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
{
spinlock_t *dst_ptl, *src_ptl;
pud_t pud;
- int ret;
+ pmd_t *pmd_pgtable = NULL;
+ int ret = -ENOMEM;
+
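+ /*
+ * Pre-allocate a PMD page table to deposit with the copied PUD
+ * mapping; it is freed again on the paths that do not use it.
+ */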
+ pmd_pgtable = pmd_alloc_one_page_with_ptes(vma->vm_mm, addr);
+ if (unlikely(!pmd_pgtable))
+ goto out;

dst_ptl = pud_lock(dst_mm, dst_pud);
src_ptl = pud_lockptr(src_mm, src_pud);
@@ -1292,8 +1297,13 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,

ret = -EAGAIN;
pud = *src_pud;
- if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
+ if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud))) {
+ pmd_free_page_with_ptes(dst_mm, pmd_pgtable);
goto out_unlock;
+ }
+
+ if (pud_devmap(pud))
+ pmd_free_page_with_ptes(dst_mm, pmd_pgtable);

/*
* When page table lock is held, the huge zero pud should not be
@@ -1301,7 +1311,32 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* a page table.
*/
if (is_huge_zero_pud(pud)) {
- /* No huge zero pud yet */
+ struct page *zero_page;
+ /*
+ * mm_get_huge_pud_zero_page() will never allocate a new page here,
+ * since we already have a zero page to copy. It just takes a
+ * reference.
+ */
+ zero_page = mm_get_huge_pud_zero_page(dst_mm);
+ set_huge_pud_zero_page(virt_to_page(pmd_pgtable),
+ dst_mm, vma, addr, dst_pud, zero_page);
+ ret = 0;
+ goto out_unlock;
+ }
+
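+ /*
+ * Anonymous 1GB THP: share the page between parent and child by
+ * taking a reference and duplicating the rmap, update the counters,
+ * and deposit the pre-allocated PMD page table for a later split.
+ */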
+ if (pud_trans_huge(pud)) {
+ struct page *src_page;
+ int i;
+
+ src_page = pud_page(pud);
+ VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+ get_page(src_page);
+ page_dup_rmap(src_page, true);
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PUD_NR);
+ mm_inc_nr_pmds(dst_mm);
+ for (i = 0; i < (1<<(HPAGE_PUD_ORDER - HPAGE_PMD_ORDER)); i++)
+ mm_inc_nr_ptes(dst_mm);
+ pgtable_trans_huge_pud_deposit(dst_mm, dst_pud, virt_to_page(pmd_pgtable));
}

pudp_set_wrprotect(src_mm, addr, src_pud);
@@ -1312,6 +1347,7 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
out_unlock:
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
+out:
return ret;
}

@@ -1335,6 +1371,283 @@ void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
unlock:
spin_unlock(vmf->ptl);
}
+
+static int do_huge_pud_wp_page_fallback(struct vm_fault *vmf, pud_t orig_pud,
+ struct page *page)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long haddr = vmf->address & HPAGE_PUD_MASK;
+ struct mem_cgroup *memcg;
+ pgtable_t pgtable, pmd_pgtable;
+ pud_t _pud;
+ int ret = 0, i, j;
+ struct page **pages;
+ struct mmu_notifier_range range;
+
+ pages = kmalloc(sizeof(struct page *) *
+ (1<<(HPAGE_PUD_ORDER-HPAGE_PMD_ORDER)), GFP_KERNEL);
+ if (unlikely(!pages)) {
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ pmd_pgtable = pte_alloc_order(vma->vm_mm, haddr,
+ HPAGE_PUD_ORDER - HPAGE_PMD_ORDER);
+ if (!pmd_pgtable) {
+ ret |= VM_FAULT_OOM;
+ goto out_kfree_pages;
+ }
+
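+ /*
+ * Allocate one 2MB THP for each PMD-sized piece of the 1GB page and
+ * charge each of them to the memcg.
+ */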
+ for (i = 0; i < (1<<(HPAGE_PUD_ORDER-HPAGE_PMD_ORDER)); i++) {
+ pages[i] = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
+ vmf->address, page_to_nid(page), true);
+ if (unlikely(!pages[i] ||
+ mem_cgroup_try_charge(pages[i], vma->vm_mm,
+ GFP_KERNEL, &memcg, true))) {
+ if (pages[i])
+ put_page(pages[i]);
+ while (--i >= 0) {
+ memcg = (void *)page_private(pages[i]);
+ set_page_private(pages[i], 0);
+ mem_cgroup_cancel_charge(pages[i], memcg,
+ true);
+ put_page(pages[i]);
+ }
+ kfree(pages);
+ pte_free_order(vma->vm_mm, pmd_pgtable,
+ HPAGE_PUD_ORDER - HPAGE_PMD_ORDER);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+ count_vm_event(THP_FAULT_ALLOC);
+ set_page_private(pages[i], (unsigned long)memcg);
+ prep_transhuge_page(pages[i]);
+ }
+
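+ /* Copy the old 1GB page into the new 2MB pages, one base page at a time. */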
+ for (i = 0; i < (1<<(HPAGE_PUD_ORDER-HPAGE_PMD_ORDER)); i++) {
+ for (j = 0; j < HPAGE_PMD_NR; j++) {
+ copy_user_highpage(pages[i] + j, page + i * HPAGE_PMD_NR + j,
+ haddr + PAGE_SIZE * (i * HPAGE_PMD_NR + j), vma);
+ cond_resched();
+ }
+ __SetPageUptodate(pages[i]);
+ }
+
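+ /* Tell MMU notifiers that the whole 1GB range is about to change. */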
+ mmu_notifier_range_init(&range, vma->vm_mm, haddr,
+ haddr + HPAGE_PUD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+
+ vmf->ptl = pud_lock(vma->vm_mm, vmf->pud);
+ if (unlikely(!pud_same(*vmf->pud, orig_pud)))
+ goto out_free_pages;
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+
+ /*
+ * Leave the pud empty until the pmds are filled. Note that we must
+ * notify here, as a concurrent CPU thread might write to the new
+ * pages before the call to mmu_notifier_invalidate_range_end()
+ * happens, which can lead to a device seeing memory writes in a
+ * different order than the CPU.
+ *
+ * See Documentation/vm/mmu_notifier.txt
+ */
+ pudp_huge_clear_flush_notify(vma, haddr, vmf->pud);
+
+ pgtable = pgtable_trans_huge_pud_withdraw(vma->vm_mm, vmf->pud);
+ pud_populate_with_pgtable(vma->vm_mm, &_pud, pgtable);
+
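+ /*
+ * Map each new 2MB page with a writable PMD entry in the local page
+ * table, then publish that table through the PUD below.
+ */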
+ for (i = 0; i < (1<<(HPAGE_PUD_ORDER-HPAGE_PMD_ORDER));
+ i++, haddr += (PAGE_SIZE * HPAGE_PMD_NR)) {
+ pmd_t entry;
+
+ entry = mk_huge_pmd(pages[i], vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ memcg = (void *)page_private(pages[i]);
+ set_page_private(pages[i], 0);
+ page_add_new_anon_rmap(pages[i], vmf->vma, haddr, true);
+ mem_cgroup_commit_charge(pages[i], memcg, false, true);
+ lru_cache_add_active_or_unevictable(pages[i], vma);
+ vmf->pmd = pmd_offset(&_pud, haddr);
+ VM_BUG_ON(!pmd_none(*vmf->pmd));
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, &pmd_pgtable[i]);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+ }
+ kfree(pages);
+
+ smp_wmb(); /* make pte visible before pmd */
+ pud_populate_with_pgtable(vma->vm_mm, vmf->pud, pgtable);
+ page_remove_rmap(page, true);
+ spin_unlock(vmf->ptl);
+
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above pudp_huge_clear_flush_notify() did already call it.
+ */
+ mmu_notifier_invalidate_range_only_end(&range);
+
+ ret |= VM_FAULT_WRITE;
+ put_page(page);
+
+out:
+ return ret;
+
+out_free_pages:
+ spin_unlock(vmf->ptl);
+ mmu_notifier_invalidate_range_end(&range);
+ for (i = 0; i < (1<<(HPAGE_PUD_ORDER-HPAGE_PMD_ORDER)); i++) {
+ memcg = (void *)page_private(pages[i]);
+ set_page_private(pages[i], 0);
+ mem_cgroup_cancel_charge(pages[i], memcg, true);
+ put_page(pages[i]);
+ }
+out_kfree_pages:
+ kfree(pages);
+ goto out;
+}
+
+int do_huge_pud_wp_page(struct vm_fault *vmf, pud_t orig_pud)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct page *page = NULL, *new_page;
+ struct mem_cgroup *memcg;
+ unsigned long haddr = vmf->address & HPAGE_PUD_MASK;
+ struct mmu_notifier_range range;
+ gfp_t huge_gfp; /* for allocation and charge */
+ int ret = 0;
+
+ vmf->ptl = pud_lockptr(vma->vm_mm, vmf->pud);
+ VM_BUG_ON_VMA(!vma->anon_vma, vma);
+ if (is_huge_zero_pud(orig_pud))
+ goto alloc;
+ spin_lock(vmf->ptl);
+ if (unlikely(!pud_same(*vmf->pud, orig_pud)))
+ goto out_unlock;
+
+ page = pud_page(orig_pud);
+ VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
+ /*
+ * We can only reuse the page if nobody else maps the huge page or it's
+ * part.
+ */
+ if (!trylock_page(page)) {
+ get_page(page);
+ spin_unlock(vmf->ptl);
+ lock_page(page);
+ spin_lock(vmf->ptl);
+ if (unlikely(!pud_same(*vmf->pud, orig_pud))) {
+ unlock_page(page);
+ put_page(page);
+ goto out_unlock;
+ }
+ put_page(page);
+ }
+ if (reuse_swap_page(page, NULL)) {
+ pud_t entry;
+
+ entry = pud_mkyoung(orig_pud);
+ entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
+ if (pudp_set_access_flags(vma, haddr, vmf->pud, entry, 1))
+ update_mmu_cache_pud(vma, vmf->address, vmf->pud);
+ ret |= VM_FAULT_WRITE;
+ unlock_page(page);
+ goto out_unlock;
+ }
+ unlock_page(page);
+ get_page(page);
+ spin_unlock(vmf->ptl);
+alloc:
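+ /* Try to allocate a fresh 1GB THP to copy into; fall back on failure. */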
+ if (transparent_hugepage_enabled(vma) &&
+ !transparent_hugepage_debug_cow()) {
+ huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+ new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PUD_ORDER);
+ } else
+ new_page = NULL;
+
+ if (likely(new_page)) {
+ prep_transhuge_page(new_page);
+ } else {
+ if (!page) {
+ WARN(1, "%s: split_huge_page\n", __func__);
+ split_huge_pud(vma, vmf->pud, vmf->address);
+ ret |= VM_FAULT_FALLBACK;
+ } else {
+ ret = do_huge_pud_wp_page_fallback(vmf, orig_pud, page);
+ if (ret & VM_FAULT_OOM) {
+ WARN(1, "%s: split_huge_page after wp fallback\n", __func__);
+ split_huge_pud(vma, vmf->pud, vmf->address);
+ ret |= VM_FAULT_FALLBACK;
+ }
+ put_page(page);
+ }
+ count_vm_event(THP_FAULT_FALLBACK_PUD);
+ goto out;
+ }
+
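+ /*
+ * Charge the new 1GB page before copying; if that fails, split the
+ * PUD mapping so the fault can be handled at a smaller page size.
+ */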
+ if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
+ huge_gfp, &memcg, true))) {
+ put_page(new_page);
+ WARN(1, "%s: split_huge_page after mem cgroup failed\n", __func__);
+ split_huge_pud(vma, vmf->pud, vmf->address);
+ if (page)
+ put_page(page);
+ ret |= VM_FAULT_FALLBACK;
+ count_vm_event(THP_FAULT_FALLBACK_PUD);
+ goto out;
+ }
+
+ count_vm_event(THP_FAULT_ALLOC_PUD);
+
+ if (!page)
+ clear_huge_page(new_page, vmf->address, HPAGE_PUD_NR);
+ else
+ copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PUD_NR);
+ __SetPageUptodate(new_page);
+
+ mmu_notifier_range_init(&range, vma->vm_mm, haddr,
+ haddr + HPAGE_PUD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+
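+ /* Re-take the PUD lock and make sure the entry did not change under us. */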
+ spin_lock(vmf->ptl);
+ if (page)
+ put_page(page);
+ if (unlikely(!pud_same(*vmf->pud, orig_pud))) {
+ spin_unlock(vmf->ptl);
+ mem_cgroup_cancel_charge(new_page, memcg, true);
+ put_page(new_page);
+ goto out_mn;
+ } else {
+ pud_t entry;
+
+ entry = mk_huge_pud(new_page, vma->vm_page_prot);
+ entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
+ pudp_huge_clear_flush_notify(vma, haddr, vmf->pud);
+ page_add_new_anon_rmap(new_page, vma, haddr, true);
+ mem_cgroup_commit_charge(new_page, memcg, false, true);
+ lru_cache_add_active_or_unevictable(new_page, vma);
+ set_pud_at(vma->vm_mm, haddr, vmf->pud, entry);
+ update_mmu_cache_pud(vma, vmf->address, vmf->pud);
+ if (!page) {
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PUD_NR);
+ } else {
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ page_remove_rmap(page, true);
+ put_page(page);
+ }
+ ret |= VM_FAULT_WRITE;
+ }
+ spin_unlock(vmf->ptl);
+out_mn:
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above pudp_huge_clear_flush_notify() did already call it.
+ */
+ mmu_notifier_invalidate_range_only_end(&range);
+out:
+ return ret;
+out_unlock:
+ spin_unlock(vmf->ptl);
+ return ret;
+}
+
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
diff --git a/mm/memory.c b/mm/memory.c
index 177478d5ee47..3608b5436519 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3722,7 +3722,7 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vmf->vma))
- return VM_FAULT_FALLBACK;
+ return do_huge_pud_wp_page(vmf, orig_pud);
if (vmf->vma->vm_ops->huge_fault)
return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
--
2.20.1