[RFC PATCH 26/31] mm: thp: promote PTE-mapped THP to PMD-mapped THP.

From: Zi Yan
Date: Fri Feb 15 2019 - 17:10:12 EST


From: Zi Yan <ziy@xxxxxxxxxx>

Promotion is done in two steps: first promote 512 contiguous base pages to
a PTE-mapped THP, then promote that PTE-mapped THP to a PMD-mapped THP.
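
A minimal sketch of the intended call sequence, assuming a caller that
already holds mmap_sem for write (e.g. an madvise-style path); the wrapper
below is purely illustrative, only the two promote_*_address() helpers come
from this patch:

  static int promote_region(struct vm_area_struct *vma, unsigned long haddr)
  {
          int err;

          /* Step 1: assemble 512 base pages into a PTE-mapped THP in place. */
          err = promote_huge_page_address(vma, haddr);
          if (err)
                  return err;

          /* Step 2: replace the 512 PTEs with a single PMD mapping. */
          return promote_huge_pmd_address(vma, haddr);
  }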

Signed-off-by: Zi Yan <ziy@xxxxxxxxxx>
---
 include/linux/khugepaged.h |   1 +
 mm/filemap.c               |   8 +
 mm/huge_memory.c           | 419 +++++++++++++++++++++++++++++++++++++
 mm/internal.h              |   6 +
 mm/khugepaged.c            |   2 +-
 5 files changed, 435 insertions(+), 1 deletion(-)

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 082d1d2a5216..675c5ee99698 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -55,6 +55,7 @@ static inline int khugepaged_enter(struct vm_area_struct *vma,
return -ENOMEM;
return 0;
}
+void release_pte_pages(pte_t *pte, pte_t *_pte);
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
diff --git a/mm/filemap.c b/mm/filemap.c
index 9f5e323e883e..54babad945ad 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1236,6 +1236,14 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem

#endif

+void __unlock_page(struct page *page)
+{
+ BUILD_BUG_ON(PG_waiters != 7);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
+ wake_up_page_bit(page, PG_locked);
+}
+
/**
* unlock_page - unlock a locked page
* @page: the page
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fa3e12b17621..f856f7e39095 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -4284,3 +4284,422 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
update_mmu_cache_pmd(vma, address, pvmw->pmd);
}
#endif
+
+/* Promote an HPAGE_PMD_SIZE range of a PTE-mapped THP to a PMD mapping.
+ * mmap_sem must be held for write.
+ */
+int promote_huge_pmd_address(struct vm_area_struct *vma, unsigned long haddr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t *pmd, _pmd;
+ pte_t *pte, *_pte;
+ spinlock_t *pmd_ptl, *pte_ptl;
+ struct mmu_notifier_range range;
+ pgtable_t pgtable;
+ struct page *page, *head;
+ unsigned long address = haddr;
+ int ret = -EBUSY;
+
+ VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
+
+ if (haddr < vma->vm_start || (haddr + HPAGE_PMD_SIZE) > vma->vm_end)
+ return -EINVAL;
+
+ pmd = mm_find_pmd(mm, haddr);
+ if (!pmd || pmd_trans_huge(*pmd))
+ goto out;
+
+ anon_vma_lock_write(vma->anon_vma);
+
+ pte = pte_offset_map(pmd, haddr);
+ pte_ptl = pte_lockptr(mm, pmd);
+
+ head = page = vm_normal_page(vma, haddr, *pte);
+ if (!page || !PageTransCompound(page))
+ goto out_unlock;
+ VM_BUG_ON(page != compound_head(page));
+ lock_page(head);
+
+ mmu_notifier_range_init(&range, mm, haddr, haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
+ /*
+ * After this gup_fast can't run anymore. This also removes
+ * any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address
+ * to avoid the risk of CPU bugs in that area.
+ */
+
+ _pmd = pmdp_collapse_flush(vma, haddr, pmd);
+ spin_unlock(pmd_ptl);
+ mmu_notifier_invalidate_range_end(&range);
+
+ /* remove ptes */
+ for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+ _pte++, page++, address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ pr_err("pte none or zero pfn during pmd promotion\n");
+ if (is_zero_pfn(pte_pfn(pteval))) {
+ /*
+ * ptl mostly unnecessary.
+ */
+ spin_lock(pte_ptl);
+ /*
+ * paravirt calls inside pte_clear here are
+ * superfluous.
+ */
+ pte_clear(vma->vm_mm, address, _pte);
+ spin_unlock(pte_ptl);
+ }
+ } else {
+ /*
+ * ptl mostly unnecessary, but preempt has to
+ * be disabled to update the per-cpu stats
+ * inside page_remove_rmap().
+ */
+ spin_lock(pte_ptl);
+ /*
+ * paravirt calls inside pte_clear here are
+ * superfluous.
+ */
+ pte_clear(vma->vm_mm, address, _pte);
+ atomic_dec(&page->_mapcount);
+ /*page_remove_rmap(page, false, 0);*/
+ if (atomic_read(&page->_mapcount) > -1) {
+ SetPageDoubleMap(head);
+ pr_info("page double mapped\n");
+ }
+ spin_unlock(pte_ptl);
+ }
+ }
+ page_ref_sub(head, HPAGE_PMD_NR - 1);
+
+ pte_unmap(pte);
+ pgtable = pmd_pgtable(_pmd);
+
+ _pmd = mk_huge_pmd(head, vma->vm_page_prot);
+ _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+
+ /*
+ * spin_lock() below is not the equivalent of smp_wmb(), so
+ * this is needed to keep earlier writes to the page from
+ * becoming visible after the set_pmd_at() write.
+ */
+ smp_wmb();
+
+ spin_lock(pmd_ptl);
+ VM_BUG_ON(!pmd_none(*pmd));
+ atomic_inc(compound_mapcount_ptr(head));
+ __inc_node_page_state(head, NR_ANON_THPS);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ set_pmd_at(mm, haddr, pmd, _pmd);
+ update_mmu_cache_pmd(vma, haddr, pmd);
+ spin_unlock(pmd_ptl);
+ unlock_page(head);
+ ret = 0;
+
+out_unlock:
+ anon_vma_unlock_write(vma->anon_vma);
+out:
+ return ret;
+}
+
+/* Racy check whether the page can be promoted into a huge page */
+static bool can_promote_huge_page(struct page *page)
+{
+ int extra_pins;
+
+ /* Additional pin from the swap cache */
+ if (PageAnon(page))
+ extra_pins = PageSwapCache(page) ? 1 : 0;
+ else
+ return false;
+ if (PageSwapCache(page))
+ return false;
+ if (PageWriteback(page))
+ return false;
+ return total_mapcount(page) == page_count(page) - extra_pins - 1;
+}
+
+/*
+ * __promote_huge_page_isolate() isolates all subpages of the would-be huge
+ * page into a list; promote_list_to_huge_page() then promotes them in place.
+ */
+
+static int __promote_huge_page_isolate(struct vm_area_struct *vma,
+ unsigned long haddr, pte_t *pte,
+ struct page **head, struct list_head *subpage_list)
+{
+ struct page *page = NULL;
+ pte_t *_pte;
+ bool writable = false;
+ unsigned long address = haddr;
+
+ *head = NULL;
+ lru_add_drain();
+ for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+ _pte++, address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+
+ if (pte_none(pteval) || (pte_present(pteval) &&
+ is_zero_pfn(pte_pfn(pteval))))
+ goto out;
+ if (!pte_present(pteval))
+ goto out;
+ page = vm_normal_page(vma, address, pteval);
+ if (unlikely(!page))
+ goto out;
+
+ if (address == haddr) {
+ *head = page;
+ if (page_to_pfn(page) & ((1<<HPAGE_PMD_ORDER) - 1))
+ goto out;
+ }
+
+ if ((*head + (address - haddr)/PAGE_SIZE) != page)
+ goto out;
+
+ if (PageCompound(page))
+ goto out;
+
+ if (PageMlocked(page))
+ goto out;
+
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+
+ /*
+ * We can do it before isolate_lru_page because the
+ * page can't be freed from under us. NOTE: PG_lock
+ * is needed to serialize against split_huge_page
+ * when invoked from the VM.
+ */
+ if (!trylock_page(page))
+ goto out;
+
+ /*
+ * cannot use mapcount: can't collapse if there's a gup pin.
+ * The page must only be referenced by the scanned process
+ * and page swap cache.
+ */
+ if (page_count(page) != page_mapcount(page) + PageSwapCache(page)) {
+ unlock_page(page);
+ goto out;
+ }
+ if (pte_write(pteval)) {
+ writable = true;
+ } else {
+ if (PageSwapCache(page) &&
+ !reuse_swap_page(page, NULL)) {
+ unlock_page(page);
+ goto out;
+ }
+ /*
+ * Page is not in the swap cache. It can be collapsed
+ * into a THP.
+ */
+ }
+
+ /*
+ * Isolate the page to avoid promoting a page
+ * currently in use by the VM.
+ */
+ if (isolate_lru_page(page)) {
+ unlock_page(page);
+ goto out;
+ }
+
+ inc_node_page_state(page,
+ NR_ISOLATED_ANON + page_is_file_cache(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ }
+ if (likely(writable)) {
+ int i;
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ struct page *p = *head + i;
+
+ list_add_tail(&p->lru, subpage_list);
+ VM_BUG_ON_PAGE(!PageLocked(p), p);
+ }
+ return 1;
+ } else {
+ /*result = SCAN_PAGE_RO;*/
+ }
+
+out:
+ release_pte_pages(pte, _pte);
+ return 0;
+}
+
+/*
+ * This function promotes normal base pages into a huge page. @list points to
+ * all subpages of the huge page to promote; @head points to the head page.
+ *
+ * The caller must hold a pin on the pages on @list, otherwise promotion
+ * fails with -EBUSY. All subpages must be locked.
+ *
+ * After promotion, the head page carries the mapping and flags for the whole
+ * huge page; the tail pages' fields are reset to compound-page state.
+ *
+ * GUP pins and page references are transferred to @head.
+ *
+ * Returns 0 if the hugepage is promoted successfully.
+ * Returns -EBUSY if any subpage is pinned or if anon_vma disappeared from
+ * under us.
+ */
+int promote_list_to_huge_page(struct page *head, struct list_head *list)
+{
+ struct anon_vma *anon_vma = NULL;
+ int ret = 0;
+ DECLARE_BITMAP(subpage_bitmap, HPAGE_PMD_NR);
+ struct page *subpage;
+ int i;
+
+ /* no file-backed page support yet */
+ if (PageAnon(head)) {
+ /*
+ * The caller does not necessarily hold an mmap_sem that would
+ * prevent the anon_vma disappearing so we first take a
+ * reference to it and then lock the anon_vma for write. This
+ * is similar to page_lock_anon_vma_read except the write lock
+ * is taken to serialise against parallel split or collapse
+ * operations.
+ */
+ anon_vma = page_get_anon_vma(head);
+ if (!anon_vma) {
+ ret = -EBUSY;
+ goto out;
+ }
+ anon_vma_lock_write(anon_vma);
+ } else
+ return -EBUSY;
+
+ /* Racy check each subpage to see if any has extra pin */
+ list_for_each_entry(subpage, list, lru) {
+ if (can_promote_huge_page(subpage))
+ bitmap_set(subpage_bitmap, subpage - head, 1);
+ }
+ /* Proceed only if none of subpages has extra pin. */
+ if (!bitmap_full(subpage_bitmap, HPAGE_PMD_NR)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ list_for_each_entry(subpage, list, lru) {
+ enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
+ TTU_RMAP_LOCKED;
+ bool unmap_success;
+
+ if (PageAnon(subpage))
+ ttu_flags |= TTU_SPLIT_FREEZE;
+
+ unmap_success = try_to_unmap(subpage, ttu_flags);
+ VM_BUG_ON_PAGE(!unmap_success, subpage);
+ }
+
+ /* Take care of the migration wait list:
+ * make the compound page first, since it is impossible to move waiting
+ * processes from the subpage wait queues to the head page wait queue.
+ */
+ set_compound_page_dtor(head, COMPOUND_PAGE_DTOR);
+ set_compound_order(head, HPAGE_PMD_ORDER);
+ __SetPageHead(head);
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ struct page *p = head + i;
+
+ p->index = 0;
+ p->mapping = TAIL_MAPPING;
+ p->mem_cgroup = NULL;
+ ClearPageActive(p);
+ /* move subpage refcount to head page */
+ page_ref_add(head, page_count(p) - 1);
+ set_page_count(p, 0);
+ set_compound_head(p, head);
+ }
+ atomic_set(compound_mapcount_ptr(head), -1);
+ prep_transhuge_page(head);
+
+ remap_page(head);
+
+ if (!mem_cgroup_disabled())
+ mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, HPAGE_PMD_NR);
+
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ struct page *subpage = head + i;
+ __unlock_page(subpage);
+ }
+
+ INIT_LIST_HEAD(&head->lru);
+ unlock_page(head);
+ putback_lru_page(head);
+
+ mod_node_page_state(page_pgdat(head),
+ NR_ISOLATED_ANON + page_is_file_cache(head), -HPAGE_PMD_NR);
+out_unlock:
+ if (anon_vma) {
+ anon_vma_unlock_write(anon_vma);
+ put_anon_vma(anon_vma);
+ }
+out:
+ return ret;
+}
+
+static int promote_huge_page_isolate(struct vm_area_struct *vma,
+ unsigned long haddr,
+ struct page **head, struct list_head *subpage_list)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t *pmd;
+ pte_t *pte;
+ spinlock_t *pte_ptl;
+ int ret = -EBUSY;
+
+ pmd = mm_find_pmd(mm, haddr);
+ if (!pmd || pmd_trans_huge(*pmd))
+ goto out;
+
+ anon_vma_lock_write(vma->anon_vma);
+
+ pte = pte_offset_map(pmd, haddr);
+ pte_ptl = pte_lockptr(mm, pmd);
+
+ spin_lock(pte_ptl);
+ ret = __promote_huge_page_isolate(vma, haddr, pte, head, subpage_list);
+ spin_unlock(pte_ptl);
+
+ if (unlikely(!ret)) {
+ pte_unmap(pte);
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ ret = 0;
+ /*
+ * All pages are isolated and locked so anon_vma rmap
+ * can't run anymore.
+ */
+out_unlock:
+ anon_vma_unlock_write(vma->anon_vma);
+out:
+ return ret;
+}
+
+/* Assumes mmap_sem is held for write; wrapper for madvise. */
+int promote_huge_page_address(struct vm_area_struct *vma, unsigned long haddr)
+{
+ LIST_HEAD(subpage_list);
+ struct page *head;
+
+ if (haddr & (HPAGE_PMD_SIZE - 1))
+ return -EINVAL;
+
+ if (haddr < vma->vm_start || (haddr + HPAGE_PMD_SIZE) > vma->vm_end)
+ return -EINVAL;
+
+ if (promote_huge_page_isolate(vma, haddr, &head, &subpage_list))
+ return -EBUSY;
+
+ return promote_list_to_huge_page(head, &subpage_list);
+}
diff --git a/mm/internal.h b/mm/internal.h
index 70a6ef603e5b..c5e5a0f1cc58 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -581,4 +581,10 @@ int expand_free_page(struct zone *zone, struct page *buddy_head,
void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
unsigned int alloc_flags);

+void __unlock_page(struct page *page);
+
+int promote_huge_pmd_address(struct vm_area_struct *vma, unsigned long haddr);
+
+int promote_huge_page_address(struct vm_area_struct *vma, unsigned long haddr);
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 3acfddcba714..ff059353ebc3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -508,7 +508,7 @@ static void release_pte_page(struct page *page)
putback_lru_page(page);
}

-static void release_pte_pages(pte_t *pte, pte_t *_pte)
+void release_pte_pages(pte_t *pte, pte_t *_pte)
{
while (--_pte >= pte) {
pte_t pteval = *_pte;
--
2.20.1