[RFC PATCH 3/3] Change THP behavior

From: Alex Thorlton
Date: Thu Dec 12 2013 - 13:01:08 EST


This patch implements the functionality we're really going for here.
It adds the decision-making behavior that determines when to grab a
temporary compound page, and whether to fault in single pages or to
turn the temporary page into a THP. This one is rather large; I might
split it up a bit more for later versions.

I've left most of my comments in here to give people some insight into
my reasoning for doing things a certain way. They will probably be
trimmed down in later versions of the patch.
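
Before wading into the diff, here's a rough user-space sketch of the
bookkeeping this patch performs: reserve a 512-page chunk on the first
fault against a PMD, hand out single pages from it while counting
faults, and promote the chunk to a THP once the per-mm threshold is
crossed. This is only an illustrative model; the threshold value, the
singly-linked list, and the lack of locking are simplifications, not
the patch's actual code:

/*
 * Stand-alone, illustrative model of the decision logic in
 * do_huge_pmd_temp_page(). Compile with: gcc -o thp-model thp-model.c
 */
#include <stdio.h>
#include <stdlib.h>

#define HPAGE_PMD_NR	512		/* pages per 2M chunk (4K pages) */

struct temp_hugepage {
	unsigned long pmd;		/* stand-in for the pmd_t pointer */
	int ref_count;			/* pages faulted in from the chunk */
	int promoted;			/* chunk already turned into a THP */
	struct temp_hugepage *next;
};

static struct temp_hugepage *thp_temp_list;
static int thp_threshold = 8;		/* per-mm threshold (illustrative) */

static struct temp_hugepage *find_pmd_mm_freelist(unsigned long pmd)
{
	struct temp_hugepage *t;

	for (t = thp_temp_list; t; t = t->next)
		if (t->pmd == pmd)
			return t;
	return NULL;
}

/* Returns 1 if the fault is covered by a huge mapping, 0 for a base page */
static int fault_one_page(unsigned long pfn)
{
	unsigned long pmd = pfn / HPAGE_PMD_NR;
	struct temp_hugepage *t = find_pmd_mm_freelist(pmd);

	if (!t) {
		/* first touch: reserve a chunk, fault in one base page */
		t = calloc(1, sizeof(*t));
		if (!t)
			exit(EXIT_FAILURE);
		t->pmd = pmd;
		t->ref_count = 1;
		t->next = thp_temp_list;
		thp_temp_list = t;
		return 0;
	}

	if (t->promoted)
		return 1;		/* already mapped by the huge pmd */

	if (++t->ref_count >= thp_threshold) {
		/* the patch does make_compound_page() + set_pmd_at() here */
		t->promoted = 1;
		return 1;
	}
	return 0;			/* keep handing out base pages */
}

int main(void)
{
	unsigned long pfn;

	/* touch the first 16 pages of one 2M region */
	for (pfn = 0; pfn < 16; pfn++)
		printf("page %2lu -> %s\n", pfn,
		       fault_one_page(pfn) ? "huge mapping" : "base page");
	return 0;
}

In the real patch the same decision happens under mm->thp_list_lock,
the chunk comes from alloc_temp_hugepage_vma(), and promotion goes
through make_compound_page() plus mk_huge_pmd()/set_pmd_at() rather
than a simple flag.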

Signed-off-by: Alex Thorlton <athorlton@xxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Nate Zimmer <nzimmer@xxxxxxx>
Cc: Cliff Wickman <cpw@xxxxxxx>
Cc: "Kirill A. Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx>
Cc: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Wanpeng Li <liwanp@xxxxxxxxxxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: Michel Lespinasse <walken@xxxxxxxxxx>
Cc: Benjamin LaHaise <bcrl@xxxxxxxxx>
Cc: Oleg Nesterov <oleg@xxxxxxxxxx>
Cc: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx>
Cc: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
Cc: David Rientjes <rientjes@xxxxxxxxxx>
Cc: Zhang Yanfei <zhangyanfei@xxxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxx>
Cc: Jiang Liu <jiang.liu@xxxxxxxxxx>
Cc: Cody P Schafer <cody@xxxxxxxxxxxxxxxxxx>
Cc: Glauber Costa <glommer@xxxxxxxxxxxxx>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
Cc: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
Cc: linux-kernel@xxxxxxxxxxxxxxx
Cc: linux-mm@xxxxxxxxx

---
include/linux/huge_mm.h | 6 +
include/linux/mm_types.h | 13 +++
kernel/fork.c | 1 +
mm/huge_memory.c | 286 +++++++++++++++++++++++++++++++++++++++++++++++
mm/internal.h | 1 +
mm/memory.c | 29 ++++-
mm/page_alloc.c | 66 ++++++++++-
7 files changed, 395 insertions(+), 7 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 0943b1b6..c1e407d 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -5,6 +5,12 @@ extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
unsigned int flags);
+extern struct temp_hugepage *find_pmd_mm_freelist(struct mm_struct *mm,
+ pmd_t *pmd);
+extern int do_huge_pmd_temp_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ unsigned int flags);
extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index b5efa23..d48c6ab 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -322,6 +322,17 @@ struct mm_rss_stat {
atomic_long_t count[NR_MM_COUNTERS];
};

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+struct temp_hugepage {
+ pmd_t *pmd;
+ struct page *page;
+ spinlock_t temp_hugepage_lock;
+ int node; /* node id of the first page in the chunk */
+ int ref_count; /* number of pages faulted in from the chunk */
+ struct list_head list; /* pointers to next/prev chunks */
+};
+#endif
+
struct kioctx_table;
struct mm_struct {
struct vm_area_struct * mmap; /* list of VMAs */
@@ -408,7 +419,9 @@ struct mm_struct {
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
pgtable_t pmd_huge_pte; /* protected by page_table_lock */
+ spinlock_t thp_list_lock; /* lock to protect thp_temp_list */
int thp_threshold;
+ struct list_head thp_temp_list; /* list of 512 page chunks for THPs */
#endif
#ifdef CONFIG_CPUMASK_OFFSTACK
struct cpumask cpumask_allocation;
diff --git a/kernel/fork.c b/kernel/fork.c
index 086fe73..a3ccf857 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -816,6 +816,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
mm->pmd_huge_pte = NULL;
+ INIT_LIST_HEAD(&mm->thp_temp_list);
#endif
#ifdef CONFIG_NUMA_BALANCING
mm->first_nid = NUMA_PTE_SCAN_INIT;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5d388e4..43ea095 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -788,6 +788,20 @@ static inline struct page *alloc_hugepage_vma(int defrag,
HPAGE_PMD_ORDER, vma, haddr, nd);
}

+static inline gfp_t alloc_temp_hugepage_gfpmask(gfp_t extra_gfp)
+{
+ return GFP_TEMP_TRANSHUGE | extra_gfp;
+}
+
+static inline struct page *alloc_temp_hugepage_vma(int defrag,
+ struct vm_area_struct *vma,
+ unsigned long haddr, int nd,
+ gfp_t extra_gfp)
+{
+ return alloc_pages_vma(alloc_temp_hugepage_gfpmask(extra_gfp),
+ HPAGE_PMD_ORDER, vma, haddr, nd);
+}
+
#ifndef CONFIG_NUMA
static inline struct page *alloc_hugepage(int defrag)
{
@@ -871,6 +885,278 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
}

+/* We need to hold mm->thp_list_lock during this search */
+struct temp_hugepage *find_pmd_mm_freelist(struct mm_struct *mm, pmd_t *pmd)
+{
+ struct temp_hugepage *temp_thp;
+ /*
+ * check to make sure that the PMD isn't already on the list;
+ * return the temp_hugepage struct if we find one, otherwise
+ * we just return NULL
+ */
+ list_for_each_entry(temp_thp, &mm->thp_temp_list, list) {
+ if (temp_thp->pmd == pmd) {
+ return temp_thp;
+ }
+ }
+
+ return NULL;
+}
+
+int do_huge_pmd_temp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ unsigned int flags)
+{
+ int i;
+ spinlock_t *ptl;
+ struct page *page;
+ pte_t *pte;
+ pte_t entry;
+ struct temp_hugepage *temp_thp;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+
+ if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+ return VM_FAULT_FALLBACK;
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+ if (unlikely(khugepaged_enter(vma)))
+ return VM_FAULT_OOM;
+ /*
+ * we don't do anything special for this case yet; for now
+ * read faults just get the huge zero page, as usual
+ */
+ if (!(flags & FAULT_FLAG_WRITE) &&
+ transparent_hugepage_use_zero_page()) {
+ pgtable_t pgtable;
+ struct page *zero_page;
+ bool set;
+ pgtable = pte_alloc_one(mm, haddr);
+ if (unlikely(!pgtable))
+ return VM_FAULT_OOM;
+ zero_page = get_huge_zero_page();
+ if (unlikely(!zero_page)) {
+ pte_free(mm, pgtable);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+ spin_lock(&mm->page_table_lock);
+ set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+ zero_page);
+ spin_unlock(&mm->page_table_lock);
+ if (!set) {
+ pte_free(mm, pgtable);
+ put_huge_zero_page();
+ }
+ return 0;
+ }
+
+ /*
+ * Here's where we either need to store the PMD on the list
+ * and give them a regular page, or make the decision to flip
+ * the PSE bit and send them back with a hugepage.
+ *
+ * + First we call find_pmd_mm_freelist to determine whether
+ * the pmd we're interested in already has pages faulted in
+ */
+ spin_lock(&mm->thp_list_lock);
+ temp_thp = find_pmd_mm_freelist(mm, pmd);
+
+ /* this is a temporary workaround to avoid putting the pages back on the freelist */
+ if (temp_thp && temp_thp->node == -1) {
+ spin_unlock(&mm->thp_list_lock);
+ goto single_fault;
+ }
+
+ /*
+ * we need to create a list entry and add it to the
+ * new per-mm free list if we didn't find an existing
+ * entry
+ */
+ if (!temp_thp && pmd_none(*pmd)) {
+ /* try to get 512 pages from the freelist */
+ page = alloc_temp_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr, numa_node_id(), 0);
+
+ if (unlikely(!page)) {
+ spin_unlock(&mm->thp_list_lock);
+ /* we should probably change the VM event here? */
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+
+ /* do this here instead of below, to get the whole page ready */
+ clear_huge_page(page, haddr, HPAGE_PMD_NR);
+
+ /* add a new temp_hugepage entry to the local freelist */
+ temp_thp = kmalloc(sizeof(struct temp_hugepage), GFP_KERNEL);
+ if (!temp_thp) {
+ spin_unlock(&mm->thp_list_lock);
+ return VM_FAULT_OOM;
+ }
+ temp_thp->pmd = pmd;
+ temp_thp->page = page;
+ temp_thp->node = numa_node_id();
+ temp_thp->ref_count = 1;
+ list_add(&temp_thp->list, &mm->thp_temp_list);
+ /*
+ * otherwise we increment the reference count, and decide whether
+ * or not to create a THP
+ */
+ } else if (temp_thp && !pmd_none(*pmd) && temp_thp->node == numa_node_id()) {
+ temp_thp->ref_count++;
+ /* if they allocated from a different node, they don't get a thp */
+ } else if (temp_thp && !pmd_none(*pmd) && temp_thp->node != numa_node_id()) {
+ /*
+ * for now we handle this by pushing the rest of the faults through our
+ * custom fault code below; eventually we will want to put the unused
+ * pages from our temp_hugepage back on the freelist, so they can be
+ * faulted in by the normal code paths
+ */
+
+ temp_thp->node = -1;
+ } else {
+ spin_unlock(&mm->thp_list_lock);
+ return VM_FAULT_FALLBACK;
+ }
+
+ spin_unlock(&mm->thp_list_lock);
+
+ /*
+ * now that we've done the accounting work, we check to see if
+ * we've exceeded our threshold
+ */
+ if (temp_thp->ref_count >= mm->thp_threshold) {
+ pmd_t pmd_entry;
+ pgtable_t pgtable;
+
+ /*
+ * we'll do all of the following beneath the big ptl for now;
+ * this will need to be modified to work with the split ptl
+ */
+ spin_lock(&mm->page_table_lock);
+
+ /*
+ * once we get past the lock we have to make sure that somebody
+ * else hasn't already turned this guy into a THP; if they have,
+ * then the page we need is already faulted in as part of the THP
+ * they created
+ */
+ if (PageTransHuge(temp_thp->page)) {
+ spin_unlock(&mm->page_table_lock);
+ return 0;
+ }
+
+ pgtable = pte_alloc_one(mm, haddr);
+ if (unlikely(!pgtable)) {
+ spin_unlock(&mm->page_table_lock);
+ return VM_FAULT_OOM;
+ }
+
+ /* might wanna move this? */
+ __SetPageUptodate(temp_thp->page);
+
+ /* turn the pages into one compound page */
+ make_compound_page(temp_thp->page, HPAGE_PMD_ORDER);
+
+ /* set up the pmd */
+ pmd_entry = mk_huge_pmd(temp_thp->page, vma->vm_page_prot);
+ pmd_entry = maybe_pmd_mkwrite(pmd_mkdirty(pmd_entry), vma);
+
+ /* remap the new page since we cleared the mappings */
+ page_add_anon_rmap(temp_thp->page, vma, address);
+
+ /* deposit the thp */
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+
+ set_pmd_at(mm, haddr, pmd, pmd_entry);
+ add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR - mm->thp_threshold + 1);
+ /* mm->nr_ptes++; */
+
+ /* delete the reference to this compound page from our list */
+ spin_lock(&mm->thp_list_lock);
+ list_del(&temp_thp->list);
+ spin_unlock(&mm->thp_list_lock);
+
+ spin_unlock(&mm->page_table_lock);
+ return 0;
+ } else {
+single_fault:
+ /* fault in the page */
+ if (pmd_none(*pmd) && __pte_alloc(mm, vma, temp_thp->pmd, address))
+ return VM_FAULT_OOM;
+
+ /*
+ * we'll do all of the following beneath the big ptl for now;
+ * this will need to be modified to work with the split ptl
+ */
+ spin_lock(&mm->page_table_lock);
+
+ page = temp_thp->page + (int) pte_index(address);
+
+ /* set the page's refcount */
+ set_page_refcounted(page);
+ pte = pte_offset_map(temp_thp->pmd, address);
+
+ /* might wanna move this? */
+ __SetPageUptodate(page);
+
+ if (!pte_present(*pte)) {
+ if (pte_none(*pte)) {
+ pte_unmap(pte);
+
+ if (unlikely(anon_vma_prepare(vma))) {
+ spin_unlock(&mm->page_table_lock);
+ return VM_FAULT_OOM;
+ }
+
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+
+ pte = pte_offset_map_lock(mm, temp_thp->pmd, address, &ptl);
+
+ page_add_new_anon_rmap(page, vma, haddr);
+ add_mm_counter(mm, MM_ANONPAGES, 1);
+
+ set_pte_at(mm, address, pte, entry);
+
+ pte_unmap_unlock(pte, ptl);
+ spin_unlock(&mm->page_table_lock);
+
+ return 0;
+ }
+ } else {
+ spin_unlock(&mm->page_table_lock);
+ return VM_FAULT_FALLBACK;
+ }
+ }
+
+ /* I don't know what this does right now. I'm leaving it */
+ if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+ put_page(page);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+
+ /*
+ * here's the important piece, where we actually turn our 512-page
+ * chunk into a THP by setting the PSE bit. This is the spot
+ * we really need to change. In the end, we could probably
+ * roll this into the old function, but we'll keep them separate
+ * for now
+ */
+ if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+
+ /* again, probably want a different VM event here */
+ count_vm_event(THP_FAULT_ALLOC);
+ return 0;
+}
+
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma)
diff --git a/mm/internal.h b/mm/internal.h
index 684f7aa..8fc296b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -98,6 +98,7 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
*/
extern void __free_pages_bootmem(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned long order);
+extern void make_compound_page(struct page *page, unsigned long order);
#ifdef CONFIG_MEMORY_FAILURE
extern bool is_free_buddy_page(struct page *page);
#endif
diff --git a/mm/memory.c b/mm/memory.c
index d176154..014d9ba 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3764,13 +3764,30 @@ retry:
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
return VM_FAULT_OOM;
- if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
+ if (transparent_hugepage_enabled(vma)) {
int ret = VM_FAULT_FALLBACK;
- if (!vma->vm_ops)
- ret = do_huge_pmd_anonymous_page(mm, vma, address,
- pmd, flags);
- if (!(ret & VM_FAULT_FALLBACK))
- return ret;
+ /*
+ * This is a temporary location for this code, just to get things
+ * up and running. I'll come up with a better way to handle this
+ * later
+ */
+ if (!mm->thp_threshold)
+ mm->thp_threshold = thp_threshold_check();
+ if (!mm->thp_temp_list.next && !mm->thp_temp_list.prev)
+ INIT_LIST_HEAD(&mm->thp_temp_list);
+ if (mm->thp_threshold > 1) {
+ if (!vma->vm_ops)
+ ret = do_huge_pmd_temp_page(mm, vma, address,
+ pmd, flags);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ } else if (pmd_none(*pmd)) {
+ if (!vma->vm_ops)
+ ret = do_huge_pmd_anonymous_page(mm, vma, address,
+ pmd, flags);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ }
} else {
pmd_t orig_pmd = *pmd;
int ret;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dd886fa..48e13fc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -375,6 +375,65 @@ void prep_compound_page(struct page *page, unsigned long order)
}
}

+/*
+ * This function is used to create a proper compound page from a chunk of
+ * contiguous pages, most likely allocated as a temporary hugepage
+ */
+void make_compound_page(struct page *page, unsigned long order)
+{
+ int i, max_count = 0, max_mapcount = 0;
+ int nr_pages = 1 << order;
+
+ set_compound_page_dtor(page, free_compound_page);
+ set_compound_order(page, order);
+
+ __SetPageHead(page);
+
+ /*
+ * we clear all the mappings here, so we have to remember to set
+ * them back up!
+ */
+ page->mapping = NULL;
+
+ max_count = (int) atomic_read(&page->_count);
+ max_mapcount = (int) atomic_read(&page->_mapcount);
+
+ for (i = 1; i < nr_pages; i++) {
+ int cur_count, cur_mapcount;
+ struct page *p = page + i;
+ p->flags = 0; /* this seems dumb */
+ __SetPageTail(p);
+ set_page_count(p, 0);
+ p->first_page = page;
+ p->mapping = NULL;
+
+ cur_count = (int) atomic_read(&page->_count);
+ cur_mapcount = (int) atomic_read(&page->_mapcount);
+ atomic_set(&page->_count, 0);
+ atomic_set(&page->_mapcount, -1);
+ if (cur_count > max_count)
+ max_count = cur_count;
+ if (cur_mapcount > max_mapcount)
+ max_mapcount = cur_mapcount;
+
+ /*
+ * poison the LRU entries for all the tail pages (aside from the
+ * first one); the entries for the head page should be okay
+ */
+ if (i != 1) {
+ p->lru.next = LIST_POISON1;
+ p->lru.prev = LIST_POISON2;
+ }
+ }
+
+ atomic_set(&page->_count, max_count);
+ /*
+ * we set this to max_mapcount - 1 here because we're going to
+ * map this page again later. This definitely doesn't seem right.
+ */
+ atomic_set(&page->_mapcount, max_mapcount - 1);
+}
+
/* update __split_huge_page_refcount if you change this function */
static int destroy_compound_page(struct page *page, unsigned long order)
{
@@ -865,7 +924,12 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
}

set_page_private(page, 0);
- set_page_refcounted(page);
+ /*
+ * We don't want to set _count for temporary compound pages, since
+ * we may not immediately fault in the first page
+ */
+ if (!(gfp_flags & __GFP_COMP_TEMP))
+ set_page_refcounted(page);

arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
--
1.7.12.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/