Re: numa/core regressions fixed - more testers wanted

From: Hugh Dickins
Date: Tue Nov 20 2012 - 23:10:19 EST


On Tue, 20 Nov 2012, Rik van Riel wrote:
> On 11/20/2012 08:54 PM, Andrew Theurer wrote:
>
> > I can confirm single JVM JBB is working well for me. I see a 30%
> > improvement over autoNUMA. What I can't make sense of is some perf
> > stats (taken at 80 warehouses on 4 x WST-EX, 512GB memory):
>
> AutoNUMA does not have native THP migration, which may explain some
> of the difference.

When I made some fixes to the sched/numa native THP migration,
I did also try porting that (with Hannes's memcg fixes) to AutoNUMA.

Here's the patch below: it appeared to be working just fine, but
you might find that it doesn't quite apply to whatever tree you're
using. I started from 3.6 autonuma28fast in aa.git, but had folded
in some of the equally applicable TLB flush optimizations too.

There's also a little "Hack, remove after THP native migration"
retuning in mm/huge_memory.c, which should probably be removed too.

No signoffs, but it's from work by Peter and Ingo and Hannes and Hugh.
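
In case it helps anyone reviewing: the interesting part is the migrate
path added to huge_pmd_numa_fixup(). Once numa_hinting_fault() decides
the huge page should move, it allocates a THP on the local node, copies
into it, and swaps the pmd under page_table_lock, backing out if the pmd
or the page count changed underneath us. Roughly like this (a condensed
outline only, with the pmd_same()/page_count() rechecks and the back-out
paths trimmed; every name here is taken from the mm/huge_memory.c hunk
below, so read that for the real thing):

	/* numa_hinting_fault(page, HPAGE_PMD_NR) asked us to migrate */
	lock_page(page);
	/* ...recheck pmd_same() and page_count() under page_table_lock... */

	new_page = alloc_pages_node(numa_node_id(),
		(GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
	isolate_lru_page(page);			/* takes its own reference */

	__set_page_locked(new_page);
	SetPageSwapBacked(new_page);
	new_page->mapping = page->mapping;	/* anon: simple copy is enough */
	new_page->index = page->index;
	migrate_page_copy(new_page, page);	/* needs the mm/migrate.c fix */

	spin_lock(&mm->page_table_lock);
	/* ...recheck pmd_same() and page_count(), undo the copy if changed... */
	mem_cgroup_prepare_migration(page, new_page, &memcg);
	entry = mk_pmd(new_page, vma->vm_page_prot);
	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
	entry = pmd_mkhuge(entry);
	page_add_new_anon_rmap(new_page, vma, haddr);
	set_pmd_at(mm, haddr, pmd, entry);
	update_mmu_cache(vma, address, entry);
	page_remove_rmap(page);
	mem_cgroup_end_migration(memcg, page, new_page, true);
	spin_unlock(&mm->page_table_lock);

	unlock_page(new_page);
	unlock_page(page);
	/* drop the rmap, LRU isolation and local refs on the old page */
	put_page(page);
	put_page(page);
	put_page(page);

The mm/migrate.c, mm/internal.h and mm/memcontrol.c hunks are just what
migrate_page_copy(), mlock_migrate_page() and mem_cgroup_prepare_migration()
need in order to cope with a trans huge page.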
---

 include/linux/huge_mm.h |   4 -
 mm/autonuma.c           |  59 +++++-----------
 mm/huge_memory.c        | 140 +++++++++++++++++++++++++++++++++-----
 mm/internal.h           |   5 -
 mm/memcontrol.c         |   7 +
 mm/memory.c             |   4 -
 mm/migrate.c            |   2
 7 files changed, 158 insertions(+), 63 deletions(-)

--- 306aa/include/linux/huge_mm.h 2012-11-04 10:21:30.965548793 -0800
+++ 306AA/include/linux/huge_mm.h 2012-11-04 17:14:32.232651475 -0800
@@ -11,8 +11,8 @@ extern int copy_huge_pmd(struct mm_struc
extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pmd_t orig_pmd);
-extern int huge_pmd_numa_fixup(struct mm_struct *mm, unsigned long addr,
- pmd_t pmd, pmd_t *pmdp);
+extern int huge_pmd_numa_fixup(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, pmd_t orig_pmd);
extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm);
extern struct page *follow_trans_huge_pmd(struct mm_struct *mm,
unsigned long addr,
--- 306aa/mm/autonuma.c 2012-11-04 10:22:13.993549816 -0800
+++ 306AA/mm/autonuma.c 2012-11-04 21:24:43.045622669 -0800
@@ -77,8 +77,7 @@ void autonuma_migrate_split_huge_page(st

static int sync_isolate_migratepages(struct list_head *migratepages,
struct page *page,
- struct pglist_data *pgdat,
- bool *migrated)
+ struct pglist_data *pgdat)
{
struct zone *zone;
struct lruvec *lruvec;
@@ -129,19 +128,9 @@ static int sync_isolate_migratepages(str
* __isolate_lru_page successd, the page could be freed and
* reallocated out from under us. Thus our previous checks on
* the page, and the split_huge_page, would be worthless.
- *
- * We really only need to do this if "ret > 0" but it doesn't
- * hurt to do it unconditionally as nobody can reference
- * "page" anymore after this and so we can avoid an "if (ret >
- * 0)" branch here.
*/
- put_page(page);
- /*
- * Tell the caller we already released its pin, to avoid a
- * double free.
- */
- *migrated = true;
-
+ if (ret > 0)
+ put_page(page);
out:
return ret;
}
@@ -215,13 +204,12 @@ static inline void autonuma_migrate_unlo
spin_unlock(&NODE_DATA(nid)->autonuma_migrate_lock);
}

-static bool autonuma_migrate_page(struct page *page, int dst_nid,
- int page_nid, bool *migrated)
+static bool autonuma_migrate_page(struct page *page, int dst_nid, int page_nid,
+ int nr_pages)
{
int isolated = 0;
LIST_HEAD(migratepages);
struct pglist_data *pgdat = NODE_DATA(dst_nid);
- int nr_pages = hpage_nr_pages(page);
unsigned long autonuma_migrate_nr_pages = 0;

autonuma_migrate_lock(dst_nid);
@@ -242,11 +230,12 @@ static bool autonuma_migrate_page(struct
autonuma_printk("migrated %lu pages to node %d\n",
autonuma_migrate_nr_pages, dst_nid);

- if (autonuma_balance_pgdat(pgdat, nr_pages))
+ if (autonuma_balance_pgdat(pgdat, nr_pages)) {
+ if (nr_pages > 1)
+ return true;
isolated = sync_isolate_migratepages(&migratepages,
- page, pgdat,
- migrated);
-
+ page, pgdat);
+ }
if (isolated) {
int err;
trace_numa_migratepages_begin(current, &migratepages,
@@ -381,15 +370,14 @@ static bool should_migrate_page(struct t
static int numa_hinting_fault_memory_follow_cpu(struct task_struct *p,
struct page *page,
int this_nid, int page_nid,
- bool *migrated)
+ int numpages)
{
if (!should_migrate_page(p, page, this_nid, page_nid))
goto out;
if (!PageLRU(page))
goto out;
if (this_nid != page_nid) {
- if (autonuma_migrate_page(page, this_nid, page_nid,
- migrated))
+ if (autonuma_migrate_page(page, this_nid, page_nid, numpages))
return this_nid;
}
out:
@@ -418,14 +406,17 @@ bool numa_hinting_fault(struct page *pag
VM_BUG_ON(this_nid < 0);
VM_BUG_ON(this_nid >= MAX_NUMNODES);
access_nid = numa_hinting_fault_memory_follow_cpu(p, page,
- this_nid,
- page_nid,
- &migrated);
- /* "page" has been already freed if "migrated" is true */
+ this_nid, page_nid, numpages);
numa_hinting_fault_cpu_follow_memory(p, access_nid, numpages);
+ migrated = access_nid != page_nid;
}

- return migrated;
+ /* small page was already freed if migrated */
+ if (!migrated) {
+ put_page(page);
+ return false;
+ }
+ return true;
}

/* NUMA hinting page fault entry point for ptes */
@@ -434,7 +425,6 @@ int pte_numa_fixup(struct mm_struct *mm,
{
struct page *page;
spinlock_t *ptl;
- bool migrated;

/*
* The "pte" at this point cannot be used safely without
@@ -455,9 +445,7 @@ int pte_numa_fixup(struct mm_struct *mm,
get_page(page);
pte_unmap_unlock(ptep, ptl);

- migrated = numa_hinting_fault(page, 1);
- if (!migrated)
- put_page(page);
+ numa_hinting_fault(page, 1);
out:
return 0;

@@ -476,7 +464,6 @@ int pmd_numa_fixup(struct mm_struct *mm,
spinlock_t *ptl;
bool numa = false;
struct vm_area_struct *vma;
- bool migrated;

spin_lock(&mm->page_table_lock);
pmd = *pmdp;
@@ -521,9 +508,7 @@ int pmd_numa_fixup(struct mm_struct *mm,
get_page(page);
pte_unmap_unlock(pte, ptl);

- migrated = numa_hinting_fault(page, 1);
- if (!migrated)
- put_page(page);
+ numa_hinting_fault(page, 1);

pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
}
--- 306aa/mm/huge_memory.c 2012-11-04 15:32:28.512793096 -0800
+++ 306AA/mm/huge_memory.c 2012-11-04 22:21:14.112450390 -0800
@@ -17,6 +17,7 @@
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
+#include <linux/migrate.h>
#include <linux/autonuma.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -1037,35 +1038,140 @@ out:

#ifdef CONFIG_AUTONUMA
/* NUMA hinting page fault entry point for trans huge pmds */
-int huge_pmd_numa_fixup(struct mm_struct *mm, unsigned long addr,
- pmd_t pmd, pmd_t *pmdp)
+int huge_pmd_numa_fixup(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, pmd_t entry)
{
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ struct mem_cgroup *memcg = NULL;
+ struct page *new_page;
struct page *page;
- bool migrated;

spin_lock(&mm->page_table_lock);
- if (unlikely(!pmd_same(pmd, *pmdp)))
- goto out_unlock;
+ if (unlikely(!pmd_same(entry, *pmd)))
+ goto unlock;

- page = pmd_page(pmd);
- pmd = pmd_mknonnuma(pmd);
- set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmdp, pmd);
- VM_BUG_ON(pmd_numa(*pmdp));
- if (unlikely(page_mapcount(page) != 1))
- goto out_unlock;
+ page = pmd_page(entry);
+ /*
+ * Do not migrate this page if it is mapped anywhere else.
+ * Do not migrate this page if its count has been raised.
+ * Our caller's down_read of mmap_sem excludes fork raising
+ * mapcount; but recheck page count below whenever we take
+ * page_table_lock - although it's unclear what pin we are
+ * protecting against, since get_user_pages() or GUP fast
+ * would have to fault it present before they could proceed.
+ */
+ if (unlikely(page_count(page) != 1))
+ goto fixup;
get_page(page);
spin_unlock(&mm->page_table_lock);

- migrated = numa_hinting_fault(page, HPAGE_PMD_NR);
- if (!migrated)
- put_page(page);
+ if (numa_hinting_fault(page, HPAGE_PMD_NR))
+ goto migrate;

-out:
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(entry, *pmd)))
+ goto unlock;
+fixup:
+ entry = pmd_mknonnuma(entry);
+ set_pmd_at(mm, haddr, pmd, entry);
+ VM_BUG_ON(pmd_numa(*pmd));
+ update_mmu_cache(vma, address, entry);
+
+unlock:
+ spin_unlock(&mm->page_table_lock);
return 0;

-out_unlock:
+migrate:
+ lock_page(page);
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
+ spin_unlock(&mm->page_table_lock);
+ unlock_page(page);
+ put_page(page);
+ return 0;
+ }
spin_unlock(&mm->page_table_lock);
- goto out;
+
+ new_page = alloc_pages_node(numa_node_id(),
+ (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
+ if (!new_page)
+ goto alloc_fail;
+
+ if (isolate_lru_page(page)) { /* Does an implicit get_page() */
+ put_page(new_page);
+ goto alloc_fail;
+ }
+
+ __set_page_locked(new_page);
+ SetPageSwapBacked(new_page);
+
+ /* anon mapping, we can simply copy page->mapping to the new page: */
+ new_page->mapping = page->mapping;
+ new_page->index = page->index;
+
+ migrate_page_copy(new_page, page);
+
+ WARN_ON(PageLRU(new_page));
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 3)) {
+ spin_unlock(&mm->page_table_lock);
+
+ /* Reverse changes made by migrate_page_copy() */
+ if (TestClearPageActive(new_page))
+ SetPageActive(page);
+ if (TestClearPageUnevictable(new_page))
+ SetPageUnevictable(page);
+ mlock_migrate_page(page, new_page);
+
+ unlock_page(new_page);
+ put_page(new_page); /* Free it */
+
+ unlock_page(page);
+ putback_lru_page(page);
+ put_page(page); /* Drop the local reference */
+ return 0;
+ }
+ /*
+ * Traditional migration needs to prepare the memcg charge
+ * transaction early to prevent the old page from being
+ * uncharged when installing migration entries. Here we can
+ * save the potential rollback and start the charge transfer
+ * only when migration is already known to end successfully.
+ */
+ mem_cgroup_prepare_migration(page, new_page, &memcg);
+
+ entry = mk_pmd(new_page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
+
+ page_add_new_anon_rmap(new_page, vma, haddr);
+
+ set_pmd_at(mm, haddr, pmd, entry);
+ update_mmu_cache(vma, address, entry);
+ page_remove_rmap(page);
+ /*
+ * Finish the charge transaction under the page table lock to
+ * prevent split_huge_page() from dividing up the charge
+ * before it's fully transferred to the new page.
+ */
+ mem_cgroup_end_migration(memcg, page, new_page, true);
+ spin_unlock(&mm->page_table_lock);
+
+ unlock_page(new_page);
+ unlock_page(page);
+ put_page(page); /* Drop the rmap reference */
+ put_page(page); /* Drop the LRU isolation reference */
+ put_page(page); /* Drop the local reference */
+ return 0;
+
+alloc_fail:
+ unlock_page(page);
+ put_page(page);
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, entry)))
+ goto unlock;
+ goto fixup;
}
#endif

--- 306aa/mm/internal.h 2012-09-30 16:47:46.000000000 -0700
+++ 306AA/mm/internal.h 2012-11-04 16:16:21.760439246 -0800
@@ -216,11 +216,12 @@ static inline void mlock_migrate_page(st
{
if (TestClearPageMlocked(page)) {
unsigned long flags;
+ int nr_pages = hpage_nr_pages(page);

local_irq_save(flags);
- __dec_zone_page_state(page, NR_MLOCK);
+ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
SetPageMlocked(newpage);
- __inc_zone_page_state(newpage, NR_MLOCK);
+ __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
local_irq_restore(flags);
}
}
--- 306aa/mm/memcontrol.c 2012-09-30 16:47:46.000000000 -0700
+++ 306AA/mm/memcontrol.c 2012-11-04 16:15:55.264437693 -0800
@@ -3261,15 +3261,18 @@ void mem_cgroup_prepare_migration(struct
struct mem_cgroup **memcgp)
{
struct mem_cgroup *memcg = NULL;
+ unsigned int nr_pages = 1;
struct page_cgroup *pc;
enum charge_type ctype;

*memcgp = NULL;

- VM_BUG_ON(PageTransHuge(page));
if (mem_cgroup_disabled())
return;

+ if (PageTransHuge(page))
+ nr_pages <<= compound_order(page);
+
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
@@ -3331,7 +3334,7 @@ void mem_cgroup_prepare_migration(struct
* charged to the res_counter since we plan on replacing the
* old one and only one page is going to be left afterwards.
*/
- __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
+ __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
}

/* remove redundant charge if migration failed*/
--- 306aa/mm/memory.c 2012-11-04 10:21:34.181548869 -0800
+++ 306AA/mm/memory.c 2012-11-04 17:06:11.400620099 -0800
@@ -3546,8 +3546,8 @@ retry:
barrier();
if (pmd_trans_huge(orig_pmd)) {
if (pmd_numa(*pmd))
- return huge_pmd_numa_fixup(mm, address,
- orig_pmd, pmd);
+ return huge_pmd_numa_fixup(mm, vma, address,
+ pmd, orig_pmd);
if (flags & FAULT_FLAG_WRITE &&
!pmd_write(orig_pmd) &&
!pmd_trans_splitting(orig_pmd)) {
--- 306aa/mm/migrate.c 2012-09-30 16:47:46.000000000 -0700
+++ 306AA/mm/migrate.c 2012-11-04 17:10:13.084633509 -0800
@@ -407,7 +407,7 @@ int migrate_huge_page_move_mapping(struc
*/
void migrate_page_copy(struct page *newpage, struct page *page)
{
- if (PageHuge(page))
+ if (PageHuge(page) || PageTransHuge(page))
copy_huge_page(newpage, page);
else
copy_highpage(newpage, page);
--