[PATCH v2 2/3] mm: Update NUMA counter threshold size

From: Kemi Wang
Date: Thu Aug 24 2017 - 06:01:42 EST


There is significant overhead from cache bouncing caused by zone counters
(NUMA associated counters) being updated in parallel during multi-threaded
page allocation (suggested by Dave Hansen).

This patch sets the NUMA counter threshold to a fixed value of U16_MAX - 2,
because a small threshold greatly increases how often the global counter is
updated from the local per-cpu counters (suggested by Ying Huang).

The rationale is that these statistics counters, unlike other VM counters,
do not affect the kernel's decisions, so using a large threshold is not a
problem.

With this patchset, we see a 31.3% drop in CPU cycles (537-->369) per
single page allocation and reclaim on Jesper's page_bench03 benchmark.

Benchmark provided by Jesper Dangaard Brouer (loop count increased to
10000000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench

Threshold CPU cycles Throughput(88 threads)
32 799 241760478
64 640 301628829
125 537 358906028 <==> system by default (base)
256 468 412397590
512 428 450550704
4096 399 482520943
20000 394 489009617
30000 395 488017817
65533 369(-31.3%) 521661345(+45.3%) <==> with this patchset
N/A 342(-36.3%) 562900157(+56.8%) <==> disable zone_statistics

Changelog:
v2:
a) Change the type of vm_numa_stat_diff[] from s16 to u16, since the numa
stats counter is always an incremental field.
b) Remove the numa_stat_threshold field in struct per_cpu_pageset, since it
is a constant value and is rarely changed.
c) Cut down instructions in __inc_numa_state() thanks to the incremental
numa counter and the constant numa threshold.
d) Move zone_numa_state_snapshot() to an individual patch, since it
does not appear to be related to this patch.

Signed-off-by: Kemi Wang <kemi.wang@xxxxxxxxx>
Suggested-by: Dave Hansen <dave.hansen@xxxxxxxxx>
Suggested-by: Ying Huang <ying.huang@xxxxxxxxx>
---
include/linux/mmzone.h | 3 +--
mm/vmstat.c | 28 ++++++++++------------------
2 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 582f6d9..c386ec4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -282,8 +282,7 @@ struct per_cpu_pageset {
struct per_cpu_pages pcp;
#ifdef CONFIG_NUMA
s8 expire;
- s8 numa_stat_threshold;
- s8 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
+ u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
#endif
#ifdef CONFIG_SMP
s8 stat_threshold;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0c3b54b..b015f39 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -30,6 +30,8 @@

#include "internal.h"

+#define NUMA_STATS_THRESHOLD (U16_MAX - 2)
+
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);
@@ -194,10 +196,7 @@ void refresh_zone_stat_thresholds(void)

per_cpu_ptr(zone->pageset, cpu)->stat_threshold
= threshold;
-#ifdef CONFIG_NUMA
- per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
- = threshold;
-#endif
+
/* Base nodestat threshold on the largest populated zone. */
pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
@@ -231,14 +230,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
continue;

threshold = (*calculate_pressure)(zone);
- for_each_online_cpu(cpu) {
+ for_each_online_cpu(cpu)
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
= threshold;
-#ifdef CONFIG_NUMA
- per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
- = threshold;
-#endif
- }
}
}

@@ -872,16 +866,14 @@ void __inc_numa_state(struct zone *zone,
enum numa_stat_item item)
{
struct per_cpu_pageset __percpu *pcp = zone->pageset;
- s8 __percpu *p = pcp->vm_numa_stat_diff + item;
- s8 v, t;
+ u16 __percpu *p = pcp->vm_numa_stat_diff + item;
+ u16 v;

v = __this_cpu_inc_return(*p);
- t = __this_cpu_read(pcp->numa_stat_threshold);
- if (unlikely(v > t)) {
- s8 overstep = t >> 1;

- zone_numa_state_add(v + overstep, zone, item);
- __this_cpu_write(*p, -overstep);
+ if (unlikely(v > NUMA_STATS_THRESHOLD)) {
+ zone_numa_state_add(v, zone, item);
+ __this_cpu_write(*p, 0);
}
}

@@ -1796,7 +1788,7 @@ static bool need_update(int cpu)

BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
#ifdef CONFIG_NUMA
- BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 1);
+ BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2);
#endif
/*
* The fast way of checking if there are any vmstat diffs.
--
2.7.4