[PATCH v9 3/5] mm/vmstat: manage per-CPU stats from CPU context when NOHZ full

From: Marcelo Tosatti
Date: Tue Dec 06 2022 - 12:20:59 EST


For nohz full CPUs, manage per-CPU stat syncing from CPU context:
start delayed work when marking per-CPU vmstat dirty.

When returning to userspace, fold the stats and cancel the delayed work.

When entering idle, only fold the stats.

Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx>
---
include/linux/vmstat.h | 4 ++--
kernel/time/tick-sched.c | 2 +-
mm/vmstat.c | 41 ++++++++++++++++++++++++++++++++---------
3 files changed, 35 insertions(+), 12 deletions(-)

Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c
+++ linux-2.6/mm/vmstat.c
@@ -28,6 +28,7 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
+#include <linux/tick.h>

#include "internal.h"

@@ -195,9 +196,24 @@ void fold_vm_numa_events(void)

#ifdef CONFIG_SMP
static DEFINE_PER_CPU_ALIGNED(bool, vmstat_dirty);
+static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
+int sysctl_stat_interval __read_mostly = HZ;

static inline void vmstat_mark_dirty(void)
{
+ int cpu = smp_processor_id();
+
+ if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
+ struct delayed_work *dw;
+
+ dw = &per_cpu(vmstat_work, cpu);
+ if (!delayed_work_pending(dw)) {
+ unsigned long delay;
+
+ delay = round_jiffies_relative(sysctl_stat_interval);
+ queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
+ }
+ }
this_cpu_write(vmstat_dirty, true);
}

@@ -1886,9 +1902,6 @@ static const struct seq_operations vmsta
#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
-int sysctl_stat_interval __read_mostly = HZ;
-
#ifdef CONFIG_PROC_FS
static void refresh_vm_stats(struct work_struct *work)
{
@@ -1973,21 +1986,27 @@ static void vmstat_update(struct work_st
* until the diffs stay at zero. The function is used by NOHZ and can only be
* invoked when tick processing is not active.
*/
-void quiet_vmstat(void)
+void quiet_vmstat(bool user)
{
+ struct delayed_work *dw;
+
if (system_state != SYSTEM_RUNNING)
return;

if (!is_vmstat_dirty())
return;

+ refresh_cpu_vm_stats(false);
+
+ if (!user)
+ return;
/*
- * Just refresh counters and do not care about the pending delayed
- * vmstat_update. It doesn't fire that often to matter and canceling
- * it would be too expensive from this path.
- * vmstat_shepherd will take care about that for us.
+ * If the tick is stopped, cancel any delayed work to avoid
+ * interruptions to this CPU in the future.
*/
- refresh_cpu_vm_stats(false);
+ dw = &per_cpu(vmstat_work, smp_processor_id());
+ if (delayed_work_pending(dw))
+ cancel_delayed_work(dw);
}

/*
@@ -2009,6 +2028,10 @@ static void vmstat_shepherd(struct work_
for_each_online_cpu(cpu) {
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);

+ /* NOHZ full CPUs manage their own vmstat flushing */
+ if (tick_nohz_full_cpu(smp_processor_id()))
+ continue;
+
if (!delayed_work_pending(dw) && per_cpu(vmstat_dirty, cpu))
queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);

Index: linux-2.6/include/linux/vmstat.h
===================================================================
--- linux-2.6.orig/include/linux/vmstat.h
+++ linux-2.6/include/linux/vmstat.h
@@ -290,7 +290,7 @@ extern void dec_zone_state(struct zone *
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_node_state(struct pglist_data *, enum node_stat_item);

-void quiet_vmstat(void);
+void quiet_vmstat(bool user);
void cpu_vm_stats_fold(int cpu);
void refresh_zone_stat_thresholds(void);

@@ -403,7 +403,7 @@ static inline void __dec_node_page_state

static inline void refresh_zone_stat_thresholds(void) { }
static inline void cpu_vm_stats_fold(int cpu) { }
-static inline void quiet_vmstat(void) { }
+static inline void quiet_vmstat(bool user) { }

static inline void drain_zonestat(struct zone *zone,
struct per_cpu_zonestat *pzstats) { }
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -911,7 +911,7 @@ static void tick_nohz_stop_tick(struct t
*/
if (!ts->tick_stopped) {
calc_load_nohz_start();
- quiet_vmstat();
+ quiet_vmstat(false);

ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;