Re: [PATCH v11 3/6] mm/vmstat: manage per-CPU stats from CPU context when NOHZ full

From: Frederic Weisbecker
Date: Fri Dec 23 2022 - 09:42:00 EST


On Wed, Dec 21, 2022 at 01:58:04PM -0300, Marcelo Tosatti wrote:
> @@ -194,21 +195,50 @@ void fold_vm_numa_events(void)
> #endif
>
> #ifdef CONFIG_SMP
> -static DEFINE_PER_CPU_ALIGNED(bool, vmstat_dirty);
> +
> +struct vmstat_dirty {
> + bool dirty;
> + bool cpuhotplug;

Maybe call it "online" for clarity. Also, should it depend on CONFIG_FLUSH_WORK_ON_RESUME_USER?
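
Something like this maybe (just a sketch, and whether the #ifdef around
the field is worth it is your call):

struct vmstat_dirty {
        bool dirty;
#ifdef CONFIG_FLUSH_WORK_ON_RESUME_USER
        /* false once the CPU has started going down (down_prep callback) */
        bool online;
#endif
};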

> +};
> +
> +static DEFINE_PER_CPU_ALIGNED(struct vmstat_dirty, vmstat_dirty_pcpu);
> +static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
> +int sysctl_stat_interval __read_mostly = HZ;
>
> static inline void vmstat_mark_dirty(void)
> {
> - this_cpu_write(vmstat_dirty, true);
> + struct vmstat_dirty *vms = this_cpu_ptr(&vmstat_dirty_pcpu);
> +
> +#ifdef CONFIG_FLUSH_WORK_ON_RESUME_USER

Please avoid ifdeffery in the middle of a function when possible. This
block could be moved to a separate function or use IS_ENABLED(), for
example.
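
Something like the below, completely untested and only to illustrate
(the helper name is made up):

/*
 * Sketch: queue the deferred flush from a helper so that
 * vmstat_mark_dirty() stays free of #ifdefs.
 */
static void vmstat_queue_local_work(struct vmstat_dirty *vms)
{
        int cpu = smp_processor_id();
        struct delayed_work *dw = this_cpu_ptr(&vmstat_work);

        if (!IS_ENABLED(CONFIG_FLUSH_WORK_ON_RESUME_USER))
                return;

        /* Only the first dirtying on a nohz_full CPU arms the work */
        if (!tick_nohz_full_cpu(cpu) || vms->dirty)
                return;

        if (!delayed_work_pending(dw) && !vms->cpuhotplug) {
                unsigned long delay = round_jiffies_relative(sysctl_stat_interval);

                queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
        }
}

static inline void vmstat_mark_dirty(void)
{
        struct vmstat_dirty *vms = this_cpu_ptr(&vmstat_dirty_pcpu);

        vmstat_queue_local_work(vms);
        vms->dirty = true;
}

With IS_ENABLED() the compiler still sees the code when the option is
off and dead-code elimination drops it.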

> + int cpu = smp_processor_id();
> +
> + if (tick_nohz_full_cpu(cpu) && !vms->dirty) {
> + struct delayed_work *dw;
> +
> + dw = this_cpu_ptr(&vmstat_work);
> + if (!delayed_work_pending(dw) && !vms->cpuhotplug) {
> + unsigned long delay;
> +
> + delay = round_jiffies_relative(sysctl_stat_interval);
> + queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
> + }
> + }
> +#endif
> + vms->dirty = true;
> }
>
> static inline void vmstat_clear_dirty(void)
> {
> - this_cpu_write(vmstat_dirty, false);
> + struct vmstat_dirty *vms = this_cpu_ptr(&vmstat_dirty_pcpu);
> +
> + vms->dirty = false;

You could keep this as this_cpu_write(vmstat_dirty_pcpu.dirty, false)

> }
>
> static inline bool is_vmstat_dirty(void)
> {
> - return this_cpu_read(vmstat_dirty);
> + struct vmstat_dirty *vms = this_cpu_ptr(&vmstat_dirty_pcpu);
> +
> + return vms->dirty;

Ditto with this_cpu_read()?
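
ie. just (untested, member names taken from your patch):

static inline void vmstat_clear_dirty(void)
{
        this_cpu_write(vmstat_dirty_pcpu.dirty, false);
}

static inline bool is_vmstat_dirty(void)
{
        return this_cpu_read(vmstat_dirty_pcpu.dirty);
}

That keeps the per-CPU accessors and drops the explicit this_cpu_ptr().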

> }
>
> int calculate_pressure_threshold(struct zone *zone)
> @@ -1981,13 +2008,18 @@ void quiet_vmstat(void)
> if (!is_vmstat_dirty())
> return;
>
> + refresh_cpu_vm_stats(false);
> +
> +#ifdef CONFIG_FLUSH_WORK_ON_RESUME_USER

This can use IS_ENABLED()
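
ie. something like (untested):

        /* Combine the config check with the existing user check */
        if (!IS_ENABLED(CONFIG_FLUSH_WORK_ON_RESUME_USER) || !user)
                return;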

> + if (!user)
> + return;
> /*
> - * Just refresh counters and do not care about the pending delayed
> - * vmstat_update. It doesn't fire that often to matter and canceling
> - * it would be too expensive from this path.
> - * vmstat_shepherd will take care about that for us.
> + * If the tick is stopped, cancel any delayed work to avoid
> + * interruptions to this CPU in the future.
> */
> - refresh_cpu_vm_stats(false);
> + if (delayed_work_pending(this_cpu_ptr(&vmstat_work)))
> + cancel_delayed_work(this_cpu_ptr(&vmstat_work));
> +#endif
> }
>
> /*
> @@ -2008,8 +2040,15 @@ static void vmstat_shepherd(struct work_
> /* Check processors whose vmstat worker threads have been disabled */
> for_each_online_cpu(cpu) {
> struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
> + struct vmstat_dirty *vms = per_cpu_ptr(&vmstat_dirty_pcpu, cpu);
>
> - if (!delayed_work_pending(dw) && per_cpu(vmstat_dirty, cpu))
> +#ifdef CONFIG_FLUSH_WORK_ON_RESUME_USER

Same here.
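
ie. (untested):

                /* NOHZ full CPUs manage their own vmstat flushing */
                if (IS_ENABLED(CONFIG_FLUSH_WORK_ON_RESUME_USER) &&
                    tick_nohz_full_cpu(cpu))
                        continue;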

> + /* NOHZ full CPUs manage their own vmstat flushing */
> + if (tick_nohz_full_cpu(cpu))
> + continue;
> +#endif
> +
> + if (!delayed_work_pending(dw) && vms->dirty)
> queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
>
> cond_resched();
> @@ -2053,8 +2111,15 @@ static int vmstat_cpu_online(unsigned in
> return 0;
> }
>
> +/*
> + * ONLINE: The callbacks are invoked on the hotplugged CPU from the per CPU
> + * hotplug thread with interrupts and preemption enabled.

This is the OFFLINE path (down_prep), not ONLINE, and the reason behind
that comment is confusing.

> + */
> static int vmstat_cpu_down_prep(unsigned int cpu)
> {
> + struct vmstat_dirty *vms = per_cpu_ptr(&vmstat_dirty_pcpu, cpu);
> +
> + vms->cpuhotplug = true;

this_cpu_write()?
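
If the teardown callback is indeed invoked on the CPU going down, as the
comment above says, then something like (untested sketch):

static int vmstat_cpu_down_prep(unsigned int cpu)
{
        /* Runs on the outgoing CPU, so the local accessor is enough */
        this_cpu_write(vmstat_dirty_pcpu.cpuhotplug, true);
        cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
        return 0;
}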

> cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
> return 0;
> }
> +config FLUSH_WORK_ON_RESUME_USER
> + bool "Flush per-CPU vmstats on user return (for nohz full CPUs)"
> + depends on NO_HZ_FULL
> + default y
> +
> + help
> + By default, nohz full CPUs flush per-CPU vm statistics on return
> + to userspace (to avoid additional interferences when executing
> + userspace code). This has a small but measurable impact on
> + system call performance. You can disable this to improve system call
> + performance, at the expense of potential interferences to userspace
> + execution.

Can you move that below config CPU_ISOLATION?

Thanks!

> +
> # multi-gen LRU {
> config LRU_GEN
> bool "Multi-Gen LRU"
>
>