Re: [PATCH] x86/cpu: use smp_call_function_many() in arch_freq_prepare_all()

From: Eric Dumazet
Date: Wed Mar 23 2022 - 12:06:09 EST


On Fri, Mar 11, 2022 at 8:36 AM Rafael J. Wysocki
<rafael.j.wysocki@xxxxxxxxx> wrote:
>
> On 3/11/2022 2:17 AM, Eric Dumazet wrote:
> > From: Eric Dumazet <edumazet@xxxxxxxxxx>
> >
> > Opening /proc/cpuinfo can have a big latency on hosts with many cpus,
> > mostly because it is essentially doing:
> >
> > for_each_online_cpu(cpu)
> > smp_call_function_single(cpu, aperfmperf_snapshot_khz, ...)
> >
> > smp_call_function_single() is reusing a common csd, meaning that
> > each invocation needs to wait for completion of the prior one.
> >
> > Paul's recent patches have lowered the number of cpus receiving the IPI,
> > but there are still cases where the latency of the above loop can reach
> > 10 ms, after which an extra msleep(10) is performed, for a total of 20 ms.
> >
> > Using smp_call_function_many() allows for full parallelism,
> > and latency is down to ~80 usec, on a host with 256 cpus.
>
> This looks reasonable to me.
>
> Acked-by: Rafael J. Wysocki <rafael.j.wysocki@xxxxxxxxx>
>
> or if you want me to pick it up, please resend the patch with a CC to
> linux-pm@xxxxxxxxxxxxxxx.

I do not know what the x86 maintainers prefer.

I will let them give their advice here, thanks!

>
> > Signed-off-by: Eric Dumazet <edumazet@xxxxxxxxxx>
> > Cc: Paul E. McKenney <paulmck@xxxxxxxxxx>
> > Cc: Rafael J. Wysocki <rafael.j.wysocki@xxxxxxxxx>
> > Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> > Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> > Cc: Borislav Petkov <bp@xxxxxxxxx>
> > Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
> > Cc: <x86@xxxxxxxxxx>
> > ---
> > arch/x86/kernel/cpu/aperfmperf.c | 32 +++++++++++++++++++++++---------
> > 1 file changed, 23 insertions(+), 9 deletions(-)
> >
> > diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
> > index 22911deacb6e441ad60ddb57190ef3772afb3cf0..a305310ceb44784a0ad9be7c196061d98fa1adbc 100644
> > --- a/arch/x86/kernel/cpu/aperfmperf.c
> > +++ b/arch/x86/kernel/cpu/aperfmperf.c
> > @@ -67,7 +67,8 @@ static void aperfmperf_snapshot_khz(void *dummy)
> > atomic_set_release(&s->scfpending, 0);
> > }
> >
> > -static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
> > +static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait,
> > + struct cpumask *mask)
> > {
> > s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu));
> > struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
> > @@ -76,9 +77,13 @@ static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
> > if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
> > return true;
> >
> > - if (!atomic_xchg(&s->scfpending, 1) || wait)
> > - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);
> > -
> > + if (!atomic_xchg(&s->scfpending, 1) || wait) {
> > + if (mask)
> > + __cpumask_set_cpu(cpu, mask);
> > + else
> > + smp_call_function_single(cpu, aperfmperf_snapshot_khz,
> > + NULL, wait);
> > + }
> > /* Return false if the previous iteration was too long ago. */
> > return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
> > }
> > @@ -97,13 +102,14 @@ unsigned int aperfmperf_get_khz(int cpu)
> > if (rcu_is_idle_cpu(cpu))
> > return 0; /* Idle CPUs are completely uninteresting. */
> >
> > - aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
> > + aperfmperf_snapshot_cpu(cpu, ktime_get(), true, NULL);
> > return per_cpu(samples.khz, cpu);
> > }
> >
> > void arch_freq_prepare_all(void)
> > {
> > ktime_t now = ktime_get();
> > + cpumask_var_t mask;
> > bool wait = false;
> > int cpu;
> >
> > @@ -113,17 +119,25 @@ void arch_freq_prepare_all(void)
> > if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
> > return;
> >
> > + if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
> > + return;
> > +
> > + cpus_read_lock();
> > for_each_online_cpu(cpu) {
> > if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
> > continue;
> > if (rcu_is_idle_cpu(cpu))
> > continue; /* Idle CPUs are completely uninteresting. */
> > - if (!aperfmperf_snapshot_cpu(cpu, now, false))
> > + if (!aperfmperf_snapshot_cpu(cpu, now, false, mask))
> > wait = true;
> > }
> >
> > - if (wait)
> > - msleep(APERFMPERF_REFRESH_DELAY_MS);
> > + preempt_disable();
> > + smp_call_function_many(mask, aperfmperf_snapshot_khz, NULL, wait);
> > + preempt_enable();
> > + cpus_read_unlock();
> > +
> > + free_cpumask_var(mask);
> > }
> >
> > unsigned int arch_freq_get_on_cpu(int cpu)
> > @@ -139,7 +153,7 @@ unsigned int arch_freq_get_on_cpu(int cpu)
> > if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
> > return 0;
> >
> > - if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true))
> > + if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true, NULL))
> > return per_cpu(samples.khz, cpu);
> >
> > msleep(APERFMPERF_REFRESH_DELAY_MS);
>
>