Re: [PATCH 1/2] pcpcntr: add group allocation/free

From: Dennis Zhou
Date: Tue Aug 22 2023 - 13:02:25 EST


Hi Mateusz,

On Mon, Aug 21, 2023 at 10:28:28PM +0200, Mateusz Guzik wrote:
> Allocations and frees are globally serialized on the pcpu lock (and the
> CPU hotplug lock if enabled, which is the case on Debian).
>
> At least one frequent consumer allocates 4 back-to-back counters (and
> frees them in the same manner), exacerbating the problem.
>
> While this does not fully remedy scalability issues, it is a step
> towards that goal and provides immediate relief.
>
> Signed-off-by: Mateusz Guzik <mjguzik@xxxxxxxxx>

As I mentioned yesterday, I like this approach because instead of making
percpu smarter, we're batching against the higher level inits which I'm
assuming will have additional synchronization requirements.

Below is mostly style changes to conform and a few nits wrt naming.

> ---
> include/linux/percpu_counter.h | 19 ++++++++---
> lib/percpu_counter.c | 61 ++++++++++++++++++++++++----------
> 2 files changed, 57 insertions(+), 23 deletions(-)
>
> diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> index 75b73c83bc9d..ff5850b07124 100644
> --- a/include/linux/percpu_counter.h
> +++ b/include/linux/percpu_counter.h
> @@ -30,17 +30,26 @@ struct percpu_counter {
>
> extern int percpu_counter_batch;
>
> -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
> - struct lock_class_key *key);
> +int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
> + struct lock_class_key *key, u32 count);

1. Can we move count to before the lock_class_key?
2. count is an overloaded term between percpu_counters and
percpu_refcount. Maybe `nr` or `nr_counters` is better?

>
> -#define percpu_counter_init(fbc, value, gfp) \
> +#define percpu_counter_init_many(fbc, value, gfp, count) \
> ({ \
> static struct lock_class_key __key; \
> \
> - __percpu_counter_init(fbc, value, gfp, &__key); \
> + __percpu_counter_init_many(fbc, value, gfp, &__key, count);\
> })
>
> -void percpu_counter_destroy(struct percpu_counter *fbc);
> +
> +#define percpu_counter_init(fbc, value, gfp) \
> + percpu_counter_init_many(fbc, value, gfp, 1)
> +
> +void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 count);
> +static inline void percpu_counter_destroy(struct percpu_counter *fbc)
> +{
> + percpu_counter_destroy_many(fbc, 1);
> +}
> +
> void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
> void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
> s32 batch);
> diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
> index 5004463c4f9f..2a33cf23df55 100644
> --- a/lib/percpu_counter.c
> +++ b/lib/percpu_counter.c
> @@ -151,48 +151,73 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
> }
> EXPORT_SYMBOL(__percpu_counter_sum);
>
> -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
> - struct lock_class_key *key)
> +int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
> + struct lock_class_key *key, u32 count)
> {
> unsigned long flags __maybe_unused;
> + s32 __percpu *counters;
> + u32 i;
>
> - raw_spin_lock_init(&fbc->lock);
> - lockdep_set_class(&fbc->lock, key);
> - fbc->count = amount;
> - fbc->counters = alloc_percpu_gfp(s32, gfp);
> - if (!fbc->counters)
> + counters = __alloc_percpu_gfp(sizeof(*counters) * count,
> + sizeof(*counters), gfp);


So while a bit moot, I think it'd be nice to clarify what we should do
here wrt alignment and batch allocation. There has been a case in the
past where alignment > size.

This is from my batch api implementation:
+ /*
+ * When allocating a batch we need to align the size so that each
+ * element in the batch will have the appropriate alignment.
+ */
+ size = ALIGN(size, align);

So I think some variation of:
element_size = ALIGN(sizeof(*counters, __alignof__(*counters)));
counters = __alloc_percpu_gfp(nr * element_size, __alignof__(*counters), gfp);

While this isn't necessary here for s32's, I think it would be nice to
just set the precedent so we preserve alignment asks for future users if
they use this as a template.

> + if (!counters) {
> + fbc[0].counters = NULL;
> return -ENOMEM;
> + }
>
> - debug_percpu_counter_activate(fbc);
> + for (i = 0; i < count; i++) {
> + raw_spin_lock_init(&fbc[i].lock);
> + lockdep_set_class(&fbc[i].lock, key);
> +#ifdef CONFIG_HOTPLUG_CPU
> + INIT_LIST_HEAD(&fbc[i].list);
> +#endif
> + fbc[i].count = amount;
> + fbc[i].counters = &counters[i];

This would have to become some (void *) math tho with element_size.

> +
> + debug_percpu_counter_activate(&fbc[i]);
> + }
>
> #ifdef CONFIG_HOTPLUG_CPU
> - INIT_LIST_HEAD(&fbc->list);
> spin_lock_irqsave(&percpu_counters_lock, flags);
> - list_add(&fbc->list, &percpu_counters);
> + for (i = 0; i < count; i++) {
> + list_add(&fbc[i].list, &percpu_counters);
> + }

nit: we don't add {} for single line loops.

> spin_unlock_irqrestore(&percpu_counters_lock, flags);
> #endif
> return 0;
> }
> -EXPORT_SYMBOL(__percpu_counter_init);
> +EXPORT_SYMBOL(__percpu_counter_init_many);
>
> -void percpu_counter_destroy(struct percpu_counter *fbc)
> +void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 count)
> {
> unsigned long flags __maybe_unused;
> + u32 i;
>
> - if (!fbc->counters)
> + if (WARN_ON_ONCE(!fbc))
> return;
>
> - debug_percpu_counter_deactivate(fbc);
> + if (!fbc[0].counters)
> + return;
> +
> + for (i = 0; i < count; i++) {
> + debug_percpu_counter_deactivate(&fbc[i]);
> + }
>

nit: no {}.

> #ifdef CONFIG_HOTPLUG_CPU
> spin_lock_irqsave(&percpu_counters_lock, flags);
> - list_del(&fbc->list);
> + for (i = 0; i < count; i++) {
> + list_del(&fbc[i].list);
> + }

nit: no {}.

> spin_unlock_irqrestore(&percpu_counters_lock, flags);
> #endif
> - free_percpu(fbc->counters);
> - fbc->counters = NULL;
> +
> + free_percpu(fbc[0].counters);
> +
> + for (i = 0; i < count; i++) {
> + fbc[i].counters = NULL;
> + }

nit: no {}.

> }
> -EXPORT_SYMBOL(percpu_counter_destroy);
> +EXPORT_SYMBOL(percpu_counter_destroy_many);
>
> int percpu_counter_batch __read_mostly = 32;
> EXPORT_SYMBOL(percpu_counter_batch);
> --
> 2.39.2
>

Thanks,
Dennis