Re: [PATCH RFC v3 5/9] mm/slub: add opt-in percpu array cache of objects

From: Suren Baghdasaryan
Date: Fri Dec 15 2023 - 16:18:25 EST


On Fri, Dec 15, 2023 at 10:28 AM Suren Baghdasaryan <surenb@xxxxxxxxxx> wrote:
>
> On Wed, Nov 29, 2023 at 1:53 AM Vlastimil Babka <vbabka@xxxxxxx> wrote:
> >
> > kmem_cache_setup_percpu_array() will allocate a per-cpu array for
> > caching alloc/free objects of given size for the cache. The cache
> > has to be created with SLAB_NO_MERGE flag.
> >
> > When empty, half of the array is filled by an internal bulk alloc
> > operation. When full, half of the array is flushed by an internal bulk
> > free operation.
> >
> > The array does not distinguish NUMA locality of the cached objects. If
> > an allocation is requested with kmem_cache_alloc_node() with numa node
> > not equal to NUMA_NO_NODE, the array is bypassed.
> >
> > The bulk operations exposed to slab users also try to utilize the array
> > when possible, but leave the array empty or full and use the bulk
> > alloc/free only to finish the operation itself. If kmemcg is enabled and
> > active, bulk freeing skips the array completely as it would be less
> > efficient to use it.
> >
> > The locking scheme is copied from the page allocator's pcplists, based
> > on embedded spin locks. Interrupts are not disabled, only preemption
> > (cpu migration on RT). Trylock is attempted to avoid deadlock due to an
> > interrupt; trylock failure means the array is bypassed.
> >
> > Sysfs stat counters alloc_cpu_cache and free_cpu_cache count objects
> > allocated or freed using the percpu array; counters cpu_cache_refill and
> > cpu_cache_flush count objects refilled or flushed from the array.
> >
> > kmem_cache_prefill_percpu_array() can be called to fill the array on
> > the current cpu to at least the given number of objects. However this is
> > only opportunistic as there's no cpu pinning between the prefill and
> > usage, and trylocks may fail when the usage is in an irq handler.
> > Therefore allocations cannot rely on the array for success even after
> > the prefill. But misses should be rare enough that e.g. GFP_ATOMIC
> > allocations should be acceptable after the refill.
> >
> > When slub_debug is enabled for a cache with percpu array, the objects in
> > the array are considered as allocated from the slub_debug perspective,
> > and the alloc/free debugging hooks occur when moving the objects between
> > the array and slab pages. This means that e.g. a use-after-free that
> > occurs for an object cached in the array is undetected. Collected
> > alloc/free stacktraces might also be less useful. This limitation could
> > be changed in the future.
> >
> > On the other hand, KASAN, kmemcg and other hooks are executed on actual
> > allocations and frees by kmem_cache users even if those use the array,
> > so their debugging or accounting accuracy should be unaffected.
> >
> > Signed-off-by: Vlastimil Babka <vbabka@xxxxxxx>
> > ---
> > include/linux/slab.h | 4 +
> > include/linux/slub_def.h | 12 ++
> > mm/Kconfig | 1 +
> > mm/slub.c | 457 ++++++++++++++++++++++++++++++++++++++++++++++-
> > 4 files changed, 468 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/linux/slab.h b/include/linux/slab.h
> > index d6d6ffeeb9a2..fe0c0981be59 100644
> > --- a/include/linux/slab.h
> > +++ b/include/linux/slab.h
> > @@ -197,6 +197,8 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
> > void kmem_cache_destroy(struct kmem_cache *s);
> > int kmem_cache_shrink(struct kmem_cache *s);
> >
> > +int kmem_cache_setup_percpu_array(struct kmem_cache *s, unsigned int count);
> > +
> > /*
> > * Please use this macro to create slab caches. Simply specify the
> > * name of the structure and maybe some flags that are listed above.
> > @@ -512,6 +514,8 @@ void kmem_cache_free(struct kmem_cache *s, void *objp);
> > void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
> > int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p);
> >
> > +int kmem_cache_prefill_percpu_array(struct kmem_cache *s, unsigned int count, gfp_t gfp);
> > +
> > static __always_inline void kfree_bulk(size_t size, void **p)
> > {
> > kmem_cache_free_bulk(NULL, size, p);
> > diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
> > index deb90cf4bffb..2083aa849766 100644
> > --- a/include/linux/slub_def.h
> > +++ b/include/linux/slub_def.h
> > @@ -13,8 +13,10 @@
> > #include <linux/local_lock.h>
> >
> > enum stat_item {
> > + ALLOC_PCA, /* Allocation from percpu array cache */
> > ALLOC_FASTPATH, /* Allocation from cpu slab */
> > ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */
> > + FREE_PCA, /* Free to percpu array cache */
> > FREE_FASTPATH, /* Free to cpu slab */
> > FREE_SLOWPATH, /* Freeing not to cpu slab */
> > FREE_FROZEN, /* Freeing to frozen slab */
> > @@ -39,6 +41,8 @@ enum stat_item {
> > CPU_PARTIAL_FREE, /* Refill cpu partial on free */
> > CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */
> > CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */
> > + PCA_REFILL, /* Refilling empty percpu array cache */
> > + PCA_FLUSH, /* Flushing full percpu array cache */
> > NR_SLUB_STAT_ITEMS
> > };
> >
> > @@ -66,6 +70,13 @@ struct kmem_cache_cpu {
> > };
> > #endif /* CONFIG_SLUB_TINY */
> >
> > +struct slub_percpu_array {
> > + spinlock_t lock;
> > + unsigned int count;
> > + unsigned int used;
> > + void * objects[];
> > +};
> > +
> > #ifdef CONFIG_SLUB_CPU_PARTIAL
> > #define slub_percpu_partial(c) ((c)->partial)
> >
> > @@ -99,6 +110,7 @@ struct kmem_cache {
> > #ifndef CONFIG_SLUB_TINY
> > struct kmem_cache_cpu __percpu *cpu_slab;
> > #endif
> > + struct slub_percpu_array __percpu *cpu_array;
> > /* Used for retrieving partial slabs, etc. */
> > slab_flags_t flags;
> > unsigned long min_partial;
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index 89971a894b60..aa53c51bb4a6 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -237,6 +237,7 @@ choice
> > config SLAB_DEPRECATED
> > bool "SLAB (DEPRECATED)"
> > depends on !PREEMPT_RT
> > + depends on BROKEN
> > help
> > Deprecated and scheduled for removal in a few cycles. Replaced by
> > SLUB.
> > diff --git a/mm/slub.c b/mm/slub.c
> > index 59912a376c6d..f08bd71c244f 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -188,6 +188,79 @@ do { \
> > #define USE_LOCKLESS_FAST_PATH() (false)
> > #endif
> >
> > +/* copy/pasted from mm/page_alloc.c */
> > +
> > +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
> > +/*
> > + * On SMP, spin_trylock is sufficient protection.
> > + * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
> > + */
> > +#define pcp_trylock_prepare(flags) do { } while (0)
> > +#define pcp_trylock_finish(flag) do { } while (0)
> > +#else
> > +
> > +/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
> > +#define pcp_trylock_prepare(flags) local_irq_save(flags)
> > +#define pcp_trylock_finish(flags) local_irq_restore(flags)
> > +#endif
> > +
> > +/*
> > + * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
> > + * a migration causing the wrong PCP to be locked and remote memory being
> > + * potentially allocated, pin the task to the CPU for the lookup+lock.
> > + * preempt_disable is used on !RT because it is faster than migrate_disable.
> > + * migrate_disable is used on RT because otherwise RT spinlock usage is
> > + * interfered with and a high priority task cannot preempt the allocator.
> > + */
> > +#ifndef CONFIG_PREEMPT_RT
> > +#define pcpu_task_pin() preempt_disable()
> > +#define pcpu_task_unpin() preempt_enable()
> > +#else
> > +#define pcpu_task_pin() migrate_disable()
> > +#define pcpu_task_unpin() migrate_enable()
> > +#endif
> > +
> > +/*
> > + * Generic helper to look up and lock a per-cpu variable with an embedded spinlock.
> > + * Return value should be used with equivalent unlock helper.
> > + */
> > +#define pcpu_spin_lock(type, member, ptr) \
> > +({ \
> > + type *_ret; \
> > + pcpu_task_pin(); \
> > + _ret = this_cpu_ptr(ptr); \
> > + spin_lock(&_ret->member); \
> > + _ret; \
> > +})
> > +
> > +#define pcpu_spin_trylock(type, member, ptr) \
> > +({ \
> > + type *_ret; \
> > + pcpu_task_pin(); \
> > + _ret = this_cpu_ptr(ptr); \
> > + if (!spin_trylock(&_ret->member)) { \
> > + pcpu_task_unpin(); \
> > + _ret = NULL; \
> > + } \
> > + _ret; \
> > +})
> > +
> > +#define pcpu_spin_unlock(member, ptr) \
> > +({ \
> > + spin_unlock(&ptr->member); \
> > + pcpu_task_unpin(); \
> > +})
> > +
> > +/* struct slub_percpu_array specific helpers. */
> > +#define pca_spin_lock(ptr) \
> > + pcpu_spin_lock(struct slub_percpu_array, lock, ptr)
> > +
> > +#define pca_spin_trylock(ptr) \
> > + pcpu_spin_trylock(struct slub_percpu_array, lock, ptr)
> > +
> > +#define pca_spin_unlock(ptr) \
> > + pcpu_spin_unlock(lock, ptr)
> > +
> > #ifndef CONFIG_SLUB_TINY
> > #define __fastpath_inline __always_inline
> > #else
> > @@ -3454,6 +3527,78 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
> > 0, sizeof(void *));
> > }
> >
> > +static bool refill_pca(struct kmem_cache *s, unsigned int count, gfp_t gfp);
> > +
> > +static __fastpath_inline
> > +void *alloc_from_pca(struct kmem_cache *s, gfp_t gfp)
> > +{
> > + unsigned long __maybe_unused UP_flags;
> > + struct slub_percpu_array *pca;
> > + void *object;
> > +
> > +retry:
> > + pcp_trylock_prepare(UP_flags);
> > + pca = pca_spin_trylock(s->cpu_array);
> > +
> > + if (unlikely(!pca)) {
> > + pcp_trylock_finish(UP_flags);
> > + return NULL;
> > + }
> > +
> > + if (unlikely(pca->used == 0)) {
> > + unsigned int batch = pca->count / 2;
> > +
> > + pca_spin_unlock(pca);
> > + pcp_trylock_finish(UP_flags);
> > +
> > + if (!gfpflags_allow_blocking(gfp) || in_irq())
> > + return NULL;
> > +
> > + if (refill_pca(s, batch, gfp))
> > + goto retry;
> > +
> > + return NULL;
> > + }
> > +
> > + object = pca->objects[--pca->used];
> > +
> > + pca_spin_unlock(pca);
> > + pcp_trylock_finish(UP_flags);
> > +
> > + stat(s, ALLOC_PCA);
> > +
> > + return object;
> > +}
> > +
> > +static __fastpath_inline
> > +int alloc_from_pca_bulk(struct kmem_cache *s, size_t size, void **p)
> > +{
> > + unsigned long __maybe_unused UP_flags;
> > + struct slub_percpu_array *pca;
> > +
> > + pcp_trylock_prepare(UP_flags);
> > + pca = pca_spin_trylock(s->cpu_array);
> > +
> > + if (unlikely(!pca)) {
> > + size = 0;
> > + goto failed;
> > + }
> > +
> > + if (pca->used < size)
> > + size = pca->used;
> > +
> > + for (int i = size; i > 0;) {
> > + p[--i] = pca->objects[--pca->used];
> > + }
> > +
> > + pca_spin_unlock(pca);
> > + stat_add(s, ALLOC_PCA, size);
> > +
> > +failed:
> > + pcp_trylock_finish(UP_flags);
> > + return size;
> > +}
> > +
> > /*
> > * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
> > * have the fastpath folded into their functions. So no function call
> > @@ -3479,7 +3624,11 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
> > if (unlikely(object))
> > goto out;
> >
> > - object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
> > + if (s->cpu_array && (node == NUMA_NO_NODE))
> > + object = alloc_from_pca(s, gfpflags);
> > +
> > + if (!object)
> > + object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
> >
> > maybe_wipe_obj_freeptr(s, object);
> > init = slab_want_init_on_alloc(gfpflags, s);
> > @@ -3726,6 +3875,81 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> > discard_slab(s, slab);
> > }
> >
> > +static bool flush_pca(struct kmem_cache *s, unsigned int count);
> > +
> > +static __fastpath_inline
> > +bool free_to_pca(struct kmem_cache *s, void *object)
> > +{
> > + unsigned long __maybe_unused UP_flags;
> > + struct slub_percpu_array *pca;
> > +
> > +retry:
> > + pcp_trylock_prepare(UP_flags);
> > + pca = pca_spin_trylock(s->cpu_array);
> > +
> > + if (!pca) {
> > + pcp_trylock_finish(UP_flags);
> > + return false;
> > + }
> > +
> > + if (pca->used == pca->count) {
> > + unsigned int batch = pca->count / 2;
> > +
> > + pca_spin_unlock(pca);
> > + pcp_trylock_finish(UP_flags);
> > +
> > + if (in_irq())
> > + return false;
> > +
> > + if (!flush_pca(s, batch))
> > + return false;
> > +
> > + goto retry;
> > + }
> > +
> > + pca->objects[pca->used++] = object;
> > +
> > + pca_spin_unlock(pca);
> > + pcp_trylock_finish(UP_flags);
> > +
> > + stat(s, FREE_PCA);
> > +
> > + return true;
> > +}
> > +
> > +static __fastpath_inline
> > +size_t free_to_pca_bulk(struct kmem_cache *s, size_t size, void **p)
> > +{
> > + unsigned long __maybe_unused UP_flags;
> > + struct slub_percpu_array *pca;
> > + bool init;
> > +
> > + pcp_trylock_prepare(UP_flags);
> > + pca = pca_spin_trylock(s->cpu_array);
> > +
> > + if (unlikely(!pca)) {
> > + size = 0;
> > + goto failed;
> > + }
> > +
> > + if (pca->count - pca->used < size)
> > + size = pca->count - pca->used;
> > +
> > + init = slab_want_init_on_free(s);
> > +
> > + for (size_t i = 0; i < size; i++) {
> > + if (likely(slab_free_hook(s, p[i], init)))
> > + pca->objects[pca->used++] = p[i];
> > + }
> > +
> > + pca_spin_unlock(pca);
> > + stat_add(s, FREE_PCA, size);
> > +
> > +failed:
> > + pcp_trylock_finish(UP_flags);
> > + return size;
> > +}
> > +
> > #ifndef CONFIG_SLUB_TINY
> > /*
> > * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
> > @@ -3811,7 +4035,12 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
> > {
> > memcg_slab_free_hook(s, slab, &object, 1);
> >
> > - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s))))
> > + if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s))))
> > + return;
> > +
> > + if (s->cpu_array)
> > + free_to_pca(s, object);
>
> free_to_pca() can return false and leave the object alive. I think you
> need to handle the failure case here to avoid leaks.
>
> > + else
> > do_slab_free(s, slab, object, object, 1, addr);
> > }
> >
> > @@ -3956,6 +4185,26 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
> > if (!size)
> > return;
> >
> > + /*
> > + * In case the objects might need memcg_slab_free_hook(), skip the array
> > + * because the hook is not effective with single objects and benefits
> > + * from groups of objects from a single slab that the detached freelist
> > + * builds. But once we build the detached freelist, it's wasteful to
> > + * throw it away and put the objects into the array.
> > + *
> > + * XXX: This test could be cache-specific if it was not possible to use
> > + * __GFP_ACCOUNT with caches that are not SLAB_ACCOUNT
> > + */
> > + if (s && s->cpu_array && !memcg_kmem_online()) {
> > + size_t pca_freed = free_to_pca_bulk(s, size, p);
> > +
> > + if (pca_freed == size)
> > + return;
> > +
> > + p += pca_freed;
> > + size -= pca_freed;
> > + }
> > +
> > do {
> > struct detached_freelist df;
> >
> > @@ -4073,7 +4322,8 @@ static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
> > int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
> > void **p)
> > {
> > - int i;
> > + int from_pca = 0;
> > + int allocated = 0;
> > struct obj_cgroup *objcg = NULL;
> >
> > if (!size)
> > @@ -4084,19 +4334,147 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
> > if (unlikely(!s))
> > return 0;
> >
> > - i = __kmem_cache_alloc_bulk(s, flags, size, p);
> > + if (s->cpu_array)
> > + from_pca = alloc_from_pca_bulk(s, size, p);
> > +
> > + if (from_pca < size) {
> > + allocated = __kmem_cache_alloc_bulk(s, flags, size-from_pca,
> > + p+from_pca);
> > + if (allocated == 0 && from_pca > 0) {
> > + __kmem_cache_free_bulk(s, from_pca, p);
> > + }
> > + }
> > +
> > + allocated += from_pca;
> >
> > /*
> > * memcg and kmem_cache debug support and memory initialization.
> > * Done outside of the IRQ disabled fastpath loop.
> > */
> > - if (i != 0)
> > + if (allocated != 0)
> > slab_post_alloc_hook(s, objcg, flags, size, p,
> > slab_want_init_on_alloc(flags, s), s->object_size);
> > - return i;
> > + return allocated;
> > }
> > EXPORT_SYMBOL(kmem_cache_alloc_bulk);
> >
> > +static bool refill_pca(struct kmem_cache *s, unsigned int count, gfp_t gfp)
> > +{
> > + void *objects[32];
> > + unsigned int batch, allocated;
> > + unsigned long __maybe_unused UP_flags;
> > + struct slub_percpu_array *pca;
> > +
> > +bulk_alloc:
> > + batch = min(count, 32U);
>
> Do you cap each batch at 32 to avoid overshooting too much (same in
> flush_pca())? If so, it would be good to have a comment here. Also,
> maybe this hardcoded 32 should be a function of pca->count instead? If
> we set up a pca array with pca->count larger than 64 then the refill
> count of pca->count/2 will always end up higher than 32, so at the end
> we will have to loop back (goto bulk_alloc) to allocate more objects.

Ah, I just noticed that you are using objects[32] and that's forcing
this limitation. Please ignore my previous comment.

>
> > +
> > + allocated = __kmem_cache_alloc_bulk(s, gfp, batch, &objects[0]);
> > + if (!allocated)
> > + return false;
> > +
> > + pcp_trylock_prepare(UP_flags);
> > + pca = pca_spin_trylock(s->cpu_array);
> > + if (!pca) {
> > + pcp_trylock_finish(UP_flags);
> > + return false;
> > + }
> > +
> > + batch = min(allocated, pca->count - pca->used);
> > +
> > + for (unsigned int i = 0; i < batch; i++) {
> > + pca->objects[pca->used++] = objects[i];
> > + }
> > +
> > + pca_spin_unlock(pca);
> > + pcp_trylock_finish(UP_flags);
> > +
> > + stat_add(s, PCA_REFILL, batch);
> > +
> > + /*
> > + * We could have migrated to a different cpu or somebody else freed to the
> > + * pca while we were bulk allocating, and now we have too many objects
> > + */
> > + if (batch < allocated) {
> > + __kmem_cache_free_bulk(s, allocated - batch, &objects[batch]);
> > + } else {
> > + count -= batch;
> > + if (count > 0)
> > + goto bulk_alloc;
> > + }
> > +
> > + return true;
> > +}
> > +
> > +static bool flush_pca(struct kmem_cache *s, unsigned int count)
> > +{
> > + void *objects[32];
> > + unsigned int batch, remaining;
> > + unsigned long __maybe_unused UP_flags;
> > + struct slub_percpu_array *pca;
> > +
> > +next_batch:
> > + batch = min(count, 32);
> > +
> > + pcp_trylock_prepare(UP_flags);
> > + pca = pca_spin_trylock(s->cpu_array);
> > + if (!pca) {
> > + pcp_trylock_finish(UP_flags);
> > + return false;
> > + }
> > +
> > + batch = min(batch, pca->used);
> > +
> > + for (unsigned int i = 0; i < batch; i++) {
> > + objects[i] = pca->objects[--pca->used];
> > + }
> > +
> > + remaining = pca->used;
> > +
> > + pca_spin_unlock(pca);
> > + pcp_trylock_finish(UP_flags);
> > +
> > + __kmem_cache_free_bulk(s, batch, &objects[0]);
> > +
> > + stat_add(s, PCA_FLUSH, batch);
> > +
> > + if (batch < count && remaining > 0) {
> > + count -= batch;
> > + goto next_batch;
> > + }
> > +
> > + return true;
> > +}
> > +
> > +/* Do not call from irq handler nor with irqs disabled */
> > +int kmem_cache_prefill_percpu_array(struct kmem_cache *s, unsigned int count,
> > + gfp_t gfp)
> > +{
> > + struct slub_percpu_array *pca;
> > + unsigned int used;
> > +
> > + lockdep_assert_no_hardirq();
> > +
> > + if (!s->cpu_array)
> > + return -EINVAL;
> > +
> > + /* racy but we don't care */
> > + pca = raw_cpu_ptr(s->cpu_array);
> > +
> > + used = READ_ONCE(pca->used);
> > +
> > + if (used >= count)
> > + return 0;
> > +
> > + if (pca->count < count)
> > + return -EINVAL;
> > +
> > + count -= used;
> > +
> > + if (!refill_pca(s, count, gfp))
> > + return -ENOMEM;
> > +
> > + return 0;
> > +}
> >
> > /*
> > * Object placement in a slab is made very easy because we always start at
> > @@ -5167,6 +5545,65 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
> > return 0;
> > }
> >
> > +/**
> > + * kmem_cache_setup_percpu_array - Create a per-cpu array cache for the cache
> > + * @s: The cache to add per-cpu array. Must be created with SLAB_NO_MERGE flag.
> > + * @count: Size of the per-cpu array.
> > + *
> > + * After this call, allocations from the cache go through a percpu array. When
> > + * it becomes empty, half is refilled with a bulk allocation. When it becomes
> > + * full, half is flushed with a bulk free operation.
> > + *
> > + * Using the array cache is not guaranteed, i.e. it can be bypassed if its lock
> > + * cannot be obtained. The array cache also does not distinguish NUMA nodes, so
> > + * allocations via kmem_cache_alloc_node() with a node specified other than
> > + * NUMA_NO_NODE will bypass the cache.
> > + *
> > + * Bulk allocation and free operations also try to use the array.
> > + *
> > + * kmem_cache_prefill_percpu_array() can be used to pre-fill the array cache
> > + * before e.g. entering a restricted context. It is however not guaranteed that
> > + * the caller will be able to subsequently consume the prefilled cache. Such
> > + * failures should be however sufficiently rare so after the prefill,
> > + * allocations using GFP_ATOMIC | __GFP_NOFAIL are acceptable for objects up to
> > + * the prefilled amount.
> > + *
> > + * Limitations: when slub_debug is enabled for the cache, all relevant actions
> > + * (i.e. poisoning, obtaining stacktraces) and checks happen when objects move
> > + * between the array cache and slab pages, which may result in e.g. not
> > + * detecting a use-after-free while the object is in the array cache, and the
> > + * stacktraces may be less useful.
> > + *
> > + * Return: 0 if OK, -EINVAL on caches without SLAB_NO_MERGE or with the array
> > + * already created, -ENOMEM when the per-cpu array creation fails.
> > + */
> > +int kmem_cache_setup_percpu_array(struct kmem_cache *s, unsigned int count)
> > +{
> > + int cpu;
> > +
> > + if (WARN_ON_ONCE(!(s->flags & SLAB_NO_MERGE)))
> > + return -EINVAL;
> > +
> > + if (s->cpu_array)
> > + return -EINVAL;
> > +
> > + s->cpu_array = __alloc_percpu(struct_size(s->cpu_array, objects, count),
> > + sizeof(void *));
>
> Maybe I missed it, but where do you free s->cpu_array? I see
> __kmem_cache_release() freeing s->cpu_slab but s->cpu_array seems to
> be left alive...
>
> > +
> > + if (!s->cpu_array)
> > + return -ENOMEM;
> > +
> > + for_each_possible_cpu(cpu) {
> > + struct slub_percpu_array *pca = per_cpu_ptr(s->cpu_array, cpu);
> > +
> > + spin_lock_init(&pca->lock);
> > + pca->count = count;
> > + pca->used = 0;
> > + }
> > +
> > + return 0;
> > +}
> > +
> > #ifdef SLAB_SUPPORTS_SYSFS
> > static int count_inuse(struct slab *slab)
> > {
> > @@ -5944,8 +6381,10 @@ static ssize_t text##_store(struct kmem_cache *s, \
> > } \
> > SLAB_ATTR(text); \
> >
> > +STAT_ATTR(ALLOC_PCA, alloc_cpu_cache);
> > STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
> > STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
> > +STAT_ATTR(FREE_PCA, free_cpu_cache);
> > STAT_ATTR(FREE_FASTPATH, free_fastpath);
> > STAT_ATTR(FREE_SLOWPATH, free_slowpath);
> > STAT_ATTR(FREE_FROZEN, free_frozen);
> > @@ -5970,6 +6409,8 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
> > STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
> > STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
> > STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
> > +STAT_ATTR(PCA_REFILL, cpu_cache_refill);
> > +STAT_ATTR(PCA_FLUSH, cpu_cache_flush);
> > #endif /* CONFIG_SLUB_STATS */
> >
> > #ifdef CONFIG_KFENCE
> > @@ -6031,8 +6472,10 @@ static struct attribute *slab_attrs[] = {
> > &remote_node_defrag_ratio_attr.attr,
> > #endif
> > #ifdef CONFIG_SLUB_STATS
> > + &alloc_cpu_cache_attr.attr,
> > &alloc_fastpath_attr.attr,
> > &alloc_slowpath_attr.attr,
> > + &free_cpu_cache_attr.attr,
> > &free_fastpath_attr.attr,
> > &free_slowpath_attr.attr,
> > &free_frozen_attr.attr,
> > @@ -6057,6 +6500,8 @@ static struct attribute *slab_attrs[] = {
> > &cpu_partial_free_attr.attr,
> > &cpu_partial_node_attr.attr,
> > &cpu_partial_drain_attr.attr,
> > + &cpu_cache_refill_attr.attr,
> > + &cpu_cache_flush_attr.attr,
> > #endif
> > #ifdef CONFIG_FAILSLAB
> > &failslab_attr.attr,
> >
> > --
> > 2.43.0
> >
> >