Re: [rfc 04/45] cpu alloc: Use in SLUB

From: Mathieu Desnoyers
Date: Tue Nov 20 2007 - 07:42:21 EST


* clameter@xxxxxxx (clameter@xxxxxxx) wrote:
> Using cpu alloc removes the needs for the per cpu arrays in the kmem_cache struct.
> These could get quite big if we have to support system of up to thousands of cpus.
> The use of alloc_percpu means that:
>
> 1. The size of kmem_cache for SMP configuration shrinks since we will only
> need 1 pointer instead of NR_CPUS. The same pointer can be used by all
> processors. Reduces cache footprint of the allocator.
>
> 2. We can dynamically size kmem_cache according to the actual nodes in the
> system meaning less memory overhead for configurations that may potentially
> support up to 1k NUMA nodes.
>
> 3. We can remove the diddle widdle with allocating and releasing kmem_cache_cpu
> structures when bringing up and shuttting down cpus. The allocpercpu
> logic will do it all for us. Removes some portions of the cpu hotplug
> functionality.
>
> 4. Fastpath performance increases by another 20% vs. the earlier improvements.
> Instead of having fastpath with 45-50 cycles it is now possible to get
> below 40.
>
> Remove the CONFIG_FAST_CMPXCHG version since this patch makes SLUB use CPU ops
> instead.
>
> Signed-off-by: Christoph Lameter <clameter@xxxxxxx>
> ---
> arch/x86/Kconfig | 4
> include/linux/slub_def.h | 6 -
> mm/slub.c | 229 ++++++++++-------------------------------------
> 3 files changed, 52 insertions(+), 187 deletions(-)
>
> Index: linux-2.6/include/linux/slub_def.h
> ===================================================================
> --- linux-2.6.orig/include/linux/slub_def.h 2007-11-19 15:45:08.270140279 -0800
> +++ linux-2.6/include/linux/slub_def.h 2007-11-19 15:53:25.869890760 -0800
> @@ -34,6 +34,7 @@ struct kmem_cache_node {
> * Slab cache management.
> */
> struct kmem_cache {
> + struct kmem_cache_cpu *cpu_slab;
> /* Used for retriving partial slabs etc */
> unsigned long flags;
> int size; /* The size of an object including meta data */
> @@ -63,11 +64,6 @@ struct kmem_cache {
> int defrag_ratio;
> struct kmem_cache_node *node[MAX_NUMNODES];
> #endif
> -#ifdef CONFIG_SMP
> - struct kmem_cache_cpu *cpu_slab[NR_CPUS];
> -#else
> - struct kmem_cache_cpu cpu_slab;
> -#endif
> };
>
> /*
> Index: linux-2.6/mm/slub.c
> ===================================================================
> --- linux-2.6.orig/mm/slub.c 2007-11-19 15:45:08.278140252 -0800
> +++ linux-2.6/mm/slub.c 2007-11-19 15:54:10.513640214 -0800
> @@ -239,15 +239,6 @@ static inline struct kmem_cache_node *ge
> #endif
> }
>
> -static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
> -{
> -#ifdef CONFIG_SMP
> - return s->cpu_slab[cpu];
> -#else
> - return &s->cpu_slab;
> -#endif
> -}
> -
> /*
> * The end pointer in a slab is special. It points to the first object in the
> * slab but has bit 0 set to mark it.
> @@ -1472,7 +1463,7 @@ static inline void flush_slab(struct kme
> */
> static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
> {
> - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
> + struct kmem_cache_cpu *c = CPU_PTR(s->cpu_slab, cpu);
>
> if (likely(c && c->page))
> flush_slab(s, c);
> @@ -1487,15 +1478,7 @@ static void flush_cpu_slab(void *d)
>
> static void flush_all(struct kmem_cache *s)
> {
> -#ifdef CONFIG_SMP
> on_each_cpu(flush_cpu_slab, s, 1, 1);
> -#else
> - unsigned long flags;
> -
> - local_irq_save(flags);
> - flush_cpu_slab(s);
> - local_irq_restore(flags);
> -#endif
> }
>

Normally :

You can't use on_each_cpu if interrupts are already disabled. Therefore,
the implementation using "local_irq_disable/enable" in smp.h for the UP
case is semantically correct and there is no need to use a save/restore.
So just using on_each_cpu should be enough here.

I also wonder about flush_cpu_slab execution on _other_ cpus. I am not
convinced interrupts are disabled when it executes... am I missing
something?


> /*
> @@ -1511,6 +1494,15 @@ static inline int node_match(struct kmem
> return 1;
> }
>
> +static inline int cpu_node_match(struct kmem_cache_cpu *c, int node)
> +{
> +#ifdef CONFIG_NUMA
> + if (node != -1 && __CPU_READ(c->node) != node)
> + return 0;
> +#endif
> + return 1;
> +}
> +
> /* Allocate a new slab and make it the current cpu slab */
> static noinline unsigned long get_new_slab(struct kmem_cache *s,
> struct kmem_cache_cpu **pc, gfp_t gfpflags, int node)
> @@ -1529,7 +1521,7 @@ static noinline unsigned long get_new_sl
> if (!page)
> return 0;
>
> - *pc = c = get_cpu_slab(s, smp_processor_id());
> + *pc = c = THIS_CPU(s->cpu_slab);

I think the preferred coding style is :

c = THIS_CPU(s->cpu_slab);
*pc = c;

> if (c->page)
> flush_slab(s, c);
> c->page = page;
> @@ -1554,16 +1546,18 @@ static noinline unsigned long get_new_sl
> * we need to allocate a new slab. This is slowest path since we may sleep.
> */
> static void *__slab_alloc(struct kmem_cache *s,
> - gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
> + gfp_t gfpflags, int node, void *addr)
> {
> void **object;
> unsigned long state;
> -#ifdef CONFIG_FAST_CMPXCHG_LOCAL
> + struct kmem_cache_cpu *c;
> +#ifdef CONFIG_FAST_CPU_OPS
> unsigned long flags;
>
> local_irq_save(flags);
> preempt_enable_no_resched();
> #endif
> + c = THIS_CPU(s->cpu_slab);
> if (likely(c->page)) {
> state = slab_lock(c->page);
>
> @@ -1597,7 +1591,7 @@ load_freelist:
> unlock_out:
> slab_unlock(c->page, state);
> out:
> -#ifdef CONFIG_FAST_CMPXCHG_LOCAL
> +#ifdef CONFIG_FAST_CPU_OPS
> preempt_disable();
> local_irq_restore(flags);
> #endif
> @@ -1640,26 +1634,24 @@ static void __always_inline *slab_alloc(
> void **object;
> struct kmem_cache_cpu *c;
>
> -#ifdef CONFIG_FAST_CMPXCHG_LOCAL
> - c = get_cpu_slab(s, get_cpu());
> +#ifdef CONFIG_FAST_CPU_OPS

I wonder.. are there some architectures that would provide fast
cmpxchg_local but not fast cpu ops ?

> + c = s->cpu_slab;
> do {
> - object = c->freelist;
> - if (unlikely(is_end(object) || !node_match(c, node))) {
> - object = __slab_alloc(s, gfpflags, node, addr, c);
> - if (unlikely(!object)) {
> - put_cpu();
> + object = __CPU_READ(c->freelist);
> + if (unlikely(is_end(object) ||
> + !cpu_node_match(c, node))) {
> + object = __slab_alloc(s, gfpflags, node, addr);
> + if (unlikely(!object))
> goto out;
> - }
> break;
> }
> - } while (cmpxchg_local(&c->freelist, object, object[c->offset])
> - != object);
> - put_cpu();
> + } while (CPU_CMPXCHG(c->freelist, object,
> + object[__CPU_READ(c->offset)]) != object);

Hrm. What happens here if we call __slab_alloc, get a valid object, then
have a CPU_CMPXCHG that fails, restart the loop.. is this case taken
care of or do we end up having an unreferenced object ? Maybe there is
some logic in freelist that takes care of it ?

Also, we have to be aware that we can now change CPU between the

__CPU_READ and the CPU_CMPXCHG. (also : should it be a __CPU_CMPXCHG ?)

But since "object" contains information specific to the local CPU, the
cmpxchg should fail if we are migrated and everything should be ok.

Hrm, actually, the

c = s->cpu_slab;

should probably be after the object = __CPU_READ(c->freelist); ?

The cpu_read acts as a safeguard checking that we do not change CPU
between the read and the cmpxchg. If we are preempted between the "c"
read and the cpu_read, we could do a !cpu_node_match(c, node) check that
would apply to the wrong cpu.


> #else
> unsigned long flags;
>
> local_irq_save(flags);
> - c = get_cpu_slab(s, smp_processor_id());
> + c = THIS_CPU(s->cpu_slab);
> if (unlikely((is_end(c->freelist)) || !node_match(c, node))) {
>
> object = __slab_alloc(s, gfpflags, node, addr, c);
> @@ -1709,7 +1701,7 @@ static void __slab_free(struct kmem_cach
> void **object = (void *)x;
> unsigned long state;
>
> -#ifdef CONFIG_FAST_CMPXCHG_LOCAL
> +#ifdef CONFIG_FAST_CPU_OPS
> unsigned long flags;
>
> local_irq_save(flags);
> @@ -1739,7 +1731,7 @@ checks_ok:
>
> out_unlock:
> slab_unlock(page, state);
> -#ifdef CONFIG_FAST_CMPXCHG_LOCAL
> +#ifdef CONFIG_FAST_CPU_OPS
> local_irq_restore(flags);
> #endif
> return;
> @@ -1752,7 +1744,7 @@ slab_empty:
> remove_partial(s, page);
>
> slab_unlock(page, state);
> -#ifdef CONFIG_FAST_CMPXCHG_LOCAL
> +#ifdef CONFIG_FAST_CPU_OPS
> local_irq_restore(flags);
> #endif
> discard_slab(s, page);
> @@ -1781,13 +1773,13 @@ static void __always_inline slab_free(st
> void **object = (void *)x;
> struct kmem_cache_cpu *c;
>
> -#ifdef CONFIG_FAST_CMPXCHG_LOCAL
> +#ifdef CONFIG_FAST_CPU_OPS
> void **freelist;
>
> - c = get_cpu_slab(s, get_cpu());
> + c = s->cpu_slab;
> debug_check_no_locks_freed(object, s->objsize);
> do {
> - freelist = c->freelist;
> + freelist = __CPU_READ(c->freelist);

Same here, c = s->cpu_slab; should probably be read after.

> barrier();
> /*
> * If the compiler would reorder the retrieval of c->page to
> @@ -1800,19 +1792,19 @@ static void __always_inline slab_free(st
> * then any change of cpu_slab will cause the cmpxchg to fail
> * since the freelist pointers are unique per slab.
> */
> - if (unlikely(page != c->page || c->node < 0)) {
> - __slab_free(s, page, x, addr, c->offset);
> + if (unlikely(page != __CPU_READ(c->page) ||
> + __CPU_READ(c->node) < 0)) {
> + __slab_free(s, page, x, addr, __CPU_READ(c->offset));

And same question as above : what happens if we fail after executing the
__slab_free.. is it valid to do it twice ?

> break;
> }
> - object[c->offset] = freelist;
> - } while (cmpxchg_local(&c->freelist, freelist, object) != freelist);
> - put_cpu();
> + object[__CPU_READ(c->offset)] = freelist;
> + } while (CPU_CMPXCHG(c->freelist, freelist, object) != freelist);
> #else
> unsigned long flags;
>
> local_irq_save(flags);
> debug_check_no_locks_freed(object, s->objsize);
> - c = get_cpu_slab(s, smp_processor_id());
> + c = THIS_CPU(s->cpu_slab);
> if (likely(page == c->page && c->node >= 0)) {
> object[c->offset] = c->freelist;
> c->freelist = object;
> @@ -2015,130 +2007,19 @@ static void init_kmem_cache_node(struct
> #endif
> }
>
> -#ifdef CONFIG_SMP
> -/*
> - * Per cpu array for per cpu structures.
> - *
> - * The per cpu array places all kmem_cache_cpu structures from one processor
> - * close together meaning that it becomes possible that multiple per cpu
> - * structures are contained in one cacheline. This may be particularly
> - * beneficial for the kmalloc caches.
> - *
> - * A desktop system typically has around 60-80 slabs. With 100 here we are
> - * likely able to get per cpu structures for all caches from the array defined
> - * here. We must be able to cover all kmalloc caches during bootstrap.
> - *
> - * If the per cpu array is exhausted then fall back to kmalloc
> - * of individual cachelines. No sharing is possible then.
> - */
> -#define NR_KMEM_CACHE_CPU 100
> -
> -static DEFINE_PER_CPU(struct kmem_cache_cpu,
> - kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
> -
> -static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
> -static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE;
> -
> -static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
> - int cpu, gfp_t flags)
> -{
> - struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
> -
> - if (c)
> - per_cpu(kmem_cache_cpu_free, cpu) =
> - (void *)c->freelist;
> - else {
> - /* Table overflow: So allocate ourselves */
> - c = kmalloc_node(
> - ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
> - flags, cpu_to_node(cpu));
> - if (!c)
> - return NULL;
> - }
> -
> - init_kmem_cache_cpu(s, c);
> - return c;
> -}
> -
> -static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
> -{
> - if (c < per_cpu(kmem_cache_cpu, cpu) ||
> - c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
> - kfree(c);
> - return;
> - }
> - c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
> - per_cpu(kmem_cache_cpu_free, cpu) = c;
> -}
> -
> -static void free_kmem_cache_cpus(struct kmem_cache *s)
> -{
> - int cpu;
> -
> - for_each_online_cpu(cpu) {
> - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
> -
> - if (c) {
> - s->cpu_slab[cpu] = NULL;
> - free_kmem_cache_cpu(c, cpu);
> - }
> - }
> -}
> -
> static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
> {
> int cpu;
>
> - for_each_online_cpu(cpu) {
> - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
> + s->cpu_slab = CPU_ALLOC(struct kmem_cache_cpu, flags);
>
> - if (c)
> - continue;
> -
> - c = alloc_kmem_cache_cpu(s, cpu, flags);
> - if (!c) {
> - free_kmem_cache_cpus(s);
> - return 0;
> - }
> - s->cpu_slab[cpu] = c;
> - }
> - return 1;
> -}
> -
> -/*
> - * Initialize the per cpu array.
> - */
> -static void init_alloc_cpu_cpu(int cpu)
> -{
> - int i;
> -
> - if (cpu_isset(cpu, kmem_cach_cpu_free_init_once))
> - return;
> -
> - for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
> - free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
> -
> - cpu_set(cpu, kmem_cach_cpu_free_init_once);
> -}
> -
> -static void __init init_alloc_cpu(void)
> -{
> - int cpu;
> + if (!s->cpu_slab)
> + return 0;
>
> for_each_online_cpu(cpu)
> - init_alloc_cpu_cpu(cpu);
> - }
> -
> -#else
> -static inline void free_kmem_cache_cpus(struct kmem_cache *s) {}
> -static inline void init_alloc_cpu(void) {}
> -
> -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
> -{
> - init_kmem_cache_cpu(s, &s->cpu_slab);
> + init_kmem_cache_cpu(s, CPU_PTR(s->cpu_slab, cpu));
> return 1;
> }
> -#endif
>
> #ifdef CONFIG_NUMA
> /*
> @@ -2452,9 +2333,8 @@ static inline int kmem_cache_close(struc
> int node;
>
> flush_all(s);
> -
> + CPU_FREE(s->cpu_slab);
> /* Attempt to free all objects */
> - free_kmem_cache_cpus(s);
> for_each_node_state(node, N_NORMAL_MEMORY) {
> struct kmem_cache_node *n = get_node(s, node);
>
> @@ -2958,8 +2838,6 @@ void __init kmem_cache_init(void)
> int i;
> int caches = 0;
>
> - init_alloc_cpu();
> -
> #ifdef CONFIG_NUMA
> /*
> * Must first have the slab cache available for the allocations of the
> @@ -3019,11 +2897,12 @@ void __init kmem_cache_init(void)
> for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++)
> kmalloc_caches[i]. name =
> kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
> -
> #ifdef CONFIG_SMP
> register_cpu_notifier(&slab_notifier);
> - kmem_size = offsetof(struct kmem_cache, cpu_slab) +
> - nr_cpu_ids * sizeof(struct kmem_cache_cpu *);
> +#endif
> +#ifdef CONFIG_NUMA
> + kmem_size = offsetof(struct kmem_cache, node) +
> + nr_node_ids * sizeof(struct kmem_cache_node *);
> #else
> kmem_size = sizeof(struct kmem_cache);
> #endif
> @@ -3120,7 +2999,7 @@ struct kmem_cache *kmem_cache_create(con
> * per cpu structures
> */
> for_each_online_cpu(cpu)
> - get_cpu_slab(s, cpu)->objsize = s->objsize;
> + CPU_PTR(s->cpu_slab, cpu)->objsize = s->objsize;
> s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
> up_write(&slub_lock);
> if (sysfs_slab_alias(s, name))
> @@ -3165,11 +3044,9 @@ static int __cpuinit slab_cpuup_callback
> switch (action) {
> case CPU_UP_PREPARE:
> case CPU_UP_PREPARE_FROZEN:
> - init_alloc_cpu_cpu(cpu);
> down_read(&slub_lock);
> list_for_each_entry(s, &slab_caches, list)
> - s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
> - GFP_KERNEL);
> + init_kmem_cache_cpu(s, __CPU_PTR(s->cpu_slab, cpu));
> up_read(&slub_lock);
> break;
>
> @@ -3179,13 +3056,9 @@ static int __cpuinit slab_cpuup_callback
> case CPU_DEAD_FROZEN:
> down_read(&slub_lock);
> list_for_each_entry(s, &slab_caches, list) {
> - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
> -
> local_irq_save(flags);
> __flush_cpu_slab(s, cpu);
> local_irq_restore(flags);
> - free_kmem_cache_cpu(c, cpu);
> - s->cpu_slab[cpu] = NULL;
> }
> up_read(&slub_lock);
> break;
> @@ -3657,7 +3530,7 @@ static unsigned long slab_objects(struct
> for_each_possible_cpu(cpu) {
> struct page *page;
> int node;
> - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
> + struct kmem_cache_cpu *c = CPU_PTR(s->cpu_slab, cpu);
>
> if (!c)
> continue;
> @@ -3724,7 +3597,7 @@ static int any_slab_objects(struct kmem_
> int cpu;
>
> for_each_possible_cpu(cpu) {
> - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
> + struct kmem_cache_cpu *c = CPU_PTR(s->cpu_slab, cpu);
>
> if (c && c->page)
> return 1;
> Index: linux-2.6/arch/x86/Kconfig
> ===================================================================
> --- linux-2.6.orig/arch/x86/Kconfig 2007-11-19 15:53:55.529390403 -0800
> +++ linux-2.6/arch/x86/Kconfig 2007-11-19 15:54:10.509139813 -0800
> @@ -112,10 +112,6 @@ config GENERIC_TIME_VSYSCALL
> bool
> default X86_64
>
> -config FAST_CMPXCHG_LOCAL
> - bool
> - default y
> -
> config ZONE_DMA32
> bool
> default X86_64
>
> --

--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/