[PATCH v2 20/21] mm/slub: optimize alloc fastpath code layout

From: Vlastimil Babka
Date: Mon Nov 20 2023 - 13:36:04 EST


With allocation fastpaths no longer divided between two .c files, we
have better inlining. However, checking the disassembly of
kmem_cache_alloc() reveals we can do better to make the fastpaths
smaller and move the less common situations out of line or to separate
functions, to reduce instruction cache pressure.

- split memcg pre/post alloc hooks to inlined checks that use likely()
to assume there will be no objcg handling necessary, and non-inline
functions doing the actual handling

- add some more likely/unlikely() to pre/post alloc hooks to indicate
which scenarios should be out of line

- change gfp_allowed_mask handling in slab_post_alloc_hook() so the
code can be optimized away when kasan/kmsan/kmemleak is configured out

bloat-o-meter shows:
add/remove: 4/2 grow/shrink: 1/8 up/down: 521/-2924 (-2403)
Function old new delta
__memcg_slab_post_alloc_hook - 461 +461
kmem_cache_alloc_bulk 775 791 +16
__pfx_should_failslab.constprop - 16 +16
__pfx___memcg_slab_post_alloc_hook - 16 +16
should_failslab.constprop - 12 +12
__pfx_memcg_slab_post_alloc_hook 16 - -16
kmem_cache_alloc_lru 1295 1023 -272
kmem_cache_alloc_node 1118 817 -301
kmem_cache_alloc 1076 772 -304
kmalloc_node_trace 1149 838 -311
kmalloc_trace 1102 789 -313
__kmalloc_node_track_caller 1393 1080 -313
__kmalloc_node 1397 1082 -315
__kmalloc 1374 1059 -315
memcg_slab_post_alloc_hook 464 - -464

Note that gcc still decided to inline __memcg_slab_pre_alloc_hook(), but
the code is out of line. Forcing noinline did not improve the results. As
a result the fastpaths are shorter and overall code size is reduced.

Signed-off-by: Vlastimil Babka <vbabka@xxxxxxx>
---
mm/slub.c | 89 ++++++++++++++++++++++++++++++++++++++-------------------------
1 file changed, 54 insertions(+), 35 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 5683f1d02e4f..77d259f3d592 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1866,25 +1866,17 @@ static inline size_t obj_full_size(struct kmem_cache *s)
/*
* Returns false if the allocation should fail.
*/
-static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
- struct list_lru *lru,
- struct obj_cgroup **objcgp,
- size_t objects, gfp_t flags)
+static bool __memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+ struct list_lru *lru,
+ struct obj_cgroup **objcgp,
+ size_t objects, gfp_t flags)
{
- struct obj_cgroup *objcg;
-
- if (!memcg_kmem_online())
- return true;
-
- if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))
- return true;
-
/*
* The obtained objcg pointer is safe to use within the current scope,
* defined by current task or set_active_memcg() pair.
* obj_cgroup_get() is used to get a permanent reference.
*/
- objcg = current_obj_cgroup();
+ struct obj_cgroup *objcg = current_obj_cgroup();
if (!objcg)
return true;

@@ -1907,17 +1899,34 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
return true;
}

-static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
- struct obj_cgroup *objcg,
- gfp_t flags, size_t size,
- void **p)
+/*
+ * Returns false if the allocation should fail.
+ */
+static __fastpath_inline
+bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
+ struct obj_cgroup **objcgp, size_t objects,
+ gfp_t flags)
+{
+ if (!memcg_kmem_online())
+ return true;
+
+ if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)))
+ return true;
+
+ return likely(__memcg_slab_pre_alloc_hook(s, lru, objcgp, objects,
+ flags));
+}
+
+static void __memcg_slab_post_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup *objcg,
+ gfp_t flags, size_t size,
+ void **p)
{
struct slab *slab;
unsigned long off;
size_t i;

- if (!memcg_kmem_online() || !objcg)
- return;
+ flags &= gfp_allowed_mask;

for (i = 0; i < size; i++) {
if (likely(p[i])) {
@@ -1940,6 +1949,16 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
}
}

+static __fastpath_inline
+void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg,
+ gfp_t flags, size_t size, void **p)
+{
+ if (likely(!memcg_kmem_online() || !objcg))
+ return;
+
+ return __memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
+}
+
static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects)
{
@@ -3709,34 +3728,34 @@ noinline int should_failslab(struct kmem_cache *s, gfp_t gfpflags)
}
ALLOW_ERROR_INJECTION(should_failslab, ERRNO);

-static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
- struct list_lru *lru,
- struct obj_cgroup **objcgp,
- size_t size, gfp_t flags)
+static __fastpath_inline
+struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
+ struct list_lru *lru,
+ struct obj_cgroup **objcgp,
+ size_t size, gfp_t flags)
{
flags &= gfp_allowed_mask;

might_alloc(flags);

- if (should_failslab(s, flags))
+ if (unlikely(should_failslab(s, flags)))
return NULL;

- if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags))
+ if (unlikely(!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags)))
return NULL;

return s;
}

-static inline void slab_post_alloc_hook(struct kmem_cache *s,
- struct obj_cgroup *objcg, gfp_t flags,
- size_t size, void **p, bool init,
- unsigned int orig_size)
+static __fastpath_inline
+void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg,
+ gfp_t flags, size_t size, void **p, bool init,
+ unsigned int orig_size)
{
unsigned int zero_size = s->object_size;
bool kasan_init = init;
size_t i;
-
- flags &= gfp_allowed_mask;
+ gfp_t init_flags = flags & gfp_allowed_mask;

/*
* For kmalloc object, the allocated memory size(object_size) is likely
@@ -3769,13 +3788,13 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
* As p[i] might get tagged, memset and kmemleak hook come after KASAN.
*/
for (i = 0; i < size; i++) {
- p[i] = kasan_slab_alloc(s, p[i], flags, kasan_init);
+ p[i] = kasan_slab_alloc(s, p[i], init_flags, kasan_init);
if (p[i] && init && (!kasan_init ||
!kasan_has_integrated_init()))
memset(p[i], 0, zero_size);
kmemleak_alloc_recursive(p[i], s->object_size, 1,
- s->flags, flags);
- kmsan_slab_alloc(s, p[i], flags);
+ s->flags, init_flags);
+ kmsan_slab_alloc(s, p[i], init_flags);
}

memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
@@ -3799,7 +3818,7 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
bool init = false;

s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags);
- if (!s)
+ if (unlikely(!s))
return NULL;

object = kfence_alloc(s, orig_size, gfpflags);

--
2.42.1