Re: [RFC v2 00/34] SLUB: reduce irq disabled scope and make it RT compatible

From: Mike Galbraith
Date: Sat Jul 03 2021 - 11:48:07 EST


On Sat, 2021-07-03 at 09:24 +0200, Mike Galbraith wrote:
>
> It also appears to be saying that there's something RT specific to
> stare at in addition to the list_lock business.

The "what" is ___slab_alloc() consuming 3.9% CPU in tip-rt-slub, whereas
it consumes < 1% in both tip-rt (sans slub patches) and tip-slub.

The "why" remains to be pondered.

5.13.0.g60ab3ed-tip-rt 5.13.0.g60ab3ed-tip-rt-slub 5.13.0.g60ab3ed-tip-slub
25.18% copy_user_enhanced_fast_string copy_user_enhanced_fast_string copy_user_enhanced_fast_string
5.08% unix_stream_read_generic unix_stream_read_generic unix_stream_read_generic
3.39% rt_spin_lock *** ___slab_alloc *** __skb_datagram_iter
2.80% __skb_datagram_iter rt_spin_lock _raw_spin_lock
2.11% get_page_from_freelist __skb_datagram_iter __alloc_skb
2.01% skb_release_data rt_spin_unlock skb_release_data
1.94% rt_spin_unlock get_page_from_freelist __alloc_pages
1.85% __alloc_skb migrate_enable unix_stream_sendmsg
1.68% __schedule skb_release_data _raw_spin_lock_irqsave
1.67% unix_stream_sendmsg __schedule free_pcppages_bulk
1.50% free_pcppages_bulk unix_stream_sendmsg __slab_free
1.38% migrate_enable free_pcppages_bulk __fget_light
1.24% __fget_light __alloc_pages vfs_write
1.16% __slab_free migrate_disable __schedule
1.14% __alloc_pages __fget_light get_page_from_freelist
1.10% fsnotify __slab_free new_sync_write
1.07% kfree fsnotify fsnotify

5.13.0.g60ab3ed-tip-rt-slub ___slab_alloc() consumes 3.90%
0.40 │ mov 0x28(%r13),%edx
0.42 │ add %r15,%rdx
│ __swab():
│ #endif

│ static __always_inline unsigned long __swab(const unsigned long y)
│ {
│ #if __BITS_PER_LONG == 64
│ return __swab64(y);
0.05 │ mov %rdx,%rax
1.14 │ bswap %rax
│ freelist_ptr():
│ return (void *)((unsigned long)ptr ^ s->random ^ <== CONFIG_SLAB_FREELIST_HARDENED
0.72 │ xor 0xb0(%r13),%rax
65.41 │ xor (%rdx),%rax <== huh? miss = 65% of that 3.9% kernel util?
│ next_tid():
│ return tid + TID_STEP;
0.09 │ addq $0x200,0x48(%r12)
│ ___slab_alloc():
│ * freelist is pointing to the list of objects to be used.
│ * page is pointing to the page from which the objects are obtained.
│ * That page must be frozen for per cpu allocations to work.
│ */
│ VM_BUG_ON(!c->page->frozen);
│ c->freelist = get_freepointer(s, freelist);
0.05 │ mov %rax,0x40(%r12)
│ c->tid = next_tid(c->tid);
│ local_unlock_irqrestore(&s->cpu_slab->lock, flags);

5.13.0.g60ab3ed-tip-rt ___slab_alloc() consumes < 1%
Percent│ }

│ /* must check again c->freelist in case of cpu migration or IRQ */
│ freelist = c->freelist;
0.02 │ a1: mov (%r14),%r13
│ if (freelist)
│ test %r13,%r13
0.02 │ ↓ je 460
│ get_freepointer():
│ return freelist_dereference(s, object + s->offset);
0.23 │ ad: mov 0x28(%r12),%edx
0.18 │ add %r13,%rdx
│ __swab():
│ #endif

│ static __always_inline unsigned long __swab(const unsigned long y)
│ {
│ #if __BITS_PER_LONG == 64
│ return __swab64(y);
0.06 │ mov %rdx,%rax
1.16 │ bswap %rax
│ freelist_ptr():
│ return (void *)((unsigned long)ptr ^ s->random ^
0.23 │ xor 0xb0(%r12),%rax
35.25 │ xor (%rdx),%rax <== 35% of < 1% kernel util
│ next_tid():
│ return tid + TID_STEP;
0.28 │ addq $0x200,0x8(%r14)
│ ___slab_alloc():
│ * freelist is pointing to the list of objects to be used.
│ * page is pointing to the page from which the objects are obtained.
│ * That page must be frozen for per cpu allocations to work.
│ */
│ VM_BUG_ON(!c->page->frozen);
│ c->freelist = get_freepointer(s, freelist);

5.13.0.g60ab3ed-tip-slub ___slab_alloc() also consumes < 1%
Percent│ load_freelist:

│ lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
0.28 │ 84: add this_cpu_off,%rax
│ get_freepointer():
│ return freelist_dereference(s, object + s->offset);
0.14 │ mov 0x28(%r14),%eax
│ ___slab_alloc():
│ * freelist is pointing to the list of objects to be used.
│ * page is pointing to the page from which the objects are obtained.
│ * That page must be frozen for per cpu allocations to work.
│ */
│ VM_BUG_ON(!c->page->frozen);
│ c->freelist = get_freepointer(s, freelist);
34.36 │ mov 0x0(%r13,%rax,1),%rax
│ next_tid():
│ return tid + TID_STEP;
0.10 │ addq $0x1,0x8(%r12)
│ ___slab_alloc():
│ c->freelist = get_freepointer(s, freelist);
0.04 │ mov %rax,(%r12)
│ c->tid = next_tid(c->tid);
│ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
0.12 │ mov (%r14),%rax
0.03 │ add this_cpu_off,%rax
│ arch_local_irq_restore():
│ return arch_irqs_disabled_flags(flags);
│ }