[PATCH 4/4] x86/percpu: Use C for percpu read/write accessors

From: Uros Bizjak
Date: Wed Oct 04 2023 - 10:52:10 EST


The percpu code mostly uses inline assembly. Using segment qualifiers
allows to use C code instead, which enables the compiler to perform
various optimizations (e.g. propagation of memory arguments). Convert
percpu read and write accessors to C code, so the memory argument can
be propagated to the instruction that uses this argument.

Some examples of propagations:

a) into sign/zero extensions:

110b54: 65 0f b6 05 00 00 00 movzbl %gs:0x0(%rip),%eax
11ab90: 65 0f b6 15 00 00 00 movzbl %gs:0x0(%rip),%edx
14484a: 65 0f b7 35 00 00 00 movzwl %gs:0x0(%rip),%esi
1a08a9: 65 0f b6 43 78 movzbl %gs:0x78(%rbx),%eax
1a08f9: 65 0f b6 43 78 movzbl %gs:0x78(%rbx),%eax

4ab29a: 65 48 63 15 00 00 00 movslq %gs:0x0(%rip),%rdx
4be128: 65 4c 63 25 00 00 00 movslq %gs:0x0(%rip),%r12
547468: 65 48 63 1f movslq %gs:(%rdi),%rbx
5474e7: 65 48 63 0a movslq %gs:(%rdx),%rcx
54d05d: 65 48 63 0d 00 00 00 movslq %gs:0x0(%rip),%rcx

b) into compares:

b40804: 65 f7 05 00 00 00 00 testl $0xf0000,%gs:0x0(%rip)
b487e8: 65 f7 05 00 00 00 00 testl $0xf0000,%gs:0x0(%rip)
b6f14c: 65 f6 05 00 00 00 00 testb $0x1,%gs:0x0(%rip)
bac1b8: 65 f6 05 00 00 00 00 testb $0x1,%gs:0x0(%rip)
df2244: 65 f7 05 00 00 00 00 testl $0xff00,%gs:0x0(%rip)

9a7517: 65 80 3d 00 00 00 00 cmpb $0x0,%gs:0x0(%rip)
b282ba: 65 44 3b 35 00 00 00 cmp %gs:0x0(%rip),%r14d
b48f61: 65 66 83 3d 00 00 00 cmpw $0x8,%gs:0x0(%rip)
b493fe: 65 80 38 00 cmpb $0x0,%gs:(%rax)
b73867: 65 66 83 3d 00 00 00 cmpw $0x8,%gs:0x0(%rip)

c) into other insns:

65ec02: 65 0f 44 15 00 00 00 cmove %gs:0x0(%rip),%edx
6c98ac: 65 0f 44 15 00 00 00 cmove %gs:0x0(%rip),%edx
9aafaf: 65 0f 44 15 00 00 00 cmove %gs:0x0(%rip),%edx
b45868: 65 0f 48 35 00 00 00 cmovs %gs:0x0(%rip),%esi
d276f8: 65 0f 44 15 00 00 00 cmove %gs:0x0(%rip),%edx

The above propagations result in the following code size
improvements for current mainline kernel (with the default config),
compiled with

gcc (GCC) 12.3.1 20230508 (Red Hat 12.3.1-1)

text data bss dec hex filename
25508862 4386540 808388 30703790 1d480ae vmlinux-vanilla.o
25500922 4386532 808388 30695842 1d461a2 vmlinux-new.o

The conversion of other read-modify-write instructions does not bring us any
benefits, the compiler has some problems when constructing RMW instructions
from the generic code and easily misses some opportunities.

Cc: Andy Lutomirski <luto@xxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Nadav Amit <namit@xxxxxxxxxx>
Cc: Brian Gerst <brgerst@xxxxxxxxx>
Cc: Denys Vlasenko <dvlasenk@xxxxxxxxxx>
Cc: H. Peter Anvin <hpa@xxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxxxx>
Cc: Josh Poimboeuf <jpoimboe@xxxxxxxxxx>
Co-developed-by: Nadav Amit <namit@xxxxxxxxxx>
Signed-off-by: Nadav Amit <namit@xxxxxxxxxx>
Signed-off-by: Uros Bizjak <ubizjak@xxxxxxxxx>
---
arch/x86/include/asm/percpu.h | 65 +++++++++++++++++++++++++++++------
1 file changed, 54 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index da451202a1b9..60ea7755c0fe 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -400,13 +400,66 @@ do { \
#define this_cpu_read_stable_8(pcp) percpu_stable_op(8, "mov", pcp)
#define this_cpu_read_stable(pcp) __pcpu_size_call_return(this_cpu_read_stable_, pcp)

+#ifdef CONFIG_USE_X86_SEG_SUPPORT
+
+#define __raw_cpu_read(qual, pcp) \
+({ \
+ *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)); \
+})
+
+#define __raw_cpu_write(qual, pcp, val) \
+do { \
+ *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)) = (val); \
+} while (0)
+
+#define raw_cpu_read_1(pcp) __raw_cpu_read(, pcp)
+#define raw_cpu_read_2(pcp) __raw_cpu_read(, pcp)
+#define raw_cpu_read_4(pcp) __raw_cpu_read(, pcp)
+#define raw_cpu_write_1(pcp, val) __raw_cpu_write(, pcp, val)
+#define raw_cpu_write_2(pcp, val) __raw_cpu_write(, pcp, val)
+#define raw_cpu_write_4(pcp, val) __raw_cpu_write(, pcp, val)
+
+#define this_cpu_read_1(pcp) __raw_cpu_read(volatile, pcp)
+#define this_cpu_read_2(pcp) __raw_cpu_read(volatile, pcp)
+#define this_cpu_read_4(pcp) __raw_cpu_read(volatile, pcp)
+#define this_cpu_write_1(pcp, val) __raw_cpu_write(volatile, pcp, val)
+#define this_cpu_write_2(pcp, val) __raw_cpu_write(volatile, pcp, val)
+#define this_cpu_write_4(pcp, val) __raw_cpu_write(volatile, pcp, val)
+
+#ifdef CONFIG_X86_64
+#define raw_cpu_read_8(pcp) __raw_cpu_read(, pcp)
+#define raw_cpu_write_8(pcp, val) __raw_cpu_write(, pcp, val)
+
+#define this_cpu_read_8(pcp) __raw_cpu_read(volatile, pcp)
+#define this_cpu_write_8(pcp, val) __raw_cpu_write(volatile, pcp, val)
+#endif
+
+#else /* CONFIG_USE_X86_SEG_SUPPORT */
+
#define raw_cpu_read_1(pcp) percpu_from_op(1, , "mov", pcp)
#define raw_cpu_read_2(pcp) percpu_from_op(2, , "mov", pcp)
#define raw_cpu_read_4(pcp) percpu_from_op(4, , "mov", pcp)
-
#define raw_cpu_write_1(pcp, val) percpu_to_op(1, , "mov", (pcp), val)
#define raw_cpu_write_2(pcp, val) percpu_to_op(2, , "mov", (pcp), val)
#define raw_cpu_write_4(pcp, val) percpu_to_op(4, , "mov", (pcp), val)
+
+#define this_cpu_read_1(pcp) percpu_from_op(1, volatile, "mov", pcp)
+#define this_cpu_read_2(pcp) percpu_from_op(2, volatile, "mov", pcp)
+#define this_cpu_read_4(pcp) percpu_from_op(4, volatile, "mov", pcp)
+#define this_cpu_write_1(pcp, val) percpu_to_op(1, volatile, "mov", (pcp), val)
+#define this_cpu_write_2(pcp, val) percpu_to_op(2, volatile, "mov", (pcp), val)
+#define this_cpu_write_4(pcp, val) percpu_to_op(4, volatile, "mov", (pcp), val)
+
+#ifdef CONFIG_X86_64
+#define raw_cpu_read_8(pcp) percpu_from_op(8, , "mov", pcp)
+#define raw_cpu_write_8(pcp, val) percpu_to_op(8, , "mov", (pcp), val)
+
+#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp)
+#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val)
+#endif
+
+#endif /* CONFIG_USE_X86_SEG_SUPPORT */
+
#define raw_cpu_add_1(pcp, val) percpu_add_op(1, , (pcp), val)
#define raw_cpu_add_2(pcp, val) percpu_add_op(2, , (pcp), val)
#define raw_cpu_add_4(pcp, val) percpu_add_op(4, , (pcp), val)
@@ -432,12 +485,6 @@ do { \
#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val)
#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val)

-#define this_cpu_read_1(pcp) percpu_from_op(1, volatile, "mov", pcp)
-#define this_cpu_read_2(pcp) percpu_from_op(2, volatile, "mov", pcp)
-#define this_cpu_read_4(pcp) percpu_from_op(4, volatile, "mov", pcp)
-#define this_cpu_write_1(pcp, val) percpu_to_op(1, volatile, "mov", (pcp), val)
-#define this_cpu_write_2(pcp, val) percpu_to_op(2, volatile, "mov", (pcp), val)
-#define this_cpu_write_4(pcp, val) percpu_to_op(4, volatile, "mov", (pcp), val)
#define this_cpu_add_1(pcp, val) percpu_add_op(1, volatile, (pcp), val)
#define this_cpu_add_2(pcp, val) percpu_add_op(2, volatile, (pcp), val)
#define this_cpu_add_4(pcp, val) percpu_add_op(4, volatile, (pcp), val)
@@ -476,8 +523,6 @@ do { \
* 32 bit must fall back to generic operations.
*/
#ifdef CONFIG_X86_64
-#define raw_cpu_read_8(pcp) percpu_from_op(8, , "mov", pcp)
-#define raw_cpu_write_8(pcp, val) percpu_to_op(8, , "mov", (pcp), val)
#define raw_cpu_add_8(pcp, val) percpu_add_op(8, , (pcp), val)
#define raw_cpu_and_8(pcp, val) percpu_to_op(8, , "and", (pcp), val)
#define raw_cpu_or_8(pcp, val) percpu_to_op(8, , "or", (pcp), val)
@@ -486,8 +531,6 @@ do { \
#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval)
#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)

-#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp)
-#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val)
#define this_cpu_add_8(pcp, val) percpu_add_op(8, volatile, (pcp), val)
#define this_cpu_and_8(pcp, val) percpu_to_op(8, volatile, "and", (pcp), val)
#define this_cpu_or_8(pcp, val) percpu_to_op(8, volatile, "or", (pcp), val)
--
2.41.0