[RFC][PATCH 02/17] x86/cpu: Clean up SRSO return thunk mess

From: Peter Zijlstra
Date: Wed Aug 09 2023 - 03:27:34 EST


Use the existing configurable return thunk. There is absolute no
justification for having created this __x86_return_thunk alternative.

To clarify, the whole thing looks like:

Zen3/4 does:

srso_alias_untrain_ret:
nop2
lfence
jmp srso_alias_return_thunk
int3

srso_alias_safe_ret: // aliasses srso_alias_untrain_ret just so
add $8, %rsp
ret
int3

srso_alias_return_thunk:
call srso_alias_safe_ret
ud2

While Zen1/2 does:

srso_untrain_ret:
movabs $foo, %rax
lfence
call srso_safe_ret (jmp srso_return_thunk ?)
int3

srso_safe_ret: // embedded in movabs immediate
add $8,%rsp
ret
int3

srso_return_thunk:
call srso_safe_ret
ud2

While retbleed does:

zen_untrain_ret:
test $0xcc, %bl
lfence
jmp zen_return_thunk
int3

zen_return_thunk: // embedded in the test instruction
ret
int3

Where Zen1/2 flush the BTB using the instruction decoder trick
(test,movabs) Zen3/4 use instruction aliasing. SRSO adds RSB (RAP in
AMD speak) stuffing to force a return mis-predict.

That is; the AMD retbleed is a form of Speculative-Type-Confusion
where the branch predictor is trained to use the BTB to predict the
RET address, while AMD inception/SRSO is a form of
Speculative-Type-Confusion where another instruction is trained to be
treated like a CALL instruction and poison the RSB (RAP).

Pick one of three options at boot.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
arch/x86/include/asm/nospec-branch.h | 4 +++
arch/x86/kernel/cpu/bugs.c | 7 ++++--
arch/x86/kernel/vmlinux.lds.S | 2 -
arch/x86/lib/retpoline.S | 37 ++++++++++++++++++++++++-----------
4 files changed, 36 insertions(+), 14 deletions(-)

--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -342,9 +342,13 @@ extern retpoline_thunk_t __x86_indirect_
extern retpoline_thunk_t __x86_indirect_jump_thunk_array[];

extern void __x86_return_thunk(void);
+extern void srso_return_thunk(void);
+extern void srso_alias_return_thunk(void);
+
extern void zen_untrain_ret(void);
extern void srso_untrain_ret(void);
extern void srso_untrain_ret_alias(void);
+
extern void entry_ibpb(void);

extern void (*x86_return_thunk)(void);
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -2305,10 +2305,13 @@ static void __init srso_select_mitigatio
*/
setup_force_cpu_cap(X86_FEATURE_RETHUNK);

- if (boot_cpu_data.x86 == 0x19)
+ if (boot_cpu_data.x86 == 0x19) {
setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
- else
+ x86_return_thunk = srso_alias_return_thunk;
+ } else {
setup_force_cpu_cap(X86_FEATURE_SRSO);
+ x86_return_thunk = srso_return_thunk;
+ }
srso_mitigation = SRSO_MITIGATION_SAFE_RET;
} else {
pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -523,7 +523,7 @@ INIT_PER_CPU(irq_stack_backing_store);
#endif

#ifdef CONFIG_RETHUNK
-. = ASSERT((__ret & 0x3f) == 0, "__ret not cacheline-aligned");
+. = ASSERT((__x86_return_thunk & 0x3f) == 0, "__x86_return_thunk not cacheline-aligned");
. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
#endif

--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -151,10 +151,11 @@ SYM_CODE_END(__x86_indirect_jump_thunk_a
.section .text.__x86.rethunk_untrain

SYM_START(srso_untrain_ret_alias, SYM_L_GLOBAL, SYM_A_NONE)
+ UNWIND_HINT_FUNC
ANNOTATE_NOENDBR
ASM_NOP2
lfence
- jmp __x86_return_thunk
+ jmp srso_alias_return_thunk
SYM_FUNC_END(srso_untrain_ret_alias)
__EXPORT_THUNK(srso_untrain_ret_alias)

@@ -184,7 +185,7 @@ SYM_FUNC_END(srso_safe_ret_alias)
* from re-poisioning the BTB prediction.
*/
.align 64
- .skip 64 - (__ret - zen_untrain_ret), 0xcc
+ .skip 64 - (__x86_return_thunk - zen_untrain_ret), 0xcc
SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
ANNOTATE_NOENDBR
/*
@@ -216,10 +217,10 @@ SYM_START(zen_untrain_ret, SYM_L_GLOBAL,
* evicted, __x86_return_thunk will suffer Straight Line Speculation
* which will be contained safely by the INT3.
*/
-SYM_INNER_LABEL(__ret, SYM_L_GLOBAL)
+SYM_INNER_LABEL(__x86_return_thunk, SYM_L_GLOBAL)
ret
int3
-SYM_CODE_END(__ret)
+SYM_CODE_END(__x86_return_thunk)

/*
* Ensure the TEST decoding / BTB invalidation is complete.
@@ -230,11 +231,13 @@ SYM_CODE_END(__ret)
* Jump back and execute the RET in the middle of the TEST instruction.
* INT3 is for SLS protection.
*/
- jmp __ret
+ jmp __x86_return_thunk
int3
SYM_FUNC_END(zen_untrain_ret)
__EXPORT_THUNK(zen_untrain_ret)

+EXPORT_SYMBOL(__x86_return_thunk)
+
/*
* SRSO untraining sequence for Zen1/2, similar to zen_untrain_ret()
* above. On kernel entry, srso_untrain_ret() is executed which is a
@@ -257,6 +260,7 @@ SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLO
int3
int3
int3
+ /* end of movabs */
lfence
call srso_safe_ret
int3
@@ -264,12 +268,23 @@ SYM_CODE_END(srso_safe_ret)
SYM_FUNC_END(srso_untrain_ret)
__EXPORT_THUNK(srso_untrain_ret)

-SYM_FUNC_START(__x86_return_thunk)
- ALTERNATIVE_2 "jmp __ret", "call srso_safe_ret", X86_FEATURE_SRSO, \
- "call srso_safe_ret_alias", X86_FEATURE_SRSO_ALIAS
- int3
-SYM_CODE_END(__x86_return_thunk)
-EXPORT_SYMBOL(__x86_return_thunk)
+/*
+ * Both these do an unbalanced CALL to mess up the RSB, terminate with UD2
+ * to indicate noreturn.
+ */
+SYM_CODE_START(srso_return_thunk)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ call srso_safe_ret
+ ud2
+SYM_CODE_END(srso_return_thunk)
+
+SYM_CODE_START(srso_alias_return_thunk)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ call srso_safe_ret_alias
+ ud2
+SYM_CODE_END(srso_alias_return_thunk)

#endif /* CONFIG_RETHUNK */