[PATCH v2 05/11] x86/cpu: Clean up SRSO return thunk mess

From: Peter Zijlstra
Date: Mon Aug 14 2023 - 08:20:33 EST


Use the existing configurable return thunk. There is absolute no
justification for having created this __x86_return_thunk alternative.

To clarify, the whole thing looks like:

Zen3/4 does:

srso_alias_untrain_ret:
nop2
lfence
jmp srso_alias_return_thunk
int3

srso_alias_safe_ret: // aliasses srso_alias_untrain_ret just so
add $8, %rsp
ret
int3

srso_alias_return_thunk:
call srso_alias_safe_ret
ud2

While Zen1/2 does:

srso_untrain_ret:
movabs $foo, %rax
lfence
call srso_safe_ret (jmp srso_return_thunk ?)
int3

srso_safe_ret: // embedded in movabs instruction
add $8,%rsp
ret
int3

srso_return_thunk:
call srso_safe_ret
ud2

While retbleed does:

zen_untrain_ret:
test $0xcc, %bl
lfence
jmp zen_return_thunk
int3

zen_return_thunk: // embedded in the test instruction
ret
int3

Where Zen1/2 flush the BTB entry using the instruction decoder trick
(test,movabs) Zen3/4 use instruction aliasing. SRSO adds RSB (RAP in
AMD speak) stuffing to force speculation into a trap an cause a
mis-predict.

Pick one of three options at boot (evey function can only ever return
once).

Fixes: fb3bd914b3ec ("x86/srso: Add a Speculative RAS Overflow mitigation")
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
arch/x86/include/asm/nospec-branch.h | 6 ++++
arch/x86/kernel/cpu/bugs.c | 8 ++++--
arch/x86/kernel/vmlinux.lds.S | 2 -
arch/x86/lib/retpoline.S | 45 ++++++++++++++++++++++-------------
tools/objtool/arch/x86/decode.c | 2 -
5 files changed, 43 insertions(+), 20 deletions(-)

--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -342,9 +342,15 @@ extern retpoline_thunk_t __x86_indirect_
extern retpoline_thunk_t __x86_indirect_jump_thunk_array[];

extern void __x86_return_thunk(void);
+
+extern void zen_return_thunk(void);
+extern void srso_return_thunk(void);
+extern void srso_alias_return_thunk(void);
+
extern void zen_untrain_ret(void);
extern void srso_untrain_ret(void);
extern void srso_untrain_ret_alias(void);
+
extern void entry_ibpb(void);

extern void (*x86_return_thunk)(void);
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1034,6 +1034,7 @@ static void __init retbleed_select_mitig
case RETBLEED_MITIGATION_UNRET:
setup_force_cpu_cap(X86_FEATURE_RETHUNK);
setup_force_cpu_cap(X86_FEATURE_UNRET);
+ x86_return_thunk = zen_return_thunk;

if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
@@ -2451,10 +2452,13 @@ static void __init srso_select_mitigatio
*/
setup_force_cpu_cap(X86_FEATURE_RETHUNK);

- if (boot_cpu_data.x86 == 0x19)
+ if (boot_cpu_data.x86 == 0x19) {
setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
- else
+ x86_return_thunk = srso_alias_return_thunk;
+ } else {
setup_force_cpu_cap(X86_FEATURE_SRSO);
+ x86_return_thunk = srso_return_thunk;
+ }
srso_mitigation = SRSO_MITIGATION_SAFE_RET;
} else {
pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -521,7 +521,7 @@ INIT_PER_CPU(irq_stack_backing_store);
#endif

#ifdef CONFIG_RETHUNK
-. = ASSERT((__ret & 0x3f) == 0, "__ret not cacheline-aligned");
+. = ASSERT((zen_return_thunk & 0x3f) == 0, "zen_return_thunk not cacheline-aligned");
. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
#endif

--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -151,22 +151,20 @@ SYM_CODE_END(__x86_indirect_jump_thunk_a
.section .text..__x86.rethunk_untrain

SYM_START(srso_untrain_ret_alias, SYM_L_GLOBAL, SYM_A_NONE)
+ UNWIND_HINT_FUNC
ANNOTATE_NOENDBR
ASM_NOP2
lfence
- jmp __x86_return_thunk
+ jmp srso_alias_return_thunk
SYM_FUNC_END(srso_untrain_ret_alias)
__EXPORT_THUNK(srso_untrain_ret_alias)

.section .text..__x86.rethunk_safe
#endif

-/* Needs a definition for the __x86_return_thunk alternative below. */
SYM_START(srso_safe_ret_alias, SYM_L_GLOBAL, SYM_A_NONE)
-#ifdef CONFIG_CPU_SRSO
lea 8(%_ASM_SP), %_ASM_SP
UNWIND_HINT_FUNC
-#endif
ANNOTATE_UNRET_SAFE
ret
int3
@@ -174,9 +172,16 @@ SYM_FUNC_END(srso_safe_ret_alias)

.section .text..__x86.return_thunk

+SYM_CODE_START(srso_alias_return_thunk)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ call srso_safe_ret_alias
+ ud2
+SYM_CODE_END(srso_alias_return_thunk)
+
/*
* Safety details here pertain to the AMD Zen{1,2} microarchitecture:
- * 1) The RET at __x86_return_thunk must be on a 64 byte boundary, for
+ * 1) The RET at zen_return_thunk must be on a 64 byte boundary, for
* alignment within the BTB.
* 2) The instruction at zen_untrain_ret must contain, and not
* end with, the 0xc3 byte of the RET.
@@ -184,7 +189,7 @@ SYM_FUNC_END(srso_safe_ret_alias)
* from re-poisioning the BTB prediction.
*/
.align 64
- .skip 64 - (__ret - zen_untrain_ret), 0xcc
+ .skip 64 - (zen_return_thunk - zen_untrain_ret), 0xcc
SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
ANNOTATE_NOENDBR
/*
@@ -192,16 +197,16 @@ SYM_START(zen_untrain_ret, SYM_L_GLOBAL,
*
* TEST $0xcc, %bl
* LFENCE
- * JMP __x86_return_thunk
+ * JMP zen_return_thunk
*
* Executing the TEST instruction has a side effect of evicting any BTB
* prediction (potentially attacker controlled) attached to the RET, as
- * __x86_return_thunk + 1 isn't an instruction boundary at the moment.
+ * zen_return_thunk + 1 isn't an instruction boundary at the moment.
*/
.byte 0xf6

/*
- * As executed from __x86_return_thunk, this is a plain RET.
+ * As executed from zen_return_thunk, this is a plain RET.
*
* As part of the TEST above, RET is the ModRM byte, and INT3 the imm8.
*
@@ -213,13 +218,13 @@ SYM_START(zen_untrain_ret, SYM_L_GLOBAL,
* With SMT enabled and STIBP active, a sibling thread cannot poison
* RET's prediction to a type of its choice, but can evict the
* prediction due to competitive sharing. If the prediction is
- * evicted, __x86_return_thunk will suffer Straight Line Speculation
+ * evicted, zen_return_thunk will suffer Straight Line Speculation
* which will be contained safely by the INT3.
*/
-SYM_INNER_LABEL(__ret, SYM_L_GLOBAL)
+SYM_INNER_LABEL(zen_return_thunk, SYM_L_GLOBAL)
ret
int3
-SYM_CODE_END(__ret)
+SYM_CODE_END(zen_return_thunk)

/*
* Ensure the TEST decoding / BTB invalidation is complete.
@@ -230,7 +235,7 @@ SYM_CODE_END(__ret)
* Jump back and execute the RET in the middle of the TEST instruction.
* INT3 is for SLS protection.
*/
- jmp __ret
+ jmp zen_return_thunk
int3
SYM_FUNC_END(zen_untrain_ret)
__EXPORT_THUNK(zen_untrain_ret)
@@ -256,6 +261,7 @@ SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLO
ret
int3
int3
+ /* end of movabs */
lfence
call srso_safe_ret
ud2
@@ -263,12 +269,19 @@ SYM_CODE_END(srso_safe_ret)
SYM_FUNC_END(srso_untrain_ret)
__EXPORT_THUNK(srso_untrain_ret)

-SYM_CODE_START(__x86_return_thunk)
+SYM_CODE_START(srso_return_thunk)
UNWIND_HINT_FUNC
ANNOTATE_NOENDBR
- ALTERNATIVE_2 "jmp __ret", "call srso_safe_ret", X86_FEATURE_SRSO, \
- "call srso_safe_ret_alias", X86_FEATURE_SRSO_ALIAS
+ call srso_safe_ret
ud2
+SYM_CODE_END(srso_return_thunk)
+
+SYM_CODE_START(__x86_return_thunk)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
SYM_CODE_END(__x86_return_thunk)
EXPORT_SYMBOL(__x86_return_thunk)

--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -829,6 +829,6 @@ bool arch_is_rethunk(struct symbol *sym)

bool arch_is_embedded_insn(struct symbol *sym)
{
- return !strcmp(sym->name, "__ret") ||
+ return !strcmp(sym->name, "zen_return_thunk") ||
!strcmp(sym->name, "srso_safe_ret");
}