[PATCH RFC 18/43] x86/percpu: Use PC-relative addressing for percpu variable references

From: Hou Wenlong
Date: Fri Apr 28 2023 - 05:54:31 EST


For PIE binary, all symbol references are PC-relative addressing, even
for percpu variable. So to keep compatible with PIE, add %rip suffix in
percpu assembly macros if PIE is enabled. However, relocation of percpu
variable references is broken now for PIE. It would be fixed later.

Suggested-by: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>
Signed-off-by: Hou Wenlong <houwenlong.hwl@xxxxxxxxxxxx>
Cc: Thomas Garnier <thgarnie@xxxxxxxxxxxx>
Cc: Kees Cook <keescook@xxxxxxxxxxxx>
---
arch/x86/entry/calling.h | 17 ++++++++++++----
arch/x86/include/asm/nospec-branch.h | 10 +++++-----
arch/x86/include/asm/percpu.h | 29 +++++++++++++++++++++++++---
arch/x86/kernel/head_64.S | 2 +-
arch/x86/kernel/kvm.c | 21 ++++++++++++++++----
arch/x86/lib/cmpxchg16b_emu.S | 8 ++++----
arch/x86/xen/xen-asm.S | 10 +++++-----
7 files changed, 71 insertions(+), 26 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index f6907627172b..11328578741d 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -173,7 +173,7 @@ For 32-bit we have the following conventions - kernel is built with
.endm

#define THIS_CPU_user_pcid_flush_mask \
- PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
+ PER_CPU_VAR(cpu_tlbstate + TLB_STATE_user_pcid_flush_mask)

.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
@@ -370,8 +370,8 @@ For 32-bit we have the following conventions - kernel is built with
.endm

.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req
+ GET_PERCPU_BASE \scratch_reg \save_reg
rdgsbase \save_reg
- GET_PERCPU_BASE \scratch_reg
wrgsbase \scratch_reg
.endm

@@ -407,15 +407,24 @@ For 32-bit we have the following conventions - kernel is built with
* Thus the kernel would consume a guest's TSC_AUX if an NMI arrives
* while running KVM's run loop.
*/
-.macro GET_PERCPU_BASE reg:req
+#ifdef CONFIG_X86_PIE
+.macro GET_PERCPU_BASE reg:req scratch_reg:req
+ LOAD_CPU_AND_NODE_SEG_LIMIT \reg
+ andq $VDSO_CPUNODE_MASK, \reg
+ leaq __per_cpu_offset(%rip), \scratch_reg
+ movq (\scratch_reg, \reg, 8), \reg
+.endm
+#else
+.macro GET_PERCPU_BASE reg:req scratch_reg:req
LOAD_CPU_AND_NODE_SEG_LIMIT \reg
andq $VDSO_CPUNODE_MASK, \reg
movq __per_cpu_offset(, \reg, 8), \reg
.endm
+#endif /* CONFIG_X86_PIE */

#else

-.macro GET_PERCPU_BASE reg:req
+.macro GET_PERCPU_BASE reg:req scratch_reg:req
movq pcpu_unit_offsets(%rip), \reg
.endm

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index edb2b0cb8efe..d8fd935e0697 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -59,13 +59,13 @@

#ifdef CONFIG_CALL_THUNKS_DEBUG
# define CALL_THUNKS_DEBUG_INC_CALLS \
- incq %gs:__x86_call_count;
+ incq %gs:(__x86_call_count)__percpu_rel;
# define CALL_THUNKS_DEBUG_INC_RETS \
- incq %gs:__x86_ret_count;
+ incq %gs:(__x86_ret_count)__percpu_rel;
# define CALL_THUNKS_DEBUG_INC_STUFFS \
- incq %gs:__x86_stuffs_count;
+ incq %gs:(__x86_stuffs_count)__percpu_rel;
# define CALL_THUNKS_DEBUG_INC_CTXSW \
- incq %gs:__x86_ctxsw_count;
+ incq %gs:(__x86_ctxsw_count)__percpu_rel;
#else
# define CALL_THUNKS_DEBUG_INC_CALLS
# define CALL_THUNKS_DEBUG_INC_RETS
@@ -95,7 +95,7 @@
CALL_THUNKS_DEBUG_INC_CALLS

#define INCREMENT_CALL_DEPTH \
- sarq $5, %gs:pcpu_hot + X86_call_depth; \
+ sarq $5, %gs:(pcpu_hot + X86_call_depth)__percpu_rel;\
CALL_THUNKS_DEBUG_INC_CALLS

#define ASM_INCREMENT_CALL_DEPTH \
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 13c0d63ed55e..a627a073c6ea 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -4,16 +4,26 @@

#ifdef CONFIG_X86_64
#define __percpu_seg gs
+#ifdef CONFIG_X86_PIE
+#define __percpu_rel (%rip)
+#else
+#define __percpu_rel
+#endif /* CONFIG_X86_PIE */
#else
#define __percpu_seg fs
+#define __percpu_rel
#endif

#ifdef __ASSEMBLY__

#ifdef CONFIG_SMP
-#define PER_CPU_VAR(var) %__percpu_seg:var
+/* Compatible with Position Independent Code */
+#define PER_CPU_VAR(var) %__percpu_seg:(var)##__percpu_rel
+/* Rare absolute reference */
+#define PER_CPU_VAR_ABS(var) %__percpu_seg:var
#else /* ! SMP */
-#define PER_CPU_VAR(var) var
+#define PER_CPU_VAR(var) (var)##__percpu_rel
+#define PER_CPU_VAR_ABS(var) var
#endif /* SMP */

#ifdef CONFIG_X86_64_SMP
@@ -148,10 +158,23 @@ do { \
(typeof(_var))(unsigned long) pfo_val__; \
})

+/*
+ * Position Independent code uses relative addresses only.
+ * The 'P' modifier prevents RIP-relative addressing in GCC,
+ * so use 'a' modifier instead. Howerver, 'P' modifier allows
+ * RIP-relative addressing in Clang but Clang doesn't support
+ * 'a' modifier.
+ */
+#if defined(CONFIG_X86_PIE) && defined(CONFIG_CC_IS_GCC)
+#define __percpu_stable_arg __percpu_arg(a[var])
+#else
+#define __percpu_stable_arg __percpu_arg(P[var])
+#endif
+
#define percpu_stable_op(size, op, _var) \
({ \
__pcpu_type_##size pfo_val__; \
- asm(__pcpu_op2_##size(op, __percpu_arg(P[var]), "%[val]") \
+ asm(__pcpu_op2_##size(op, __percpu_stable_arg, "%[val]") \
: [val] __pcpu_reg_##size("=", pfo_val__) \
: [var] "p" (&(_var))); \
(typeof(_var))(unsigned long) pfo_val__; \
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 61f1873d0ff7..1eed50b7d1ac 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -396,7 +396,7 @@ SYM_CODE_START(start_cpu0)
UNWIND_HINT_END_OF_STACK

/* Find the idle task stack */
- movq PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx
+ movq PER_CPU_VAR(pcpu_hot + X86_current_task), %rcx
movq TASK_threadsp(%rcx), %rsp

jmp .Ljump_to_C_code
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 1cceac5984da..32d7b201f4f0 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -794,14 +794,27 @@ PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);

extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);

+#ifndef CONFIG_X86_PIE
+#define KVM_CHECK_VCPU_PREEMPTED \
+ "movq __per_cpu_offset(,%rdi,8), %rax;" \
+ "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
+#else
+#define KVM_CHECK_VCPU_PREEMPTED \
+ "pushq %rdi;" \
+ "leaq __per_cpu_offset(%rip), %rax;" \
+ "movq (%rax,%rdi,8), %rax;" \
+ "leaq steal_time(%rip), %rdi;" \
+ "cmpb $0, (%rax, %rdi, 1);" \
+ "popq %rdi;"
+#endif
+
/*
* Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
* restoring to/from the stack.
*/
-#define PV_VCPU_PREEMPTED_ASM \
- "movq __per_cpu_offset(,%rdi,8), %rax\n\t" \
- "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \
- "setne %al\n\t"
+#define PV_VCPU_PREEMPTED_ASM \
+ KVM_CHECK_VCPU_PREEMPTED \
+ "setne %al\n\t"

DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted,
PV_VCPU_PREEMPTED_ASM, .text);
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
index 33c70c0160ea..891c5e9fd868 100644
--- a/arch/x86/lib/cmpxchg16b_emu.S
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -27,13 +27,13 @@ SYM_FUNC_START(this_cpu_cmpxchg16b_emu)
pushfq
cli

- cmpq PER_CPU_VAR((%rsi)), %rax
+ cmpq PER_CPU_VAR_ABS((%rsi)), %rax
jne .Lnot_same
- cmpq PER_CPU_VAR(8(%rsi)), %rdx
+ cmpq PER_CPU_VAR_ABS(8(%rsi)), %rdx
jne .Lnot_same

- movq %rbx, PER_CPU_VAR((%rsi))
- movq %rcx, PER_CPU_VAR(8(%rsi))
+ movq %rbx, PER_CPU_VAR_ABS((%rsi))
+ movq %rcx, PER_CPU_VAR_ABS(8(%rsi))

popfq
mov $1, %al
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 9e5e68008785..448958ddbaf8 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -28,7 +28,7 @@
* non-zero.
*/
SYM_FUNC_START(xen_irq_disable_direct)
- movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ movb $1, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask)
RET
SYM_FUNC_END(xen_irq_disable_direct)

@@ -69,7 +69,7 @@ SYM_FUNC_END(check_events)
SYM_FUNC_START(xen_irq_enable_direct)
FRAME_BEGIN
/* Unmask events */
- movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ movb $0, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask)

/*
* Preempt here doesn't matter because that will deal with any
@@ -78,7 +78,7 @@ SYM_FUNC_START(xen_irq_enable_direct)
*/

/* Test for pending */
- testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_pending)
jz 1f

call check_events
@@ -97,7 +97,7 @@ SYM_FUNC_END(xen_irq_enable_direct)
* x86 use opposite senses (mask vs enable).
*/
SYM_FUNC_START(xen_save_fl_direct)
- testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask)
setz %ah
addb %ah, %ah
RET
@@ -113,7 +113,7 @@ SYM_FUNC_END(xen_read_cr2);

SYM_FUNC_START(xen_read_cr2_direct)
FRAME_BEGIN
- _ASM_MOV PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %_ASM_AX
+ _ASM_MOV PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_arch_cr2), %_ASM_AX
FRAME_END
RET
SYM_FUNC_END(xen_read_cr2_direct);
--
2.31.1