[RFC][PATCH 24/24] x86/pti: Defer CR3 switch to C code for non-IST and syscall entries

From: Alexandre Chartre
Date: Mon Nov 09 2020 - 06:23:34 EST


With PTI, syscall/interrupt/exception entries switch the CR3 register
to change the page-table in assembly code. Move the CR3 register switch
inside the C code of syscall/interrupt/exception entry handlers.

Signed-off-by: Alexandre Chartre <alexandre.chartre@xxxxxxxxxx>
---
arch/x86/entry/common.c | 15 ++++++++++++---
arch/x86/entry/entry_64.S | 23 +++++------------------
arch/x86/entry/entry_64_compat.S | 22 ----------------------
arch/x86/include/asm/entry-common.h | 14 ++++++++++++++
arch/x86/include/asm/idtentry.h | 25 ++++++++++++++++++++-----
arch/x86/kernel/cpu/mce/core.c | 2 ++
arch/x86/kernel/nmi.c | 2 ++
arch/x86/kernel/traps.c | 6 ++++++
arch/x86/mm/fault.c | 9 +++++++--
9 files changed, 68 insertions(+), 50 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index ead6a4c72e6a..3f4788dbbde7 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -51,6 +51,7 @@ __visible noinstr void return_from_fork(struct pt_regs *regs,
regs->ax = 0;
}
syscall_exit_to_user_mode(regs);
+ switch_to_user_cr3();
}

static __always_inline void run_syscall(sys_call_ptr_t sysfunc,
@@ -74,6 +75,7 @@ static __always_inline void run_syscall(sys_call_ptr_t sysfunc,
#ifdef CONFIG_X86_64
__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
+ switch_to_kernel_cr3();
nr = syscall_enter_from_user_mode(regs, nr);

instrumentation_begin();
@@ -91,12 +93,14 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)

instrumentation_end();
syscall_exit_to_user_mode(regs);
+ switch_to_user_cr3();
}
#endif

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs)
{
+ switch_to_kernel_cr3();
if (IS_ENABLED(CONFIG_IA32_EMULATION))
current_thread_info()->status |= TS_COMPAT;

@@ -131,11 +135,11 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)

do_syscall_32_irqs_on(regs, nr);
syscall_exit_to_user_mode(regs);
+ switch_to_user_cr3();
}

-static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
+static noinstr bool __do_fast_syscall_32(struct pt_regs *regs, long nr)
{
- unsigned int nr = syscall_32_enter(regs);
int res;

/*
@@ -179,6 +183,9 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
{
+ unsigned int nr = syscall_32_enter(regs);
+ bool syscall_done;
+
/*
* Called using the internal vDSO SYSENTER/SYSCALL32 calling
* convention. Adjust regs so it looks like we entered using int80.
@@ -194,7 +201,9 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
regs->ip = landing_pad;

/* Invoke the syscall. If it failed, keep it simple: use IRET. */
- if (!__do_fast_syscall_32(regs))
+ syscall_done = __do_fast_syscall_32(regs, nr);
+ switch_to_user_cr3();
+ if (!syscall_done)
return 0;

#ifdef CONFIG_X86_64
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 797effbe65b6..4be15a5ffe68 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -98,7 +98,6 @@ SYM_CODE_START(entry_SYSCALL_64)
swapgs
/* tss.sp2 is scratch space. */
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp

SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
@@ -192,18 +191,14 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
*/
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
- POP_REGS pop_rdi=0 skip_r11rcx=1
+ POP_REGS skip_r11rcx=1

/*
- * We are on the trampoline stack. All regs except RDI are live.
* We are on the trampoline stack. All regs except RSP are live.
* We can do future final exit work right here.
*/
STACKLEAK_ERASE_NOCLOBBER

- SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
-
- popq %rdi
movq RSP-ORIG_RAX(%rsp), %rsp
USERGS_SYSRET64
SYM_CODE_END(entry_SYSCALL_64)
@@ -321,7 +316,6 @@ SYM_CODE_END(ret_from_fork)
swapgs
cld
FENCE_SWAPGS_USER_ENTRY
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
UNWIND_HINT_IRET_REGS base=%rdx offset=8
@@ -592,19 +586,15 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
ud2
1:
#endif
- POP_REGS pop_rdi=0
+ POP_REGS
+ addq $8, %rsp /* skip regs->orig_ax */

/*
- * We are on the trampoline stack. All regs except RDI are live.
+ * We are on the trampoline stack. All regs are live.
* We can do future final exit work right here.
*/
STACKLEAK_ERASE_NOCLOBBER

- SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
-
- /* Restore RDI. */
- popq %rdi
- addq $8, %rsp /* skip regs->orig_ax */
SWAPGS
INTERRUPT_RETURN

@@ -899,8 +889,6 @@ SYM_CODE_START_LOCAL(error_entry)
*/
SWAPGS
FENCE_SWAPGS_USER_ENTRY
- /* We have user CR3. Change to kernel CR3. */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

.Lerror_entry_from_usermode_after_swapgs:
/*
@@ -959,11 +947,10 @@ SYM_CODE_START_LOCAL(error_entry)
.Lerror_bad_iret:
/*
* We came from an IRET to user mode, so we have user
- * gsbase and CR3. Switch to kernel gsbase and CR3:
+ * gsbase and CR3. Switch to kernel gsbase.
*/
SWAPGS
FENCE_SWAPGS_USER_ENTRY
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

/*
* Pretend that the exception came from user mode: set up pt_regs
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 541fdaf64045..a6fb5807bf42 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -51,10 +51,6 @@ SYM_CODE_START(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
SWAPGS

- pushq %rax
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
- popq %rax
-
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp

/* Construct struct pt_regs on stack */
@@ -204,9 +200,6 @@ SYM_CODE_START(entry_SYSCALL_compat)
/* Stash user ESP */
movl %esp, %r8d

- /* Use %rsp as scratch reg. User ESP is stashed in r8 */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
-
/* Switch to the kernel stack */
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp

@@ -291,18 +284,6 @@ sysret32_from_system_call:
* code. We zero R8-R10 to avoid info leaks.
*/
movq RSP-ORIG_RAX(%rsp), %rsp
-
- /*
- * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
- * on the process stack which is not mapped to userspace and
- * not readable after we SWITCH_TO_USER_CR3. Delay the CR3
- * switch until after after the last reference to the process
- * stack.
- *
- * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
- */
- SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
-
xorl %r8d, %r8d
xorl %r9d, %r9d
xorl %r10d, %r10d
@@ -357,9 +338,6 @@ SYM_CODE_START(entry_INT80_compat)
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */

- /* Need to switch before accessing the thread stack. */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
-
/* In the Xen PV case we already run on the thread stack. */
ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV

diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index b75e9230c990..32e9f3159131 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -157,10 +157,24 @@ static __always_inline void switch_to_user_cr3(void)
native_write_cr3(cr3);
}

+static __always_inline void kernel_pgtable_enter(struct pt_regs *regs)
+{
+ if (user_mode(regs))
+ switch_to_kernel_cr3();
+}
+
+static __always_inline void kernel_pgtable_exit(struct pt_regs *regs)
+{
+ if (user_mode(regs))
+ switch_to_user_cr3();
+}
+
#else /* CONFIG_PAGE_TABLE_ISOLATION */

static inline void switch_to_kernel_cr3(void) {}
static inline void switch_to_user_cr3(void) {}
+static inline void kernel_pgtable_enter(struct pt_regs *regs) {};
+static inline void kernel_pgtable_exit(struct pt_regs *regs) {};

#endif /* CONFIG_PAGE_TABLE_ISOLATION */

diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 647af7ea3bf1..d8bfcd8a4db4 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -147,12 +147,15 @@ static __always_inline void __##func(struct pt_regs *regs); \
\
__visible noinstr void func(struct pt_regs *regs) \
{ \
- irqentry_state_t state = irqentry_enter(regs); \
+ irqentry_state_t state; \
\
+ kernel_pgtable_enter(regs); \
+ state = irqentry_enter(regs); \
instrumentation_begin(); \
run_idt(__##func, regs); \
instrumentation_end(); \
irqentry_exit(regs, state); \
+ kernel_pgtable_exit(regs); \
} \
\
static __always_inline void __##func(struct pt_regs *regs)
@@ -194,12 +197,15 @@ static __always_inline void __##func(struct pt_regs *regs, \
__visible noinstr void func(struct pt_regs *regs, \
unsigned long error_code) \
{ \
- irqentry_state_t state = irqentry_enter(regs); \
+ irqentry_state_t state; \
\
+ kernel_pgtable_enter(regs); \
+ state = irqentry_enter(regs); \
instrumentation_begin(); \
run_idt_errcode(__##func, regs, error_code); \
instrumentation_end(); \
irqentry_exit(regs, state); \
+ kernel_pgtable_exit(regs); \
} \
\
static __always_inline void __##func(struct pt_regs *regs, \
@@ -290,8 +296,10 @@ static __always_inline void __##func(struct pt_regs *regs, u8 vector); \
__visible noinstr void func(struct pt_regs *regs, \
unsigned long error_code) \
{ \
- irqentry_state_t state = irqentry_enter(regs); \
+ irqentry_state_t state; \
\
+ kernel_pgtable_enter(regs); \
+ state = irqentry_enter(regs); \
instrumentation_begin(); \
irq_enter_rcu(); \
kvm_set_cpu_l1tf_flush_l1d(); \
@@ -300,6 +308,7 @@ __visible noinstr void func(struct pt_regs *regs, \
irq_exit_rcu(); \
instrumentation_end(); \
irqentry_exit(regs, state); \
+ kernel_pgtable_exit(regs); \
} \
\
static __always_inline void __##func(struct pt_regs *regs, u8 vector)
@@ -333,8 +342,10 @@ static void __##func(struct pt_regs *regs); \
\
__visible noinstr void func(struct pt_regs *regs) \
{ \
- irqentry_state_t state = irqentry_enter(regs); \
+ irqentry_state_t state; \
\
+ kernel_pgtable_enter(regs); \
+ state = irqentry_enter(regs); \
instrumentation_begin(); \
irq_enter_rcu(); \
kvm_set_cpu_l1tf_flush_l1d(); \
@@ -342,6 +353,7 @@ __visible noinstr void func(struct pt_regs *regs) \
irq_exit_rcu(); \
instrumentation_end(); \
irqentry_exit(regs, state); \
+ kernel_pgtable_exit(regs); \
} \
\
static noinline void __##func(struct pt_regs *regs)
@@ -362,8 +374,10 @@ static __always_inline void __##func(struct pt_regs *regs); \
\
__visible noinstr void func(struct pt_regs *regs) \
{ \
- irqentry_state_t state = irqentry_enter(regs); \
+ irqentry_state_t state; \
\
+ kernel_pgtable_enter(regs); \
+ state = irqentry_enter(regs); \
instrumentation_begin(); \
__irq_enter_raw(); \
kvm_set_cpu_l1tf_flush_l1d(); \
@@ -371,6 +385,7 @@ __visible noinstr void func(struct pt_regs *regs) \
__irq_exit_raw(); \
instrumentation_end(); \
irqentry_exit(regs, state); \
+ kernel_pgtable_exit(regs); \
} \
\
static __always_inline void __##func(struct pt_regs *regs)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 827088f981c6..e1ae901c4925 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -2037,9 +2037,11 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
{
unsigned long dr7;

+ switch_to_kernel_cr3();
dr7 = local_db_save();
run_idt(exc_machine_check_user, regs);
local_db_restore(dr7);
+ switch_to_user_cr3();
}
#else
/* 32bit unified entry point */
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 23c92ffd58fe..063474f5b5fe 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -542,8 +542,10 @@ DEFINE_IDTENTRY_NMI(exc_nmi)

__visible noinstr void exc_nmi_user(struct pt_regs *regs)
{
+ switch_to_kernel_cr3();
handle_nmi(regs);
mds_user_clear_cpu_buffers();
+ switch_to_user_cr3();
}

void stop_nmi(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1801791748b8..6c78eeb60d19 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -255,11 +255,13 @@ DEFINE_IDTENTRY_RAW(exc_invalid_op)
if (!user_mode(regs) && handle_bug(regs))
return;

+ kernel_pgtable_enter(regs);
state = irqentry_enter(regs);
instrumentation_begin();
run_idt(handle_invalid_op, regs);
instrumentation_end();
irqentry_exit(regs, state);
+ kernel_pgtable_exit(regs);
}

DEFINE_IDTENTRY(exc_coproc_segment_overrun)
@@ -663,11 +665,13 @@ DEFINE_IDTENTRY_RAW(exc_int3)
* including NMI.
*/
if (user_mode(regs)) {
+ switch_to_kernel_cr3();
irqentry_enter_from_user_mode(regs);
instrumentation_begin();
run_idt(do_int3_user, regs);
instrumentation_end();
irqentry_exit_to_user_mode(regs);
+ switch_to_user_cr3();
} else {
bool irq_state = idtentry_enter_nmi(regs);
instrumentation_begin();
@@ -1001,7 +1005,9 @@ DEFINE_IDTENTRY_DEBUG(exc_debug)
/* User entry, runs on regular task stack */
DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
{
+ switch_to_kernel_cr3();
run_idt_errcode(exc_debug_user, regs, debug_read_clear_dr6());
+ switch_to_user_cr3();
}
#else
/* 32 bit does not have separate entry points. */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b9d03603d95d..613a864840ab 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1440,9 +1440,11 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code,

DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
{
- unsigned long address = read_cr2();
+ unsigned long address;
irqentry_state_t state;

+ kernel_pgtable_enter(regs);
+ address = read_cr2();
prefetchw(&current->mm->mmap_lock);

/*
@@ -1466,8 +1468,10 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
* The async #PF handling code takes care of idtentry handling
* itself.
*/
- if (kvm_handle_async_pf(regs, (u32)address))
+ if (kvm_handle_async_pf(regs, (u32)address)) {
+ kernel_pgtable_exit(regs);
return;
+ }

/*
* Entry handling for valid #PF from kernel mode is slightly
@@ -1486,4 +1490,5 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
instrumentation_end();

irqentry_exit(regs, state);
+ kernel_pgtable_exit(regs);
}
--
2.18.4