[patch 23/60] x86/entry/64: Make cpu_entry_area.tss read-only

From: Thomas Gleixner
Date: Mon Dec 04 2017 - 12:04:56 EST


From: Andy Lutomirski <luto@xxxxxxxxxx>

The TSS is a fairly juicy target for exploits, and, now that the TSS
is in the cpu_entry_area, it's no longer protected by kASLR. Make it
read-only on x86_64.

On x86_32, it can't be RO because it's written by the CPU during task
switches, and we use a task gate for double faults. I'd also be
nervous about errata if we tried to make it RO even on configurations
without double fault handling.

[ tglx: AMD confirmed that there is no problem on 64bit with TSS RO. So
it's probably safe to assume that it's a non issue, though Intel
might have been creative in that area. Still waiting for
confirmation. ]

Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxx>
Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Kees Cook <keescook@xxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Brian Gerst <brgerst@xxxxxxxxx>
Cc: David Laight <David.Laight@xxxxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxxxx>
Link: https://lkml.kernel.org/r/7d2f65f86a46e3489ba996932554485c3d345632.1512109321.git.luto@xxxxxxxxxx

---
arch/x86/entry/entry_32.S | 4 ++--
arch/x86/entry/entry_64.S | 8 ++++----
arch/x86/include/asm/fixmap.h | 13 +++++++++----
arch/x86/include/asm/processor.h | 17 ++++++++---------
arch/x86/include/asm/switch_to.h | 4 ++--
arch/x86/include/asm/thread_info.h | 2 +-
arch/x86/kernel/asm-offsets.c | 5 ++---
arch/x86/kernel/asm-offsets_32.c | 4 ++--
arch/x86/kernel/cpu/common.c | 29 +++++++++++++++++++----------
arch/x86/kernel/ioport.c | 2 +-
arch/x86/kernel/process.c | 6 +++---
arch/x86/kernel/process_32.c | 2 +-
arch/x86/kernel/process_64.c | 2 +-
arch/x86/kernel/traps.c | 4 ++--
arch/x86/lib/delay.c | 4 ++--
arch/x86/xen/enlighten_pv.c | 2 +-
16 files changed, 60 insertions(+), 48 deletions(-)

--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -942,7 +942,7 @@ ENTRY(debug)

/* Are we currently on the SYSENTER stack? */
movl PER_CPU_VAR(cpu_entry_area), %ecx
- addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
+ addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
cmpl $SIZEOF_SYSENTER_stack, %ecx
jb .Ldebug_from_sysenter_stack
@@ -986,7 +986,7 @@ ENTRY(nmi)

/* Are we currently on the SYSENTER stack? */
movl PER_CPU_VAR(cpu_entry_area), %ecx
- addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
+ addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
cmpl $SIZEOF_SYSENTER_stack, %ecx
jb .Lnmi_from_sysenter_stack
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -158,7 +158,7 @@ END(native_usergs_sysret64)
_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)

/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
-#define RSP_SCRATCH CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \
+#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \
SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA

ENTRY(entry_SYSCALL_64_trampoline)
@@ -394,7 +394,7 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
* Save old stack pointer and switch to trampoline stack.
*/
movq %rsp, %rdi
- movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp

pushq RSP-RDI(%rdi) /* RSP */
pushq (%rdi) /* RDI */
@@ -723,7 +723,7 @@ GLOBAL(swapgs_restore_regs_and_return_to
* Save old stack pointer and switch to trampoline stack.
*/
movq %rsp, %rdi
- movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp

/* Copy the IRET frame to the trampoline stack. */
pushq 6*8(%rdi) /* SS */
@@ -938,7 +938,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work
/*
* Exception entry points.
*/
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)

/*
* Switch to the thread stack. This is called with the IRET frame and
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -56,9 +56,14 @@ struct cpu_entry_area {
char gdt[PAGE_SIZE];

/*
- * The GDT is just below cpu_tss and thus serves (on x86_64) as a
- * a read-only guard page for the SYSENTER stack at the bottom
- * of the TSS region.
+ * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as
+ * a a read-only guard page.
+ */
+ struct SYSENTER_stack_page SYSENTER_stack_page;
+
+ /*
+ * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
+ * we need task switches to work, and task switches write to the TSS.
*/
struct tss_struct tss;

@@ -241,7 +246,7 @@ static inline struct cpu_entry_area *get

static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
{
- return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack;
+ return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack;
}

#endif /* !__ASSEMBLY__ */
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -341,13 +341,11 @@ struct SYSENTER_stack {
unsigned long words[64];
};

-struct tss_struct {
- /*
- * Space for the temporary SYSENTER stack, used for SYSENTER
- * and the entry trampoline as well.
- */
- struct SYSENTER_stack SYSENTER_stack;
+struct SYSENTER_stack_page {
+ struct SYSENTER_stack stack;
+} __aligned(PAGE_SIZE);

+struct tss_struct {
/*
* The fixed hardware portion. This must not cross a page boundary
* at risk of violating the SDM's advice and potentially triggering
@@ -364,7 +362,7 @@ struct tss_struct {
unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
} __aligned(PAGE_SIZE);

-DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);

/*
* sizeof(unsigned long) coming from an extra "long" at the end
@@ -379,7 +377,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_
#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
#else
-#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1
+/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
+#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
#endif

/*
@@ -539,7 +538,7 @@ static inline void native_set_iopl_mask(
static inline void
native_load_sp0(unsigned long sp0)
{
- this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}

static inline void native_swapgs(void)
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -79,10 +79,10 @@ do { \
static inline void refresh_sysenter_cs(struct thread_struct *thread)
{
/* Only happens when SEP is enabled, no need to test "SEP"arately: */
- if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
+ if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
return;

- this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
+ this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
}
#endif
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -207,7 +207,7 @@ static inline int arch_within_stack_fram
#else /* !__ASSEMBLY__ */

#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss + TSS_sp1)
+# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
#endif

#endif
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -94,10 +94,9 @@ void common(void) {
BLANK();
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));

- OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack);
- DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
-
/* Layout info for cpu_entry_area */
OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
+ OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page);
+ DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
}
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -47,8 +47,8 @@ void foo(void)
BLANK();

/* Offset from the sysenter stack to tss.sp0 */
- DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
- offsetofend(struct tss_struct, SYSENTER_stack));
+ DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
+ offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));

#ifdef CONFIG_CC_STACKPROTECTOR
BLANK();
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -511,6 +511,9 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char,
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
#endif

+static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
+ SYSENTER_stack_storage);
+
static void __init
set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
{
@@ -524,23 +527,29 @@ static void __init setup_cpu_entry_area(
#ifdef CONFIG_X86_64
extern char _entry_trampoline[];

- /* On 64-bit systems, we use a read-only fixmap GDT. */
+ /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
pgprot_t gdt_prot = PAGE_KERNEL_RO;
+ pgprot_t tss_prot = PAGE_KERNEL_RO;
#else
/*
* On native 32-bit systems, the GDT cannot be read-only because
* our double fault handler uses a task gate, and entering through
- * a task gate needs to change an available TSS to busy. If the GDT
- * is read-only, that will triple fault.
+ * a task gate needs to change an available TSS to busy. If the
+ * GDT is read-only, that will triple fault. The TSS cannot be
+ * read-only because the CPU writes to it on task switches.
*
- * On Xen PV, the GDT must be read-only because the hypervisor requires
- * it.
+ * On Xen PV, the GDT must be read-only because the hypervisor
+ * requires it.
*/
pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
PAGE_KERNEL_RO : PAGE_KERNEL;
+ pgprot_t tss_prot = PAGE_KERNEL;
#endif

__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
+ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
+ per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
+ PAGE_KERNEL);

/*
* The Intel SDM says (Volume 3, 7.2.1):
@@ -563,9 +572,9 @@ static void __init setup_cpu_entry_area(
offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
- &per_cpu(cpu_tss, cpu),
+ &per_cpu(cpu_tss_rw, cpu),
sizeof(struct tss_struct) / PAGE_SIZE,
- PAGE_KERNEL);
+ tss_prot);

#ifdef CONFIG_X86_32
per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
@@ -1330,7 +1339,7 @@ void enable_sep_cpu(void)
return;

cpu = get_cpu();
- tss = &per_cpu(cpu_tss, cpu);
+ tss = &per_cpu(cpu_tss_rw, cpu);

/*
* We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
@@ -1600,7 +1609,7 @@ void cpu_init(void)
if (cpu)
load_ucode_ap();

- t = &per_cpu(cpu_tss, cpu);
+ t = &per_cpu(cpu_tss_rw, cpu);
oist = &per_cpu(orig_ist, cpu);

#ifdef CONFIG_NUMA
@@ -1692,7 +1701,7 @@ void cpu_init(void)
{
int cpu = smp_processor_id();
struct task_struct *curr = current;
- struct tss_struct *t = &per_cpu(cpu_tss, cpu);
+ struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);

wait_for_master_cpu(cpu);

--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long
* because the ->io_bitmap_max value must match the bitmap
* contents:
*/
- tss = &per_cpu(cpu_tss, get_cpu());
+ tss = &per_cpu(cpu_tss_rw, get_cpu());

if (turn_on)
bitmap_clear(t->io_bitmap_ptr, from, num);
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -47,7 +47,7 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = {
.x86_tss = {
/*
* .sp0 is only used when entering ring 0 from a lower
@@ -82,7 +82,7 @@
.io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
};
-EXPORT_PER_CPU_SYMBOL(cpu_tss);
+EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);

DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
@@ -111,7 +111,7 @@ void exit_thread(struct task_struct *tsk
struct fpu *fpu = &t->fpu;

if (bp) {
- struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
+ struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());

t->io_bitmap_ptr = NULL;
clear_thread_flag(TIF_IO_BITMAP);
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -234,7 +234,7 @@ EXPORT_SYMBOL_GPL(start_thread);
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+ struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);

/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -399,7 +399,7 @@ void compat_start_thread(struct pt_regs
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+ struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);

WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
this_cpu_read(irq_count) != -1);
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -365,7 +365,7 @@ dotraplinkage void do_double_fault(struc
regs->cs == __KERNEL_CS &&
regs->ip == (unsigned long)native_irq_return_iret)
{
- struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1;
+ struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;

/*
* regs->sp points to the failing IRET frame on the
@@ -654,7 +654,7 @@ struct bad_iret_stack *fixup_bad_iret(st
* exception came from the IRET target.
*/
struct bad_iret_stack *new_stack =
- (struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1;
+ (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;

/* Copy the IRET target to the new stack. */
memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long _
delay = min_t(u64, MWAITX_MAX_LOOPS, loops);

/*
- * Use cpu_tss as a cacheline-aligned, seldomly
+ * Use cpu_tss_rw as a cacheline-aligned, seldomly
* accessed per-cpu variable as the monitor target.
*/
- __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
+ __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);

/*
* AMD, like Intel, supports the EAX hint and EAX=0xf
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -826,7 +826,7 @@ static void xen_load_sp0(unsigned long s
mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
xen_mc_issue(PARAVIRT_LAZY_CPU);
- this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}

void xen_set_iopl_mask(unsigned mask)