Re: [patch V163 27/51] x86/mm/pti: Populate user PGD

From: Thomas Gleixner
Date: Tue Dec 19 2017 - 19:22:36 EST


On Tue, 19 Dec 2017, Thomas Gleixner wrote:
> On Tue, 19 Dec 2017, Ingo Molnar wrote:
> We don't run out of space, but the 0-day robot triggered a nasty issue.
>
> The bottom of the fixmap, which contains the early_ioremap fixmap area, is:
>
> vaddr_bt = FIXADDR_TOP - FIX_BTMAP_BEGIN * PAGE_SIZE
>
> If that address is lower than:
>
> vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
>
> then cleanup_highmap() will happily zero out the PMD entry for the PTE page
> of FIX_BTMAP. That entry was set up earlier in early_ioremap_init().
>
> As a consequence, the first call to __early_set_fixmap(), which tries to
> install a PTE for early_ioremap(), will crash and burn.
>
> Below is a nasty hack which fixes the problem. Ideally we get all of this
> cpu_entry_stuff out of the fixmap. I'll look into that later, but for now
> the patch 'fixes' the issue.
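
For illustration, the collision boils down to simple arithmetic. Here is a
stand-alone user-space sketch; the constants are illustrative assumptions
for a 4-level build with KASLR and a large NR_CPUS, not the values of any
particular config:

#include <stdio.h>

int main(void)
{
	/* Illustrative assumptions, not taken from a real config: */
	unsigned long fixaddr_top = 0xffffffffff5ff000UL; /* FIXADDR_TOP, below the vsyscall page */
	unsigned long kernel_map  = 0xffffffff80000000UL; /* __START_KERNEL_map */
	unsigned long image_size  = 1UL << 30;            /* KERNEL_IMAGE_SIZE with KASLR */
	unsigned long page_size   = 4096UL;

	/*
	 * With the cpu_entry_area in the fixmap, FIX_BTMAP_BEGIN grows
	 * linearly with NR_CPUS; assume ~44 fixmap pages per CPU and
	 * NR_CPUS = 8192:
	 */
	unsigned long fix_btmap_begin = 44UL * 8192;

	unsigned long vaddr_bt  = fixaddr_top - fix_btmap_begin * page_size;
	unsigned long vaddr_end = kernel_map + image_size;

	printf("fixmap bottom: 0x%016lx\n", vaddr_bt);
	printf("vaddr_end:     0x%016lx\n", vaddr_end);
	printf("overlap:       %s\n", vaddr_bt < vaddr_end ? "yes" : "no");
	return 0;
}

With these numbers the fixmap bottom lands below vaddr_end, and
cleanup_highmap() wipes the PMD entry backing FIX_BTMAP.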

I had a stab at moving the cpu_entry_area to some other place.
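
For reference, a quick sanity check of the arithmetic behind the new
placement (again a stand-alone user-space sketch; the shift of 39 is
PGDIR_SHIFT on 4-level builds, where P4D_SHIFT is identical because the
p4d level is folded):

#include <stdio.h>

int main(void)
{
	unsigned long shift = 39;	/* PGDIR_SHIFT == P4D_SHIFT, 4-level */

	/* Negative top-level indices, as in pgtable_64_types.h below: */
	printf("LDT remap      (-4): 0x%016lx\n", 0UL - (4UL << shift));
	printf("cpu_entry_area (-3): 0x%016lx\n", 0UL - (3UL << shift));
	printf("ESPFIX         (-2): 0x%016lx\n", 0UL - (2UL << shift));
	return 0;
}

This prints 0xfffffe0000000000, 0xfffffe8000000000 and 0xffffff0000000000,
which match the mm.txt update below.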

The patch below works, but:

- it breaks the i386 build because I have not yet found a way to place the
CPU_ENTRY_AREA_BASE without creating include recursion hell

- it probably does not work on XEN_PV, but I'm too tired now to figure
that out.

Thanks,

tglx

8<-------------------

 Documentation/x86/x86_64/mm.txt         |    4 
 arch/x86/events/intel/ds.c              |   53 ++++++------
 arch/x86/include/asm/desc.h             |    1 
 arch/x86/include/asm/fixmap.h           |   89 --------------------
 arch/x86/include/asm/pgtable_32_types.h |    6 -
 arch/x86/include/asm/pgtable_64_types.h |   49 ++++++-----
 arch/x86/kernel/cpu/common.c            |  125 ----------------------------
 arch/x86/kernel/dumpstack.c             |    1 
 arch/x86/kernel/traps.c                 |    5 -
 arch/x86/mm/Makefile                    |    2 
 arch/x86/mm/dump_pagetables.c           |    2 
 arch/x86/mm/kasan_init_64.c             |    6 -
 arch/x86/mm/pti.c                       |   39 +++------
 arch/x86/xen/mmu_pv.c                   |    2 
 b/arch/x86/include/asm/cpu_entry_area.h |   79 ++++++++++++++++++
 b/arch/x86/mm/cpu_entry_area.c          |  138 ++++++++++++++++++++++++++++++++
 16 files changed, 309 insertions(+), 292 deletions(-)

--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,7 +12,8 @@ ffffea0000000000 - ffffeaffffffffff (=40
... unused hole ...
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
... unused hole ...
-fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
@@ -36,6 +37,7 @@ ffd4000000000000 - ffd5ffffffffffff (=49
... unused hole ...
ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -76,36 +76,39 @@ typedef struct { pteval_t pte; } pte_t;
#define PGDIR_MASK (~(PGDIR_SIZE - 1))

/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
-#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
#ifdef CONFIG_X86_5LEVEL
-#define VMALLOC_SIZE_TB _AC(12800, UL)
-#define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
-#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
-#define LDT_PGD_ENTRY _AC(-112, UL)
-#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
+#define VMALLOC_SIZE_TB _AC(12800, UL)
+#define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
+#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
+#define LDT_PGD_ENTRY _AC(-112, UL)
+#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#else
-#define VMALLOC_SIZE_TB _AC(32, UL)
-#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
-#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
-#define LDT_PGD_ENTRY _AC(-3, UL)
-#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
+#define VMALLOC_SIZE_TB _AC(32, UL)
+#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
+#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
+#define LDT_PGD_ENTRY _AC(-4, UL)
+#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#endif
#ifdef CONFIG_RANDOMIZE_MEMORY
-#define VMALLOC_START vmalloc_base
-#define VMEMMAP_START vmemmap_base
+#define VMALLOC_START vmalloc_base
+#define VMEMMAP_START vmemmap_base
#else
-#define VMALLOC_START __VMALLOC_BASE
-#define VMEMMAP_START __VMEMMAP_BASE
+#define VMALLOC_START __VMALLOC_BASE
+#define VMEMMAP_START __VMEMMAP_BASE
#endif /* CONFIG_RANDOMIZE_MEMORY */
-#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
-#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
+
+#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
+#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
/* The module sections ends with the start of the fixmap */
-#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1)
-#define MODULES_LEN (MODULES_END - MODULES_VADDR)
-#define ESPFIX_PGD_ENTRY _AC(-2, UL)
-#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
-#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))
-#define EFI_VA_END (-68 * (_AC(1, UL) << 30))
+#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1)
+#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+#define CPU_ENTRY_AREA_PGD _AC(-3, UL)
+#define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT)
+#define ESPFIX_PGD_ENTRY _AC(-2, UL)
+#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
+#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))
+#define EFI_VA_END (-68 * (_AC(1, UL) << 30))

#define EARLY_DYNAMIC_PAGE_TABLES 64

--- /dev/null
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -0,0 +1,79 @@
+#ifndef _ASM_X86_CPU_ENTRY_AREA_H
+#define _ASM_X86_CPU_ENTRY_AREA_H
+
+#include <asm/processor.h>
+#include <asm/intel_ds.h>
+
+/*
+ * cpu_entry_area is a percpu region that contains things needed by the CPU
+ * and early entry/exit code. Real types aren't used for all fields here
+ * to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+ char gdt[PAGE_SIZE];
+
+ /*
+ * The GDT is just below entry_stack and thus serves (on x86_64) as
+ * a read-only guard page.
+ */
+ struct entry_stack_page entry_stack_page;
+
+ /*
+ * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
+ * we need task switches to work, and task switches write to the TSS.
+ */
+ struct tss_struct tss;
+
+ char entry_trampoline[PAGE_SIZE];
+
+#ifdef CONFIG_X86_64
+ /*
+ * Exception stacks used for IST entries.
+ *
+ * In the future, this should have a separate slot for each stack
+ * with guard pages between them.
+ */
+ char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+#endif
+#ifdef CONFIG_CPU_SUP_INTEL
+ /*
+ * Per CPU debug store for Intel performance monitoring. Wastes a
+ * full page at the moment.
+ */
+ struct debug_store cpu_debug_store;
+ /*
+ * The actual PEBS/BTS buffers must be mapped to user space.
+ * Reserve enough PTEs.
+ */
+ struct debug_store_buffers cpu_debug_buffers;
+#endif
+};
+
+#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
+#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS)
+
+extern void setup_cpu_entry_areas(void);
+extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
+
+#define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE
+#define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
+
+#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT)
+
+static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
+{
+ unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
+ BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+ return (struct cpu_entry_area *) va;
+}
+
+static inline struct entry_stack *cpu_entry_stack(int cpu)
+{
+ return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
+}
+
+#endif
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -19,7 +19,6 @@
#include <asm/acpi.h>
#include <asm/apicdef.h>
#include <asm/page.h>
-#include <asm/intel_ds.h>
#ifdef CONFIG_X86_32
#include <linux/threads.h>
#include <asm/kmap_types.h>
@@ -45,57 +44,6 @@ extern unsigned long __FIXADDR_TOP;
PAGE_SIZE)
#endif

-/*
- * cpu_entry_area is a percpu region in the fixmap that contains things
- * needed by the CPU and early entry/exit code. Real types aren't used
- * for all fields here to avoid circular header dependencies.
- *
- * Every field is a virtual alias of some other allocated backing store.
- * There is no direct allocation of a struct cpu_entry_area.
- */
-struct cpu_entry_area {
- char gdt[PAGE_SIZE];
-
- /*
- * The GDT is just below entry_stack and thus serves (on x86_64) as
- * a a read-only guard page.
- */
- struct entry_stack_page entry_stack_page;
-
- /*
- * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
- * we need task switches to work, and task switches write to the TSS.
- */
- struct tss_struct tss;
-
- char entry_trampoline[PAGE_SIZE];
-
-#ifdef CONFIG_X86_64
- /*
- * Exception stacks used for IST entries.
- *
- * In the future, this should have a separate slot for each stack
- * with guard pages between them.
- */
- char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
-#endif
-#ifdef CONFIG_CPU_SUP_INTEL
- /*
- * Per CPU debug store for Intel performance monitoring. Wastes a
- * full page at the moment.
- */
- struct debug_store cpu_debug_store;
- /*
- * The actual PEBS/BTS buffers must be mapped to user space
- * Reserve enough fixmap PTEs.
- */
- struct debug_store_buffers cpu_debug_buffers;
-#endif
-};
-
-#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
-
-extern void setup_cpu_entry_areas(void);

/*
* Here we define all the compile-time 'special' virtual
@@ -158,17 +106,7 @@ enum fixed_addresses {
FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
FIX_TEXT_POKE0, /* first page is last, because allocation is backward */

- /*
- * Fixmap entries to remap the IDT, and the per CPU entry areas.
- * Aligned to a PMD boundary.
- */
- FIX_USR_SHARED_TOP = round_up(FIX_TEXT_POKE0 + 1, PTRS_PER_PMD),
- FIX_RO_IDT,
- FIX_CPU_ENTRY_AREA_TOP,
- FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
- FIX_USR_SHARED_BOTTOM = round_up(FIX_CPU_ENTRY_AREA_BOTTOM + 2, PTRS_PER_PMD) - 1,
-
- __end_of_permanent_fixed_addresses = FIX_USR_SHARED_BOTTOM,
+ __end_of_permanent_fixed_addresses = FIX_TEXT_POKE0,

/*
* 512 temporary boot-time mappings, used by early_ioremap(),
@@ -249,30 +187,5 @@ void __init *early_memremap_decrypted_wp
void __early_set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags);

-static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
-{
- BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
-
- return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
-}
-
-#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \
- BUILD_BUG_ON(offset % PAGE_SIZE != 0); \
- __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \
- })
-
-#define get_cpu_entry_area_index(cpu, field) \
- __get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
-
-static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
-{
- return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
-}
-
-static inline struct entry_stack *cpu_entry_stack(int cpu)
-{
- return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
-}
-
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_FIXMAP_H */
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -483,133 +483,8 @@ static const unsigned int exception_stac
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
- [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
#endif

-static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page,
- entry_stack_storage);
-
-/*
- * Force the population of PMDs for not yet allocated per cpu
- * memory like debug store buffers.
- */
-static void __init allocate_percpu_fixmap_ptes(int idx, int pages)
-{
- for (; pages; pages--, idx--)
- __set_fixmap(idx, 0, PAGE_NONE);
-}
-
-static void __init
-set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
-{
- for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
- __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
-}
-
-static void percpu_setup_debug_store(int cpu)
-{
-#ifdef CONFIG_CPU_SUP_INTEL
- int npages, idx;
-
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
- return;
-
- idx = get_cpu_entry_area_index(cpu, cpu_debug_store);
- npages = sizeof(struct debug_store) / PAGE_SIZE;
- BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
- set_percpu_fixmap_pages(idx, &per_cpu(cpu_debug_store, cpu), npages,
- PAGE_KERNEL);
-
- idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers);
- npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
- allocate_percpu_fixmap_ptes(idx, npages);
-#endif
-}
-
-/* Setup the fixmap mappings only once per-processor */
-static void __init setup_cpu_entry_area(int cpu)
-{
-#ifdef CONFIG_X86_64
- extern char _entry_trampoline[];
-
- /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
- pgprot_t gdt_prot = PAGE_KERNEL_RO;
- pgprot_t tss_prot = PAGE_KERNEL_RO;
-#else
- /*
- * On native 32-bit systems, the GDT cannot be read-only because
- * our double fault handler uses a task gate, and entering through
- * a task gate needs to change an available TSS to busy. If the
- * GDT is read-only, that will triple fault. The TSS cannot be
- * read-only because the CPU writes to it on task switches.
- *
- * On Xen PV, the GDT must be read-only because the hypervisor
- * requires it.
- */
- pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
- PAGE_KERNEL_RO : PAGE_KERNEL;
- pgprot_t tss_prot = PAGE_KERNEL;
-#endif
-
- __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page),
- per_cpu_ptr(&entry_stack_storage, cpu), 1,
- PAGE_KERNEL);
-
- /*
- * The Intel SDM says (Volume 3, 7.2.1):
- *
- * Avoid placing a page boundary in the part of the TSS that the
- * processor reads during a task switch (the first 104 bytes). The
- * processor may not correctly perform address translations if a
- * boundary occurs in this area. During a task switch, the processor
- * reads and writes into the first 104 bytes of each TSS (using
- * contiguous physical addresses beginning with the physical address
- * of the first byte of the TSS). So, after TSS access begins, if
- * part of the 104 bytes is not physically contiguous, the processor
- * will access incorrect information without generating a page-fault
- * exception.
- *
- * There are also a lot of errata involving the TSS spanning a page
- * boundary. Assert that we're not doing that.
- */
- BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
- offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
- BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
- &per_cpu(cpu_tss_rw, cpu),
- sizeof(struct tss_struct) / PAGE_SIZE,
- tss_prot);
-
-#ifdef CONFIG_X86_32
- per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
-#endif
-
-#ifdef CONFIG_X86_64
- BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
- BUILD_BUG_ON(sizeof(exception_stacks) !=
- sizeof(((struct cpu_entry_area *)0)->exception_stacks));
- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
- &per_cpu(exception_stacks, cpu),
- sizeof(exception_stacks) / PAGE_SIZE,
- PAGE_KERNEL);
-
- __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
- __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
-#endif
- percpu_setup_debug_store(cpu);
-}
-
-void __init setup_cpu_entry_areas(void)
-{
- unsigned int cpu;
-
- for_each_possible_cpu(cpu)
- setup_cpu_entry_area(cpu);
-}
-
/* Load the original GDT from the per-cpu structure */
void load_direct_gdt(int cpu)
{
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -60,6 +60,7 @@
#include <asm/trace/mpx.h>
#include <asm/mpx.h>
#include <asm/vm86.h>
+#include <asm/cpu_entry_area.h>

#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -950,8 +951,8 @@ void __init trap_init(void)
* "sidt" instruction will not leak the location of the kernel, and
* to defend the IDT against arbitrary memory write vulnerabilities.
* It will be reloaded in cpu_init() */
- __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
- idt_descr.address = fix_to_virt(FIX_RO_IDT);
+ cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), PAGE_KERNEL_RO);
+ idt_descr.address = CPU_ENTRY_AREA_RO_IDT;

/*
* Should be a barrier for any external CPU state:
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o = -pg
endif

obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o physaddr.o setup_nx.o tlb.o
+ pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o

# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
--- /dev/null
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -0,0 +1,138 @@
+#include <linux/percpu.h>
+#include <asm/cpu_entry_area.h>
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include <asm/desc.h>
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+
+
+void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
+{
+ unsigned long va = (unsigned long) cea_vaddr;
+
+ set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags));
+}
+
+/*
+ * Force the population of PMDs for per-CPU memory which is not yet
+ * allocated, like the debug store buffers.
+ */
+static void __init cea_allocate_ptes(void *cea_vaddr, int pages)
+{
+ for (; pages; pages--, cea_vaddr += PAGE_SIZE)
+ cea_set_pte(cea_vaddr, 0, PAGE_NONE);
+}
+
+static void __init
+cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
+{
+ for ( ; pages; pages--, cea_vaddr += PAGE_SIZE, ptr += PAGE_SIZE)
+ cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
+}
+
+static void percpu_setup_debug_store(int cpu)
+{
+#ifdef CONFIG_CPU_SUP_INTEL
+ int npages;
+ void *cea;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
+
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
+ npages = sizeof(struct debug_store) / PAGE_SIZE;
+ BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
+ cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
+ PAGE_KERNEL);
+
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
+ npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
+ cea_allocate_ptes(cea, npages);
+#endif
+}
+
+/* Set up the cpu_entry_area mappings, once per CPU */
+static void __init setup_cpu_entry_area(int cpu)
+{
+#ifdef CONFIG_X86_64
+ extern char _entry_trampoline[];
+
+ /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
+ pgprot_t gdt_prot = PAGE_KERNEL_RO;
+ pgprot_t tss_prot = PAGE_KERNEL_RO;
+#else
+ /*
+ * On native 32-bit systems, the GDT cannot be read-only because
+ * our double fault handler uses a task gate, and entering through
+ * a task gate needs to change an available TSS to busy. If the
+ * GDT is read-only, that will triple fault. The TSS cannot be
+ * read-only because the CPU writes to it on task switches.
+ *
+ * On Xen PV, the GDT must be read-only because the hypervisor
+ * requires it.
+ */
+ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+ PAGE_KERNEL_RO : PAGE_KERNEL;
+ pgprot_t tss_prot = PAGE_KERNEL;
+#endif
+
+ cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu),
+ gdt_prot);
+
+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page,
+ per_cpu_ptr(&entry_stack_storage, cpu), 1,
+ PAGE_KERNEL);
+
+ /*
+ * The Intel SDM says (Volume 3, 7.2.1):
+ *
+ * Avoid placing a page boundary in the part of the TSS that the
+ * processor reads during a task switch (the first 104 bytes). The
+ * processor may not correctly perform address translations if a
+ * boundary occurs in this area. During a task switch, the processor
+ * reads and writes into the first 104 bytes of each TSS (using
+ * contiguous physical addresses beginning with the physical address
+ * of the first byte of the TSS). So, after TSS access begins, if
+ * part of the 104 bytes is not physically contiguous, the processor
+ * will access incorrect information without generating a page-fault
+ * exception.
+ *
+ * There are also a lot of errata involving the TSS spanning a page
+ * boundary. Assert that we're not doing that.
+ */
+ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+ BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss,
+ &per_cpu(cpu_tss_rw, cpu),
+ sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);
+
+#ifdef CONFIG_X86_32
+ per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+#endif
+
+#ifdef CONFIG_X86_64
+ BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
+ BUILD_BUG_ON(sizeof(exception_stacks) !=
+ sizeof(((struct cpu_entry_area *)0)->exception_stacks));
+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
+ &per_cpu(exception_stacks, cpu),
+ sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
+
+ cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
+ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+#endif
+ percpu_setup_debug_store(cpu);
+}
+
+void __init setup_cpu_entry_areas(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ setup_cpu_entry_area(cpu);
+}
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -3,6 +3,7 @@
#include <linux/types.h>
#include <linux/slab.h>

+#include <asm/cpu_entry_area.h>
#include <asm/perf_event.h>
#include <asm/insn.h>

@@ -280,16 +281,22 @@ void fini_debug_store_on_cpu(int cpu)

static DEFINE_PER_CPU(void *, insn_buffer);

-static u64 ds_update_fixmap(int idx, void *addr, size_t size, pgprot_t prot)
+static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
{
- phys_addr_t pa, va;
+ phys_addr_t pa;
size_t msz = 0;

- va = __fix_to_virt(idx);
pa = virt_to_phys(addr);
- for (; msz < size; idx--, msz += PAGE_SIZE, pa += PAGE_SIZE)
- __set_fixmap(idx, pa, prot);
- return va;
+ for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
+ cea_set_pte(cea, pa, prot);
+}
+
+static void ds_clear_cea(void *cea, size_t size)
+{
+ size_t msz = 0;
+
+ for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
+ cea_set_pte(cea, 0, PAGE_NONE);
}

static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
@@ -313,8 +320,8 @@ static int alloc_pebs_buffer(int cpu)
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
struct debug_store *ds = hwev->ds;
size_t bsiz = x86_pmu.pebs_buffer_size;
- int idx, max, node = cpu_to_node(cpu);
- void *buffer, *ibuffer;
+ int max, node = cpu_to_node(cpu);
+ void *buffer, *ibuffer, *cea;

if (!x86_pmu.pebs)
return 0;
@@ -336,10 +343,10 @@ static int alloc_pebs_buffer(int cpu)
per_cpu(insn_buffer, cpu) = ibuffer;
}
hwev->ds_pebs_vaddr = buffer;
- /* Update the fixmap */
- idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers.pebs_buffer);
- ds->pebs_buffer_base = ds_update_fixmap(idx, buffer, bsiz,
- PAGE_KERNEL);
+ /* Update the cpu entry area mapping */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+ ds->pebs_buffer_base = (u64) cea;
+ ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
ds->pebs_index = ds->pebs_buffer_base;
max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
@@ -350,7 +357,7 @@ static void release_pebs_buffer(int cpu)
{
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
struct debug_store *ds = hwev->ds;
- int idx;
+ void *cea;

if (!ds || !x86_pmu.pebs)
return;
@@ -359,8 +366,8 @@ static void release_pebs_buffer(int cpu)
per_cpu(insn_buffer, cpu) = NULL;

/* Clear the fixmap */
- idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers.pebs_buffer);
- ds_update_fixmap(idx, 0, x86_pmu.pebs_buffer_size, PAGE_NONE);
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+ ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
ds->pebs_buffer_base = 0;
dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
hwev->ds_pebs_vaddr = NULL;
@@ -370,8 +377,8 @@ static int alloc_bts_buffer(int cpu)
{
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
struct debug_store *ds = hwev->ds;
- int idx, max;
- void *buffer;
+ void *buffer, *cea;
+ int max;

if (!x86_pmu.bts)
return 0;
@@ -383,9 +390,9 @@ static int alloc_bts_buffer(int cpu)
}
hwev->ds_bts_vaddr = buffer;
/* Update the fixmap */
- idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers.bts_buffer);
- ds->bts_buffer_base = ds_update_fixmap(idx, buffer, BTS_BUFFER_SIZE,
- PAGE_KERNEL);
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+ ds->bts_buffer_base = (u64) cea;
+ ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
ds->bts_index = ds->bts_buffer_base;
max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
ds->bts_absolute_maximum = ds->bts_buffer_base + max;
@@ -397,14 +404,14 @@ static void release_bts_buffer(int cpu)
{
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
struct debug_store *ds = hwev->ds;
- int idx;
+ void *cea;

if (!ds || !x86_pmu.bts)
return;

/* Clear the fixmap */
- idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers.bts_buffer);
- ds_update_fixmap(idx, 0, BTS_BUFFER_SIZE, PAGE_NONE);
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+ ds_clear_cea(cea, BTS_BUFFER_SIZE);
ds->bts_buffer_base = 0;
dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
hwev->ds_bts_vaddr = NULL;
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -7,6 +7,7 @@
#include <asm/mmu.h>
#include <asm/fixmap.h>
#include <asm/irq_vectors.h>
+#include <asm/cpu_entry_area.h>

#include <linux/smp.h>
#include <linux/percpu.h>
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -38,13 +38,13 @@ extern bool __vmalloc_start_set; /* set
#define LAST_PKMAP 1024
#endif

-#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
- & PMD_MASK)
+#define PKMAP_BASE \
+ ((FIXMAP_START - PAGE_SIZE * (LAST_PKMAP + 1)) & PMD_MASK)

#ifdef CONFIG_HIGHMEM
# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
#else
-# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
+# define VMALLOC_END (FIXMAP_START - 2 * PAGE_SIZE)
#endif

#define MODULES_VADDR VMALLOC_START
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -18,6 +18,7 @@
#include <linux/nmi.h>
#include <linux/sysfs.h>

+#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
#include <asm/unwind.h>

--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -62,6 +62,7 @@ enum address_markers_idx {
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
LDT_NR,
#endif
+ CPU_ENTRY_AREA_START_NR,
# ifdef CONFIG_X86_ESPFIX64
ESPFIX_START_NR,
# endif
@@ -97,6 +98,7 @@ static struct addr_marker address_marker
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
{ LDT_BASE_ADDR, "LDT remap" },
#endif
+ { CPU_ENTRY_AREA_BASE, "CPU entry Area" },
# ifdef CONFIG_X86_ESPFIX64
{ ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
# endif
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -15,6 +15,7 @@
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
+#include <asm/cpu_entry_area.h>

extern struct range pfn_mapped[E820_MAX_ENTRIES];

@@ -330,12 +331,13 @@ void __init kasan_init(void)
(unsigned long)kasan_mem_to_shadow(_end),
early_pfn_to_nid(__pa(_stext)));

- shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM);
+ shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
PAGE_SIZE);

- shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE);
+ shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
+ CPU_ENTRY_AREA_TOT_SIZE);
shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
PAGE_SIZE);
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -275,21 +275,25 @@ pti_clone_pmds(unsigned long start, unsi
}
}

-static void __init pti_setup_espfix64(void)
+/*
+ * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
+ * next-level entry on 5-level systems).
+ */
+static void __init pti_clone_p4d(unsigned long addr)
{
-#ifdef CONFIG_X86_ESPFIX64
- /*
- * ESPFIX64 uses a single p4d (i.e. a top-level entry on 4-level
- * systems and a next-level entry on 5-level systems. Share that
- * entry between the user and kernel pagetables.
- */
- pgd_t *kernel_pgd;
p4d_t *kernel_p4d, *user_p4d;
+ pgd_t *kernel_pgd;

- user_p4d = pti_user_pagetable_walk_p4d(ESPFIX_BASE_ADDR);
- kernel_pgd = pgd_offset_k(ESPFIX_BASE_ADDR);
- kernel_p4d = p4d_offset(kernel_pgd, ESPFIX_BASE_ADDR);
+ user_p4d = pti_user_pagetable_walk_p4d(addr);
+ kernel_pgd = pgd_offset_k(addr);
+ kernel_p4d = p4d_offset(kernel_pgd, addr);
*user_p4d = *kernel_p4d;
+}
+
+static void __init pti_setup_espfix64(void)
+{
+#ifdef CONFIG_X86_ESPFIX64
+ pti_clone_p4d(ESPFIX_BASE_ADDR);
#endif
}

@@ -313,20 +317,11 @@ static void __init pti_setup_vsyscall(vo
}

/*
- * Clone the populated PMDs of the user shared fixmaps into the user space
- * visible page table.
+ * Clone the CPU_ENTRY_AREA into the user space visible page table.
*/
static void __init pti_clone_user_shared(void)
{
- unsigned long bot, top;
-
- bot = __fix_to_virt(FIX_USR_SHARED_BOTTOM);
- top = __fix_to_virt(FIX_USR_SHARED_TOP) + PAGE_SIZE;
-
- /* Top of the user shared block must be PMD-aligned. */
- WARN_ON(top & ~PMD_MASK);
-
- pti_clone_pmds(bot, top, 0);
+ pti_clone_p4d(CPU_ENTRY_AREA_BASE);
}

/*
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2261,7 +2261,6 @@ static void xen_set_fixmap(unsigned idx,

switch (idx) {
case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
- case FIX_RO_IDT:
#ifdef CONFIG_X86_32
case FIX_WP_TEST:
# ifdef CONFIG_HIGHMEM
@@ -2272,7 +2271,6 @@ static void xen_set_fixmap(unsigned idx,
#endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
- case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM:
/* All local page mappings */
pte = pfn_pte(phys, prot);
break;