[PATCH] LDT improvements

From: Andy Lutomirski
Date: Thu Dec 07 2017 - 02:22:30 EST


I think I like this approach. I also think it might be nice to move the
whole cpu_entry_area into this new pgd range so that we can stop mucking
around with the fixmap.

TODO:
- It crashes in ldt_gdt_64. Not sure why.
- 5-level docs aren't updated and the code is untested.

Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxx>
---
Documentation/x86/x86_64/mm.txt | 11 ++--
arch/x86/include/asm/mmu_context.h | 33 +++++++++-
arch/x86/include/asm/pgtable_64_types.h | 2 +
arch/x86/include/asm/processor.h | 23 +++++--
arch/x86/kernel/ldt.c | 109 +++++++++++++++++++++++++++++++-
5 files changed, 161 insertions(+), 17 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 2d7d6590ade8..bfa44e1cb293 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,13 +12,15 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) LDT range
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space (variable)
+[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole

Virtual memory map with 5 level page tables:
@@ -39,8 +41,9 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space
+[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole

Architecture defines a 64-bit virtual address. Implementations can support
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5e1a1ecb65c6..eb87bbeddacc 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -52,13 +52,29 @@ struct ldt_struct {
*/
struct desc_struct *entries;
unsigned int nr_entries;
+
+ /*
+ * If PTI is in use, then the entries array is not mapped while we're
+ * in user mode. The whole array will be aliased at the addressed
+ * given by ldt_slot_va(slot).
+ */
+ int slot;
};

+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static void *ldt_slot_va(int slot)
+{
+ return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+}
+
/*
* Used for LDT copy/destruction.
*/
int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */
static inline int init_new_context_ldt(struct task_struct *tsk,
struct mm_struct *mm)
@@ -90,10 +106,20 @@ static inline void load_mm_ldt(struct mm_struct *mm)
* that we can see.
*/

- if (unlikely(ldt))
- set_ldt(ldt->entries, ldt->nr_entries);
- else
+ if (unlikely(ldt)) {
+ if (static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI)) {
+ if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+ clear_LDT();
+ return;
+ }
+
+ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+ } else {
+ set_ldt(ldt->entries, ldt->nr_entries);
+ }
+ } else {
clear_LDT();
+ }
#else
clear_LDT();
#endif
@@ -185,6 +211,7 @@ static inline void arch_dup_mmap(struct mm_struct *oldmm,
static inline void arch_exit_mmap(struct mm_struct *mm)
{
paravirt_arch_exit_mmap(mm);
+ ldt_arch_exit_mmap(mm);
}

#ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 6d5f45dcd4a1..130f575f8d1e 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -100,6 +100,8 @@ typedef struct { pteval_t pte; } pte_t;
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
#define ESPFIX_PGD_ENTRY _AC(-2, UL)
#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
+#define LDT_PGD_ENTRY _AC(-3, UL)
+#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))
#define EFI_VA_END (-68 * (_AC(1, UL) << 30))

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 9e482d8b0b97..9c18da64daa9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x)

#else
/*
- * User space process size. 47bits minus one guard page. The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously. We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
+ * User space process size. This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen. This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
*/
#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)

diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ae5615b03def..a0008fb26ba2 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -19,6 +19,7 @@
#include <linux/uaccess.h>

#include <asm/ldt.h>
+#include <asm/tlb.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
@@ -46,13 +47,12 @@ static void refresh_ldt_segments(void)
static void flush_ldt(void *__mm)
{
struct mm_struct *mm = __mm;
- mm_context_t *pc;

if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
return;

- pc = &mm->context;
- set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+ __flush_tlb_all();
+ load_mm_ldt(mm);

refresh_ldt_segments();
}
@@ -90,9 +90,93 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
}

new_ldt->nr_entries = num_entries;
+ new_ldt->slot = -1;
return new_ldt;
}

+static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+#ifdef CONFIG_X86_64
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ bool is_vmalloc;
+ int i;
+ bool awful_hack;
+
+ if (!static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI))
+ return 0;
+
+ WARN_ON(ldt->slot != -1);
+
+ /* Both LDT slots are contained in a single PMD. */
+ pgd = pgd_offset(mm, LDT_BASE_ADDR);
+
+ awful_hack = pgd_none(*pgd);
+
+ p4d = p4d_alloc(mm, pgd, LDT_BASE_ADDR);
+ if (!p4d)
+ return -ENOMEM;
+
+ pud = pud_alloc(mm, p4d, LDT_BASE_ADDR);
+ if (!pud)
+ return -ENOMEM;
+ pmd = pmd_alloc(mm, pud, LDT_BASE_ADDR);
+ if (!pmd)
+ return -ENOMEM;
+ if (pte_alloc(mm, pmd, LDT_BASE_ADDR))
+ return -ENOMEM;
+
+ if (true) {
+ /* awful hack -- pti_set_user_pgd is a mess */
+ /* we can't even use the bool awful_hack because of pti_init_all_pgds(), which poops on our pgd. */
+ kernel_to_user_pgdp(pgd)->pgd = pgd->pgd;
+ }
+
+ is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+ for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+ unsigned long offset = i << PAGE_SHIFT;
+ unsigned long va = (unsigned long)ldt_slot_va(slot) + offset;
+ const void *src = (char *)ldt->entries + offset;
+ unsigned long pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+ page_to_pfn(virt_to_page(src));
+
+ pte = pte_offset_kernel(pmd, va);
+ set_pte(pte, pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
+ }
+
+ flush_tlb_mm_range(mm,
+ (unsigned long)ldt_slot_va(slot),
+ (unsigned long)ldt_slot_va(slot) + LDT_SLOT_STRIDE,
+ 0);
+
+ ldt->slot = slot;
+
+ return 0;
+#else
+ return -EINVAL;
+#endif
+}
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_X86_64
+ struct mmu_gather tlb;
+ unsigned long start = LDT_BASE_ADDR;
+ unsigned long end = start + (1UL << PGDIR_SHIFT);
+
+ if (!static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI))
+ return;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ free_pgd_range(&tlb, start, end, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
/* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt)
{
@@ -155,8 +239,17 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
memcpy(new_ldt->entries, old_mm->context.ldt->entries,
new_ldt->nr_entries * LDT_ENTRY_SIZE);
finalize_ldt_struct(new_ldt);
+ retval = map_ldt_struct(mm, new_ldt, 0);
+ if (retval)
+ goto out_free;

mm->context.ldt = new_ldt;
+ goto out_unlock;
+
+out_free:
+ free_ldt_pgtables(mm);
+ free_ldt_struct(new_ldt);
+ return retval;

out_unlock:
mutex_unlock(&old_mm->context.lock);
@@ -174,6 +267,11 @@ void destroy_context_ldt(struct mm_struct *mm)
mm->context.ldt = NULL;
}

+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+ free_ldt_pgtables(mm);
+}
+
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
struct mm_struct *mm = current->mm;
@@ -285,6 +383,11 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)

new_ldt->entries[ldt_info.entry_number] = ldt;
finalize_ldt_struct(new_ldt);
+ error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+ if (error) {
+ free_ldt_struct(old_ldt);
+ goto out_unlock;
+ }

install_ldt(mm, new_ldt);
free_ldt_struct(old_ldt);
--
2.13.6