[PATCH RFC 42/43] x86/pie: Allow kernel image to be relocated in top 512G

From: Hou Wenlong
Date: Fri Apr 28 2023 - 05:56:43 EST


A PIE kernel image can be relocated to any virtual address. To keep
things simple, treat the 2G area containing the kernel image, the
modules area and the fixmap area as a single unit, and allow it to be
relocated anywhere in the top 512G of the address space. The relocated
kernel may then sit below __START_KERNEL_map, so introduce a global
variable to hold the base of the relocated kernel image and adapt the
pa/va transformations for kernel image addresses accordingly.

Suggested-by: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>
Signed-off-by: Hou Wenlong <houwenlong.hwl@xxxxxxxxxxxx>
Cc: Thomas Garnier <thgarnie@xxxxxxxxxxxx>
Cc: Kees Cook <keescook@xxxxxxxxxxxx>
---
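Note (after the fold, so it stays out of the commit message): the pa/va
adaptation keeps the existing unsigned-wraparound ("carry flag") trick
from __phys_addr_nodebug(), only swapping the compile-time
__START_KERNEL_map for the runtime KERNEL_MAP_BASE. Below is a minimal
standalone C sketch of that transformation; the values chosen for
PHYS_BASE, PAGE_OFFSET and KERNEL_MAP_BASE are illustrative assumptions,
not taken from this patch.

  /* Standalone sketch of the kernel's __phys_addr_nodebug() logic. */
  #include <stdio.h>

  #define PAGE_OFFSET     0xffff888000000000UL /* direct-map base (4-level) */
  #define KERNEL_MAP_BASE 0xffffffff80000000UL /* a variable under X86_PIE */
  #define PHYS_BASE       0x1000000UL          /* example kernel load offset */

  static unsigned long phys_addr(unsigned long x)
  {
          unsigned long y = x - KERNEL_MAP_BASE;

          /*
           * If x >= KERNEL_MAP_BASE the subtraction does not wrap, so
           * x > y: x is a kernel-image address and phys_base applies.
           * Otherwise the subtraction wrapped (the "carry flag" case)
           * and x is a direct-map address offset from PAGE_OFFSET.
           */
          return y + ((x > y) ? PHYS_BASE : (KERNEL_MAP_BASE - PAGE_OFFSET));
  }

  int main(void)
  {
          /* image address: expect PHYS_BASE + 0x2000 = 0x1002000 */
          printf("%lx\n", phys_addr(KERNEL_MAP_BASE + 0x2000));
          /* direct-map address: expect x - PAGE_OFFSET = 0x42000 */
          printf("%lx\n", phys_addr(PAGE_OFFSET + 0x42000));
          return 0;
  }
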
 arch/x86/include/asm/kmsan.h            |  6 ++---
 arch/x86/include/asm/page_64.h          |  8 +++----
 arch/x86/include/asm/page_64_types.h    |  8 +++++++
 arch/x86/include/asm/pgtable_64_types.h | 10 ++++----
 arch/x86/kernel/head64.c                | 32 ++++++++++++++++++-------
 arch/x86/kernel/head_64.S               | 12 ++++++++++
 arch/x86/kernel/setup.c                 |  6 +++++
 arch/x86/mm/dump_pagetables.c           |  9 ++++---
 arch/x86/mm/init_64.c                   |  8 +++----
 arch/x86/mm/kasan_init_64.c             |  4 ++--
 arch/x86/mm/pat/set_memory.c            |  2 +-
 arch/x86/mm/physaddr.c                  | 14 +++++------
 arch/x86/platform/efi/efi_thunk_64.S    |  4 ++++
 13 files changed, 87 insertions(+), 36 deletions(-)
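
Also after the fold: the RAW_MODULES_* split in pgtable_64_types.h keeps
MODULES_LEN a compile-time constant while letting the whole modules
window follow the relocated kernel map base. A small standalone sketch
of that layout is below; the KERNEL_IMAGE_SIZE value and the 4G
relocation offset are made-up examples, not values from this series.

  /* Sketch: module-area layout under the RAW_* split in this patch. */
  #include <stdio.h>

  #define KERNEL_IMAGE_SIZE  (1UL << 30)           /* illustrative: 1G */
  #define START_KERNEL_MAP   0xffffffff80000000UL
  #define RAW_MODULES_VADDR  (START_KERNEL_MAP + KERNEL_IMAGE_SIZE)
  #define RAW_MODULES_END    0xffffffffff000000UL  /* non-DEBUG case */
  #define MODULES_LEN        (RAW_MODULES_END - RAW_MODULES_VADDR)

  int main(void)
  {
          /* pretend the PIE kernel was relocated 4G down, still in top 512G */
          unsigned long kernel_map_base = START_KERNEL_MAP - (4UL << 30);
          unsigned long modules_vaddr = kernel_map_base + KERNEL_IMAGE_SIZE;
          unsigned long modules_end   = modules_vaddr + MODULES_LEN;

          /* MODULES_LEN stays constant; the [vaddr, end) window moves */
          printf("len=%lx vaddr=%lx end=%lx\n",
                 MODULES_LEN, modules_vaddr, modules_end);
          return 0;
  }
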

diff --git a/arch/x86/include/asm/kmsan.h b/arch/x86/include/asm/kmsan.h
index 8fa6ac0e2d76..a635d825342d 100644
--- a/arch/x86/include/asm/kmsan.h
+++ b/arch/x86/include/asm/kmsan.h
@@ -63,16 +63,16 @@ static inline bool kmsan_phys_addr_valid(unsigned long addr)
static inline bool kmsan_virt_addr_valid(void *addr)
{
unsigned long x = (unsigned long)addr;
- unsigned long y = x - __START_KERNEL_map;
+ unsigned long y = x - KERNEL_MAP_BASE;

- /* use the carry flag to determine if x was < __START_KERNEL_map */
+ /* use the carry flag to determine if x was < KERNEL_MAP_BASE */
if (unlikely(x > y)) {
x = y + phys_base;

if (y >= KERNEL_IMAGE_SIZE)
return false;
} else {
- x = y + (__START_KERNEL_map - PAGE_OFFSET);
+ x = y + (KERNEL_MAP_BASE - PAGE_OFFSET);

/* carry flag will be set if starting x was >= PAGE_OFFSET */
if ((x > y) || !kmsan_phys_addr_valid(x))
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index cc6b8e087192..b8692e6cc939 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -20,10 +20,10 @@ extern unsigned long vmemmap_base;

static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
{
- unsigned long y = x - __START_KERNEL_map;
+ unsigned long y = x - KERNEL_MAP_BASE;

- /* use the carry flag to determine if x was < __START_KERNEL_map */
- x = y + ((x > y) ? phys_base : (__START_KERNEL_map - PAGE_OFFSET));
+ /* use the carry flag to determine if x was < KERNEL_MAP_BASE */
+ x = y + ((x > y) ? phys_base : (KERNEL_MAP_BASE - PAGE_OFFSET));

return x;
}
@@ -34,7 +34,7 @@ extern unsigned long __phys_addr_symbol(unsigned long);
#else
#define __phys_addr(x) __phys_addr_nodebug(x)
#define __phys_addr_symbol(x) \
- ((unsigned long)(x) - __START_KERNEL_map + phys_base)
+ ((unsigned long)(x) - KERNEL_MAP_BASE + phys_base)
#endif

#define __phys_reloc_hide(x) (x)
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index e9e2c3ba5923..933d37845064 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -4,6 +4,8 @@

#ifndef __ASSEMBLY__
#include <asm/kaslr.h>
+
+extern unsigned long kernel_map_base;
#endif

#ifdef CONFIG_KASAN
@@ -49,6 +51,12 @@

#define __START_KERNEL_map _AC(0xffffffff80000000, UL)

+#ifdef CONFIG_X86_PIE
+#define KERNEL_MAP_BASE kernel_map_base
+#else
+#define KERNEL_MAP_BASE __START_KERNEL_map
+#endif /* CONFIG_X86_PIE */
+
/* See Documentation/x86/x86_64/mm.rst for a description of the memory map. */

#define __PHYSICAL_MASK_SHIFT 52
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 38bf837e3554..3d6951128a07 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -187,14 +187,16 @@ extern unsigned int ptrs_per_p4d;
#define KMSAN_MODULES_ORIGIN_START (KMSAN_MODULES_SHADOW_START + MODULES_LEN)
#endif /* CONFIG_KMSAN */

-#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
+#define RAW_MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
+#define MODULES_VADDR (KERNEL_MAP_BASE + KERNEL_IMAGE_SIZE)
/* The module sections ends with the start of the fixmap */
#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
-# define MODULES_END _AC(0xffffffffff000000, UL)
+# define RAW_MODULES_END _AC(0xffffffffff000000, UL)
#else
-# define MODULES_END _AC(0xfffffffffe000000, UL)
+# define RAW_MODULES_END _AC(0xfffffffffe000000, UL)
#endif
-#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+#define MODULES_LEN (RAW_MODULES_END - RAW_MODULES_VADDR)
+#define MODULES_END (MODULES_VADDR + MODULES_LEN)

#define ESPFIX_PGD_ENTRY _AC(-2, UL)
#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index c5cd61aab8ae..234ac796863a 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -66,6 +66,11 @@ unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
EXPORT_SYMBOL(vmemmap_base);
#endif

+#ifdef CONFIG_X86_PIE
+unsigned long kernel_map_base __ro_after_init = __START_KERNEL_map;
+EXPORT_SYMBOL(kernel_map_base);
+#endif
+
/*
* GDT used on the boot CPU before switching to virtual addresses.
*/
@@ -193,6 +198,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
{
unsigned long load_delta, *p;
unsigned long pgtable_flags;
+ unsigned long kernel_map_base_offset = 0;
pgdval_t *pgd;
p4dval_t *p4d;
pudval_t *pud;
@@ -252,6 +258,13 @@ unsigned long __head __startup_64(unsigned long physaddr,
pud[511] += load_delta;
}

+#ifdef CONFIG_X86_PIE
+ kernel_map_base_offset = text_base & PUD_MASK;
+ *fixup_long(&kernel_map_base, physaddr) = kernel_map_base_offset;
+ kernel_map_base_offset -= __START_KERNEL_map;
+ *fixup_long(&__FIXADDR_TOP, physaddr) += kernel_map_base_offset;
+#endif
+
pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
pmd[i] += load_delta;
@@ -328,7 +341,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
/* fixup pages that are part of the kernel image */
for (; i <= pmd_index(end_base); i++)
if (pmd[i] & _PAGE_PRESENT)
- pmd[i] += load_delta;
+ pmd[i] += load_delta + kernel_map_base_offset;

/* invalidate pages after the kernel image */
for (; i < PTRS_PER_PMD; i++)
@@ -338,7 +351,8 @@ unsigned long __head __startup_64(unsigned long physaddr,
* Fixup phys_base - remove the memory encryption mask to obtain
* the true physical address.
*/
- *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
+ *fixup_long(&phys_base, physaddr) += load_delta + kernel_map_base_offset -
+ sme_get_me_mask();

return sme_postprocess_startup(bp, pmd);
}
@@ -376,7 +390,7 @@ bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
if (!pgtable_l5_enabled())
p4d_p = pgd_p;
else if (pgd)
- p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + KERNEL_MAP_BASE - phys_base);
else {
if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
reset_early_page_tables();
@@ -385,13 +399,13 @@ bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd)

p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++];
memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
- *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ *pgd_p = (pgdval_t)p4d_p - KERNEL_MAP_BASE + phys_base + _KERNPG_TABLE;
}
p4d_p += p4d_index(address);
p4d = *p4d_p;

if (p4d)
- pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + KERNEL_MAP_BASE - phys_base);
else {
if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
reset_early_page_tables();
@@ -400,13 +414,13 @@ bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd)

pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
- *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ *p4d_p = (p4dval_t)pud_p - KERNEL_MAP_BASE + phys_base + _KERNPG_TABLE;
}
pud_p += pud_index(address);
pud = *pud_p;

if (pud)
- pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + KERNEL_MAP_BASE - phys_base);
else {
if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
reset_early_page_tables();
@@ -415,7 +429,7 @@ bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd)

pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
- *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ *pud_p = (pudval_t)pmd_p - KERNEL_MAP_BASE + phys_base + _KERNPG_TABLE;
}
pmd_p[pmd_index(address)] = pmd;

@@ -497,6 +511,7 @@ static void __init copy_bootdata(char *real_mode_data)

asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode_data)
{
+#ifndef CONFIG_X86_PIE
/*
* Build-time sanity checks on the kernel image and module
* area mappings. (these are purely build-time and produce no code)
@@ -509,6 +524,7 @@ asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode
BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
(__START_KERNEL & PGDIR_MASK)));
+#endif

cr4_init_shadow();

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 19cb2852238b..feb14304d1ed 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -130,7 +130,13 @@ SYM_CODE_START_NOALIGN(startup_64)
popq %rsi

/* Form the CR3 value being sure to include the CR3 modifier */
+#ifdef CONFIG_X86_PIE
+ movq kernel_map_base(%rip), %rdi
+ movabs $early_top_pgt, %rcx
+ subq %rdi, %rcx
+#else
movabs $(early_top_pgt - __START_KERNEL_map), %rcx
+#endif
addq %rcx, %rax
jmp 1f
SYM_CODE_END(startup_64)
@@ -179,7 +185,13 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
#endif

/* Form the CR3 value being sure to include the CR3 modifier */
+#ifdef CONFIG_X86_PIE
+ movq kernel_map_base(%rip), %rdi
+ movabs $init_top_pgt, %rcx
+ subq %rdi, %rcx
+#else
movabs $(init_top_pgt - __START_KERNEL_map), %rcx
+#endif
addq %rcx, %rax
1:

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 16babff771bd..e68ca78b829c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -808,11 +808,17 @@ static int
dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
{
if (kaslr_enabled()) {
+#ifdef CONFIG_X86_PIE
+ pr_emerg("Kernel Offset: 0x%lx from 0x%lx\n",
+ kaslr_offset(),
+ __START_KERNEL);
+#else
pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
kaslr_offset(),
__START_KERNEL,
__START_KERNEL_map,
MODULES_VADDR-1);
+#endif
} else {
pr_emerg("Kernel Offset: disabled\n");
}
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 81aa1c0b39cc..d5c6f61242aa 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -102,9 +102,9 @@ static struct addr_marker address_markers[] = {
#ifdef CONFIG_EFI
[EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" },
#endif
- [HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" },
- [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" },
- [MODULES_END_NR] = { MODULES_END, "End Modules" },
+ [HIGH_KERNEL_NR] = { 0UL, "High Kernel Mapping" },
+ [MODULES_VADDR_NR] = { 0UL, "Modules" },
+ [MODULES_END_NR] = { 0UL, "End Modules" },
[FIXADDR_START_NR] = { 0UL, "Fixmap Area" },
[END_OF_SPACE_NR] = { -1, NULL }
};
@@ -475,6 +475,9 @@ static int __init pt_dump_init(void)
address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
#endif
+ address_markers[HIGH_KERNEL_NR].start_address = KERNEL_MAP_BASE;
+ address_markers[MODULES_VADDR_NR].start_address = MODULES_VADDR;
+ address_markers[MODULES_END_NR].start_address = MODULES_END;
address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
#endif
#ifdef CONFIG_X86_32
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index b7fd05a1ba1d..54bcd46c229d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -413,7 +413,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
/*
* The head.S code sets up the kernel high mapping:
*
- * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
+ * from KERNEL_MAP_BASE to KERNEL_MAP_BASE + size (== _end-_text)
*
* phys_base holds the negative offset to the kernel, which is added
* to the compile time generated pmds. This results in invalid pmds up
@@ -425,8 +425,8 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
*/
void __init cleanup_highmap(void)
{
- unsigned long vaddr = __START_KERNEL_map;
- unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
+ unsigned long vaddr = KERNEL_MAP_BASE;
+ unsigned long vaddr_end = KERNEL_MAP_BASE + KERNEL_IMAGE_SIZE;
unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
pmd_t *pmd = level2_kernel_pgt;

@@ -436,7 +436,7 @@ void __init cleanup_highmap(void)
* arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
*/
if (max_pfn_mapped)
- vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+ vaddr_end = KERNEL_MAP_BASE + (max_pfn_mapped << PAGE_SHIFT);

for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
if (pmd_none(*pmd))
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 0302491d799d..0edc8fdfb419 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -197,7 +197,7 @@ static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
return (p4d_t *)pgd;

p4d = pgd_val(*pgd) & PTE_PFN_MASK;
- p4d += __START_KERNEL_map - phys_base;
+ p4d += KERNEL_MAP_BASE - phys_base;
return (p4d_t *)p4d + p4d_index(addr);
}

@@ -420,7 +420,7 @@ void __init kasan_init(void)
shadow_cea_per_cpu_begin, 0);

kasan_populate_early_shadow((void *)shadow_cea_end,
- kasan_mem_to_shadow((void *)__START_KERNEL_map));
+ kasan_mem_to_shadow((void *)KERNEL_MAP_BASE));

kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
(unsigned long)kasan_mem_to_shadow(_end),
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index c434aea9939c..2fb89be3a750 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1709,7 +1709,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
if (!within(vaddr, (unsigned long)_text, _brk_end) &&
__cpa_pfn_in_highmap(cpa->pfn)) {
unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
- __START_KERNEL_map - phys_base;
+ KERNEL_MAP_BASE - phys_base;
alias_cpa = *cpa;
alias_cpa.vaddr = &temp_cpa_vaddr;
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index fc3f3d3e2ef2..9cb6d898329c 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -14,15 +14,15 @@
#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
- unsigned long y = x - __START_KERNEL_map;
+ unsigned long y = x - KERNEL_MAP_BASE;

- /* use the carry flag to determine if x was < __START_KERNEL_map */
+ /* use the carry flag to determine if x was < KERNEL_MAP_BASE */
if (unlikely(x > y)) {
x = y + phys_base;

VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
} else {
- x = y + (__START_KERNEL_map - PAGE_OFFSET);
+ x = y + (KERNEL_MAP_BASE - PAGE_OFFSET);

/* carry flag will be set if starting x was >= PAGE_OFFSET */
VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));
@@ -34,7 +34,7 @@ EXPORT_SYMBOL(__phys_addr);

unsigned long __phys_addr_symbol(unsigned long x)
{
- unsigned long y = x - __START_KERNEL_map;
+ unsigned long y = x - KERNEL_MAP_BASE;

/* only check upper bounds since lower bounds will trigger carry */
VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
@@ -46,16 +46,16 @@ EXPORT_SYMBOL(__phys_addr_symbol);

bool __virt_addr_valid(unsigned long x)
{
- unsigned long y = x - __START_KERNEL_map;
+ unsigned long y = x - KERNEL_MAP_BASE;

- /* use the carry flag to determine if x was < __START_KERNEL_map */
+ /* use the carry flag to determine if x was < KERNEL_MAP_BASE */
if (unlikely(x > y)) {
x = y + phys_base;

if (y >= KERNEL_IMAGE_SIZE)
return false;
} else {
- x = y + (__START_KERNEL_map - PAGE_OFFSET);
+ x = y + (KERNEL_MAP_BASE - PAGE_OFFSET);

/* carry flag will be set if starting x was >= PAGE_OFFSET */
if ((x > y) || !phys_addr_valid(x))
diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S
index c4b1144f99f6..0997363821e7 100644
--- a/arch/x86/platform/efi/efi_thunk_64.S
+++ b/arch/x86/platform/efi/efi_thunk_64.S
@@ -52,7 +52,11 @@ STACK_FRAME_NON_STANDARD __efi64_thunk
/*
* Calculate the physical address of the kernel text.
*/
+#ifdef CONFIG_X86_PIE
+ movq kernel_map_base(%rip), %rax
+#else
movq $__START_KERNEL_map, %rax
+#endif
subq phys_base(%rip), %rax

leaq 1f(%rip), %rbp
--
2.31.1