[PATCH 4/6] x86: efistub: Perform 4/5 level paging switch from the stub

From: Ard Biesheuvel
Date: Mon Apr 24 2023 - 12:58:11 EST


In preparation for updating the EFI stub boot flow to avoid the bare
metal decompressor code altogether, implement the support code for
switching between 4 and 5 levels of paging before jumping to the kernel
proper.

Signed-off-by: Ard Biesheuvel <ardb@xxxxxxxxxx>
---
drivers/firmware/efi/libstub/efi-stub-helper.c | 4 +
drivers/firmware/efi/libstub/x86-stub.c | 145 ++++++++++++++++++++
2 files changed, 149 insertions(+)

diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index 1e0203d74691ffcc..fc5f3b4c45e91401 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -16,6 +16,8 @@

#include "efistub.h"

+extern bool efi_no5lvl;
+
bool efi_nochunk;
bool efi_nokaslr = !IS_ENABLED(CONFIG_RANDOMIZE_BASE);
bool efi_novamap;
@@ -73,6 +75,8 @@ efi_status_t efi_parse_options(char const *cmdline)
efi_loglevel = CONSOLE_LOGLEVEL_QUIET;
} else if (!strcmp(param, "noinitrd")) {
efi_noinitrd = true;
+ } else if (IS_ENABLED(CONFIG_X86_64) && !strcmp(param, "no5lvl")) {
+ efi_no5lvl = true;
} else if (!strcmp(param, "efi") && val) {
efi_nochunk = parse_option_str(val, "nochunk");
efi_novamap |= parse_option_str(val, "novamap");
diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c
index e136c94037dda8d3..7b8717cbb96a1246 100644
--- a/drivers/firmware/efi/libstub/x86-stub.c
+++ b/drivers/firmware/efi/libstub/x86-stub.c
@@ -760,6 +760,139 @@ static efi_status_t exit_boot(struct boot_params *boot_params, void *handle)
return EFI_SUCCESS;
}

+#ifdef CONFIG_X86_64
+bool efi_no5lvl;
+
+static const struct desc_struct gdt[] = {
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
+};
+
+static void (*la57_toggle)(void *cr3, void *gdt);
+
+static void __naked tmpl_toggle(void *cr3, void *gdt)
+{
+ /*
+ * This is template code that will be copied into a 32-bit addressable
+ * buffer, allowing us to drop to 32-bit mode with paging disabled,
+ * which is required to be able to toggle the CR4.LA57 bit.
+ *
+ * The first MOVB instruction is only there to capture the size of the
+ * sequence, and implicitly, the offset to the LJMP's immediate, which
+ * will be populated with the correct absolute address after copying.
+ */
+ asm("0: movb $(4f - .), %%al \n\t"
+ " lgdt (%%rsi) \n\t"
+ " movw %[ds], %%ax \n\t"
+ " movw %%ax, %%ds \n\t"
+ " movw %%ax, %%ss \n\t"
+ " leaq 2f(%%rip), %%rax \n\t"
+ " pushq %[cs32] \n\t"
+ " pushq %%rax \n\t"
+ " lretq \n\t"
+ "1: retq \n\t"
+ " .code32 \n\t"
+ "2: movl %%cr0, %%eax \n\t"
+ " btrl %[pg], %%eax \n\t"
+ " movl %%eax, %%cr0 \n\t"
+ " jmp 3f \n\t"
+ "3: movl %%cr4, %%ecx \n\t"
+ " btcl %[la57], %%ecx \n\t"
+ " movl %%ecx, %%cr4 \n\t"
+ " movl %%edi, %%cr3 \n\t"
+ " btsl %[pg], %%eax \n\t"
+ " movl %%eax, %%cr0 \n\t"
+ " ljmpl %[cs], $(1b - 0b) \n\t"
+ "4: .code64"
+ :
+ : [cs32] "i"(__KERNEL32_CS),
+ [cs] "i"(__KERNEL_CS),
+ [ds] "i"(__KERNEL_DS),
+ [pg] "i"(X86_CR0_PG_BIT),
+ [la57] "i"(X86_CR4_LA57_BIT));
+}
+
+/*
+ * Enabling (or disabling) 5 level paging is tricky, because it can only be
+ * done from 32-bit mode with paging disabled. This means not only that the
+ * code itself must be running from 32-bit addressable physical memory, but
+ * also that the root page table must be 32-bit addressable, as we cannot
+ * program a 64-bit value into CR3 when running in 32-bit mode.
+ */
+static efi_status_t efi_setup_5level_paging(void)
+{
+ const u8 tmpl_size = ((u8 *)tmpl_toggle)[1];
+ efi_status_t status;
+ u8 *la57_code;
+
+ if (!efi_is_64bit())
+ return EFI_SUCCESS;
+
+ /* check for 5 level paging support */
+ if (native_cpuid_eax(0) < 7 ||
+ !(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
+ return EFI_SUCCESS;
+
+ /* allocate some 32-bit addressable memory for code and a page table */
+ status = efi_allocate_pages(2 * PAGE_SIZE, (unsigned long *)&la57_code,
+ U32_MAX);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ la57_toggle = memcpy(la57_code, tmpl_toggle, tmpl_size);
+ memset(la57_code + tmpl_size, 0x90, PAGE_SIZE - tmpl_size);
+
+ /*
+ * To avoid having to allocate a 32-bit addressable stack, we use a
+ * ljmp to switch back to long mode. However, this takes an absolute
+ * address, so we have to poke it in at runtime. The dummy MOVB
+ * instruction at the beginning can be used to locate the immediate.
+ */
+ *(u32 *)&la57_code[tmpl_size - 6] += (unsigned long)la57_code;
+
+ adjust_memory_range_protection((unsigned long)la57_code, PAGE_SIZE);
+
+ return EFI_SUCCESS;
+}
+
+static void efi_5level_switch(void)
+{
+ bool want_la57 = IS_ENABLED(CONFIG_X86_5LEVEL) && !efi_no5lvl;
+ bool have_la57 = native_read_cr4() & X86_CR4_LA57;
+ bool need_toggle = want_la57 ^ have_la57;
+ u64 *pgt = (void *)la57_toggle + PAGE_SIZE;
+ u64 *cr3 = (u64 *)__native_read_cr3();
+ struct desc_ptr desc;
+ u64 *new_cr3;
+
+ if (!la57_toggle || !need_toggle)
+ return;
+
+ if (!have_la57) {
+ /*
+ * We are going to enable 5 level paging, so we need to
+ * allocate a root level page from the 32-bit addressable
+ * physical region, and plug the existing hierarchy into it.
+ */
+ new_cr3 = memset(pgt, 0, PAGE_SIZE);
+ new_cr3[0] = (u64)cr3 | _PAGE_TABLE_NOENC;
+ } else {
+ // take the new root table pointer from the current entry #0
+ new_cr3 = (u64 *)(cr3[0] & PAGE_MASK);
+
+ // copy the new root level table if it is not 32-bit addressable
+ if ((u64)new_cr3 > U32_MAX)
+ new_cr3 = memcpy(pgt, new_cr3, PAGE_SIZE);
+ }
+
+ desc.size = sizeof(gdt) - 1;
+ desc.address = (u64)gdt;
+
+ la57_toggle(new_cr3, &desc);
+}
+#endif
+
/*
* On success, we return the address of startup_32, which has potentially been
* relocated by efi_relocate_kernel.
@@ -792,6 +925,14 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
(get_efi_config_table(ACPI_20_TABLE_GUID) ?:
get_efi_config_table(ACPI_TABLE_GUID));

+#ifdef CONFIG_X86_64
+ status = efi_setup_5level_paging();
+ if (status != EFI_SUCCESS) {
+ efi_err("efi_setup_5level_paging() failed!\n");
+ goto fail;
+ }
+#endif
+
/*
* If the kernel isn't already loaded at a suitable address,
* relocate it.
@@ -910,6 +1051,10 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
goto fail;
}

+#ifdef CONFIG_X86_64
+ efi_5level_switch();
+#endif
+
return bzimage_addr;
fail:
efi_err("efi_main() failed!\n");
--
2.39.2