[PATCH v3 44/75] x86/sev-es: Allocate and Map IST stacks for #VC handler

From: Joerg Roedel
Date: Tue Apr 28 2020 - 11:21:54 EST


From: Joerg Roedel <jroedel@xxxxxxx>

Allocate and map enough stacks for the #VC handler to support sufficient
levels of nesting and the NMI-in-#VC scenario.

Also set up the IST entries for the #VC handler on all CPUs because #VC
needs to work before cpu_init() has set up the per-cpu TSS.

Signed-off-by: Joerg Roedel <jroedel@xxxxxxx>
---
arch/x86/include/asm/cpu_entry_area.h | 61 +++++++++++++++++++++++++++
arch/x86/include/asm/page_64_types.h | 1 +
arch/x86/kernel/cpu/common.c | 1 +
arch/x86/kernel/sev-es.c | 40 ++++++++++++++++++
4 files changed, 103 insertions(+)

diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 02c0078d3787..85aac6c63653 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -64,6 +64,61 @@ enum exception_stack_ordering {
#define CEA_ESTACK_PAGES \
(sizeof(struct cea_exception_stacks) / PAGE_SIZE)

+/*
+ * VC Handler IST Stacks
+ *
+ * The IST stacks for the #VC handler are only allocated when SEV-ES is active,
+ * so they are not part of 'struct exception_stacks'.
+ *
+ * The VC handler uses shift_ist so that #VC can be nested. Nesting happens for
+ * example when the #VC handler has to call printk in the case of an error or
+ * when emulating 'movs' instructions.
+ *
+ * NMIs are another special case which can cause nesting of #VC handlers. The
+ * do_nmi() code path can cause #VC, e.g. for RDPMC. An NMI can also hit in
+ * the time window when the #VC handler is raised but before it has shifted its
+ * IST entry. To make sure any #VC raised from the NMI code path uses a new
+ * stack, the NMI handler unconditionally shifts the #VC handler's IST entry.
+ * This can cause one IST stack for #VC to be omitted.
+ *
+ * To support sufficient levels of nesting for the #VC handler, make the number
+ * of nesting levels configurable. It is currently set to 5 to support this
+ * scenario:
+ *
+ * #VC - IST stack 4, IST entry already shifted to 3
+ *
+ * -> NMI - shifts #VC IST entry to 2
+ *
+ * -> #VC(RDPMC) - shifts #VC IST to 1, something goes wrong, print
+ * an error message
+ *
+ * -> #VC(printk) - shifts #VC IST entry to 0, output driver
+ * uses 'movs'
+ *
+ * -> #VC(movs) - shifts IST to unmapped stack, further #VCs will
+ * cause #DF
+ *
+ */
+#define N_VC_STACKS 5
+
+#define VC_STACK_MEMBERS(guardsize, holesize) \
+ char hole[holesize]; \
+ struct { \
+ char guard[guardsize]; \
+ char stack[EXCEPTION_STKSZ]; \
+ } stacks[N_VC_STACKS]; \
+ char top_guard[guardsize]; \
+
+/* Physical storage */
+struct vmm_exception_stacks {
+ VC_STACK_MEMBERS(0, 0)
+};
+
+/* Mapping in cpu_entry_area */
+struct cea_vmm_exception_stacks {
+ VC_STACK_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ)
+};
+
#endif

#ifdef CONFIG_X86_32
@@ -110,6 +165,12 @@ struct cpu_entry_area {
* Exception stacks used for IST entries with guard pages.
*/
struct cea_exception_stacks estacks;
+
+ /*
+ * IST Exception stacks for VC handler - Only allocated and mapped when
+ * SEV-ES is active.
+ */
+ struct cea_vmm_exception_stacks vc_stacks;
#endif
/*
* Per CPU debug store for Intel performance monitoring. Wastes a
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 288b065955b7..d0c6c10c18a0 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -28,6 +28,7 @@
#define IST_INDEX_NMI 1
#define IST_INDEX_DB 2
#define IST_INDEX_MCE 3
+#define IST_INDEX_VC 4

/*
* Set __PAGE_OFFSET to the most negative possible address +
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index bed0cb83fe24..214765635e86 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1808,6 +1808,7 @@ static inline void tss_setup_ist(struct tss_struct *tss)
tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
+ /* IST_INDEX_VC already set up for all CPUs during early boot */
}

#else /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
index a43d80d5e50e..e5d87f2af357 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -17,6 +17,7 @@
#include <linux/kernel.h>
#include <linux/mm.h>

+#include <asm/cpu_entry_area.h>
#include <asm/trap_defs.h>
#include <asm/sev-es.h>
#include <asm/insn-eval.h>
@@ -37,6 +38,9 @@ static struct ghcb __initdata *boot_ghcb;
/* #VC handler runtime per-cpu data */
struct sev_es_runtime_data {
struct ghcb ghcb_page;
+
+ /* Physical storage for the per-cpu IST stacks of the #VC handler */
+ struct vmm_exception_stacks vc_stacks __aligned(PAGE_SIZE);
};

static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
@@ -236,11 +240,46 @@ static void __init sev_es_init_ghcb(int cpu)
memset(&data->ghcb_page, 0, sizeof(data->ghcb_page));
}

+static void __init sev_es_setup_vc_stack(int cpu)
+{
+ struct vmm_exception_stacks *stack;
+ struct sev_es_runtime_data *data;
+ struct cpu_entry_area *cea;
+ struct tss_struct *tss;
+ unsigned long size;
+ char *first_stack;
+ int i;
+
+ data = per_cpu(runtime_data, cpu);
+ stack = &data->vc_stacks;
+ cea = get_cpu_entry_area(cpu);
+
+ /* Map the stacks to the cpu_entry_area */
+ for (i = 0; i < N_VC_STACKS; i++) {
+ void *vaddr = cea->vc_stacks.stacks[i].stack;
+ phys_addr_t pa = __pa(stack->stacks[i].stack);
+
+ cea_set_pte(vaddr, pa, PAGE_KERNEL);
+ }
+
+ /*
+ * The #VC handler IST stack is needed in secondary CPU bringup before
+ * cpu_init() had a chance to set up the rest of the TSS. So set up the
+ * #VC handler's stack pointer here for all CPUs.
+ */
+ first_stack = cea->vc_stacks.stacks[N_VC_STACKS - 1].stack;
+ size = sizeof(cea->vc_stacks.stacks[N_VC_STACKS - 1].stack);
+ tss = per_cpu_ptr(&cpu_tss_rw, cpu);
+
+ tss->x86_tss.ist[IST_INDEX_VC] = (unsigned long)first_stack + size;
+}
+
void __init sev_es_init_vc_handling(void)
{
int cpu;

BUILD_BUG_ON((offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE) != 0);
+ BUILD_BUG_ON((offsetof(struct sev_es_runtime_data, vc_stacks) % PAGE_SIZE) != 0);

if (!sev_es_active())
return;
@@ -249,6 +288,7 @@ void __init sev_es_init_vc_handling(void)
for_each_possible_cpu(cpu) {
sev_es_alloc_runtime_data(cpu);
sev_es_init_ghcb(cpu);
+ sev_es_setup_vc_stack(cpu);
}
}

--
2.17.1