Re: [PATCHv2 04/29] x86/traps: Add #VE support for TDX guest

From: Thomas Gleixner
Date: Tue Feb 01 2022 - 16:03:06 EST


On Mon, Jan 24 2022 at 18:01, Kirill A. Shutemov wrote:
> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
> index df0fa695bb09..1da074123c16 100644
> --- a/arch/x86/kernel/idt.c
> +++ b/arch/x86/kernel/idt.c
> @@ -68,6 +68,9 @@ static const __initconst struct idt_data early_idts[] = {
> */
> INTG(X86_TRAP_PF, asm_exc_page_fault),
> #endif
> +#ifdef CONFIG_INTEL_TDX_GUEST
> + INTG(X86_TRAP_VE, asm_exc_virtualization_exception),
> +#endif
>
> +bool tdx_get_ve_info(struct ve_info *ve)
> +{
> + struct tdx_module_output out;
> +
> + /*
> + * NMIs and machine checks are suppressed. Before this point any
> + * #VE is fatal. After this point (TDGETVEINFO call), NMIs and
> + * additional #VEs are permitted (but it is expected not to
> + * happen unless kernel panics).

I really do not understand that comment. #NMI and #MC are suppressed
according to the above. How long are they suppressed and what's the
mechanism? Are they unblocked on return from __tdx_module_call() ?

What prevents a nested #VE? If it happens what makes it fatal? Is it
converted to a #DF or detected by software?

Also I do not understand what the last sentence tries to tell me. If the
suppression of #NMI and #MC is lifted on return from tdcall then both
can be delivered immediately afterwards, right?

I assume the additional #VE is triggered by software or a bug in the
kernel.

Confused.

> + */
> + if (__tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out))
> + return false;
> +
> + ve->exit_reason = out.rcx;
> + ve->exit_qual = out.rdx;
> + ve->gla = out.r8;
> + ve->gpa = out.r9;
> + ve->instr_len = lower_32_bits(out.r10);
> + ve->instr_info = upper_32_bits(out.r10);
> +
> + return true;
> +}
> +
> +/*
> + * Handle the user initiated #VE.
> + *
> + * For example, executing the CPUID instruction from user space
> + * is a valid case and hence the resulting #VE has to be handled.
> + *
> + * For dis-allowed or invalid #VE just return failure.
> + */
> +static bool tdx_virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
> +{
> + pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
> + return false;
> +}
> +
> +/* Handle the kernel #VE */
> +static bool tdx_virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
> +{
> + pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
> + return false;
> +}
> +
> +bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
> +{
> + bool ret;
> +
> + if (user_mode(regs))
> + ret = tdx_virt_exception_user(regs, ve);
> + else
> + ret = tdx_virt_exception_kernel(regs, ve);
> +
> + /* After successful #VE handling, move the IP */
> + if (ret)
> + regs->ip += ve->instr_len;
> +
> + return ret;
> +}
> +
> bool is_tdx_guest(void)
> {
> return tdx_guest_detected;
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index c9d566dcf89a..428504535912 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -61,6 +61,7 @@
> #include <asm/insn.h>
> #include <asm/insn-eval.h>
> #include <asm/vdso.h>
> +#include <asm/tdx.h>
>
> #ifdef CONFIG_X86_64
> #include <asm/x86_init.h>
> @@ -1212,6 +1213,115 @@ DEFINE_IDTENTRY(exc_device_not_available)
> }
> }
>
> +#ifdef CONFIG_INTEL_TDX_GUEST
> +
> +#define VE_FAULT_STR "VE fault"
> +
> +static void ve_raise_fault(struct pt_regs *regs, long error_code)
> +{
> + struct task_struct *tsk = current;
> +
> + if (user_mode(regs)) {
> + tsk->thread.error_code = error_code;
> + tsk->thread.trap_nr = X86_TRAP_VE;
> + show_signal(tsk, SIGSEGV, "", VE_FAULT_STR, regs, error_code);
> + force_sig(SIGSEGV);
> + return;
> + }
> +
> + /*
> + * Attempt to recover from #VE exception failure without
> + * triggering OOPS (useful for MSR read/write failures)
> + */
> + if (fixup_exception(regs, X86_TRAP_VE, error_code, 0))
> + return;
> +
> + tsk->thread.error_code = error_code;
> + tsk->thread.trap_nr = X86_TRAP_VE;
> +
> + /*
> + * To be potentially processing a kprobe fault and to trust the result
> + * from kprobe_running(), it should be non-preemptible.
> + */
> + if (!preemptible() && kprobe_running() &&
> + kprobe_fault_handler(regs, X86_TRAP_VE))
> + return;
> +
> + /* Notify about #VE handling failure, useful for debugger hooks */
> + if (notify_die(DIE_GPF, VE_FAULT_STR, regs, error_code,
> + X86_TRAP_VE, SIGSEGV) == NOTIFY_STOP)
> + return;
> +
> + /* Trigger OOPS and panic */
> + die_addr(VE_FAULT_STR, regs, error_code, 0);

This is pretty much a copy of the #GP handling. So why not consolidate
this properly?

--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -559,6 +559,36 @@ static bool fixup_iopl_exception(struct
return true;
}

+static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, long error_code,
+ const char *str)
+{
+ if (fixup_exception(regs, trapnr, error_code, 0))
+ return true;
+
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
+
+ /*
+ * To be potentially processing a kprobe fault and to trust the result
+ * from kprobe_running(), we have to be non-preemptible.
+ */
+ if (!preemptible() && kprobe_running() &&
+ kprobe_fault_handler(regs, trapnr))
+ return true;
+
+ /* Last step: ask the die notifier chain; true only if it returns NOTIFY_STOP. */
+ return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP;
+}
+
+static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr, long error_code,
+ const char *str)
+{
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
+ show_signal(current, SIGSEGV, "", str, regs, error_code);
+ force_sig(SIGSEGV);
+}
+
DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
@@ -587,34 +617,14 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_pr
if (fixup_iopl_exception(regs))
goto exit;

- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_GP;
-
if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0))
goto exit;

- show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
- force_sig(SIGSEGV);
+ gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc);
goto exit;
}

- if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
- goto exit;
-
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_GP;
-
- /*
- * To be potentially processing a kprobe fault and to trust the result
- * from kprobe_running(), we have to be non-preemptible.
- */
- if (!preemptible() &&
- kprobe_running() &&
- kprobe_fault_handler(regs, X86_TRAP_GP))
- goto exit;
-
- ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV);
- if (ret == NOTIFY_STOP)
+ if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc))
goto exit;

if (error_code)

which makes this:

/*
 * Raise a fault for a #VE, built on the helpers shared with #GP handling:
 * a user-mode fault gets SIGSEGV via gp_user_force_sig_segv(); otherwise
 * gp_try_fixup_and_notify() gets a chance to deal with it, and die_addr()
 * is called only if that helper returns false.
 */
static void ve_raise_fault(struct pt_regs *regs, long error_code)
{
	if (user_mode(regs)) {
		gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR);
		return;
	}

	if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR))
		return;

	die_addr(VE_FAULT_STR, regs, error_code, 0);
}

Hmm?

> +/*
> + * Virtualization Exceptions (#VE) are delivered to TDX guests due to
> + * specific guest actions which may happen in either user space or the
> + * kernel:
> + *
> + * * Specific instructions (WBINVD, for example)
> + * * Specific MSR accesses
> + * * Specific CPUID leaf accesses
> + * * Access to unmapped pages (EPT violation)
> + *
> + * In the settings that Linux will run in, virtualization exceptions are
> + * never generated on accesses to normal, TD-private memory that has been
> + * accepted.
> + *
> + * Syscall entry code has a critical window where the kernel stack is not
> + * yet set up. Any exception in this window leads to hard to debug issues
> + * and can be exploited for privilege escalation. Exceptions in the NMI
> + * entry code also cause issues. Returning from the exception handler with
> + * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack.
> + *
> + * For these reasons, the kernel avoids #VEs during the syscall gap and
> + * the NMI entry code. Entry code paths do not access TD-shared memory,
> + * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves
> + * that might generate #VE.

How is that enforced or validated? What checks for a violation of that
assumption?

Thanks,

tglx