Re: [PATCH v2 10/11] KVM: SVM: implement support for vNMI

From: Santosh Shukla
Date: Mon Dec 05 2022 - 12:07:38 EST


On 11/30/2022 1:07 AM, Maxim Levitsky wrote:
> This patch implements support for injecting pending
> NMIs via the .kvm_x86_set_hw_nmi_pending using new AMD's vNMI
> feature.
>
> Note that the vNMI can't cause a VM exit, which is needed
> when a nested guest intercepts NMIs.
>
> Therefore to avoid breaking nesting, the vNMI is inhibited while
> a nested guest is running and instead, the legacy NMI window
> detection and delivery method is used.
>
> While it is possible to passthrough the vNMI if a nested guest
> doesn't intercept NMIs, such usage is very uncommon, and it's
> not worth to optimize for.
>
> Signed-off-by: Santosh Shukla <santosh.shukla@xxxxxxx>
> Signed-off-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx>
> ---
> arch/x86/kvm/svm/nested.c | 42 +++++++++++++++
> arch/x86/kvm/svm/svm.c | 111 ++++++++++++++++++++++++++++++--------
> arch/x86/kvm/svm/svm.h | 10 ++++
> 3 files changed, 140 insertions(+), 23 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index e891318595113e..5bea672bf8b12d 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -623,6 +623,42 @@ static bool is_evtinj_nmi(u32 evtinj)
> return type == SVM_EVTINJ_TYPE_NMI;
> }
>
> +static void nested_svm_save_vnmi(struct vcpu_svm *svm)
> +{
> + struct vmcb *vmcb01 = svm->vmcb01.ptr;
> +
> + /*
> + * Copy the vNMI state back to software NMI tracking state
> + * for the duration of the nested run
> + */
> +
Nit: extra blank line after the comment block.

> + svm->nmi_masked = vmcb01->control.int_ctl & V_NMI_MASK;
> + svm->vcpu.arch.nmi_pending += vmcb01->control.int_ctl & V_NMI_PENDING;
> +}
> +
> +static void nested_svm_restore_vnmi(struct vcpu_svm *svm)
> +{
> + struct kvm_vcpu *vcpu = &svm->vcpu;
> + struct vmcb *vmcb01 = svm->vmcb01.ptr;
> +
> + /*
> + * Restore the vNMI state from the software NMI tracking state
> + * after a nested run
> + */
> +
Ditto — same extra blank line here as well.

Thanks,
Santosh
> + if (svm->nmi_masked)
> + vmcb01->control.int_ctl |= V_NMI_MASK;
> + else
> + vmcb01->control.int_ctl &= ~V_NMI_MASK;
> +
> + if (vcpu->arch.nmi_pending) {
> + vcpu->arch.nmi_pending--;
> + vmcb01->control.int_ctl |= V_NMI_PENDING;
> + } else
> + vmcb01->control.int_ctl &= ~V_NMI_PENDING;
> +}
> +
> +
> static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
> unsigned long vmcb12_rip,
> unsigned long vmcb12_csbase)
> @@ -646,6 +682,9 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
> else
> int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
>
> + if (vnmi)
> + nested_svm_save_vnmi(svm);
> +
> /* Copied from vmcb01. msrpm_base can be overwritten later. */
> vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
> vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
> @@ -1049,6 +1088,9 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> svm_update_lbrv(vcpu);
> }
>
> + if (vnmi)
> + nested_svm_restore_vnmi(svm);
> +
> /*
> * On vmexit the GIF is set to false and
> * no event can be injected in L1.
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index cfed6ab29c839a..bf10adcf3170a8 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -230,6 +230,8 @@ module_param(dump_invalid_vmcb, bool, 0644);
> bool intercept_smi = true;
> module_param(intercept_smi, bool, 0444);
>
> +bool vnmi = true;
> +module_param(vnmi, bool, 0444);
>
> static bool svm_gp_erratum_intercept = true;
>
> @@ -1299,6 +1301,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
> if (kvm_vcpu_apicv_active(vcpu))
> avic_init_vmcb(svm, vmcb);
>
> + if (vnmi)
> + svm->vmcb->control.int_ctl |= V_NMI_ENABLE;
> +
> if (vgif) {
> svm_clr_intercept(svm, INTERCEPT_STGI);
> svm_clr_intercept(svm, INTERCEPT_CLGI);
> @@ -3487,6 +3492,39 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
> ++vcpu->stat.nmi_injections;
> }
>
> +
> +static bool svm_get_hw_nmi_pending(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> +
> + if (!is_vnmi_enabled(svm))
> + return false;
> +
> + return !!(svm->vmcb->control.int_ctl & V_NMI_MASK);
> +}
> +
> +static bool svm_set_hw_nmi_pending(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> +
> + if (!is_vnmi_enabled(svm))
> + return false;
> +
> + if (svm->vmcb->control.int_ctl & V_NMI_PENDING)
> + return false;
> +
> + svm->vmcb->control.int_ctl |= V_NMI_PENDING;
> + vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
> +
> + /*
> + * NMI isn't yet technically injected but
> + * this rough estimation should be good enough
> + */
> + ++vcpu->stat.nmi_injections;
> +
> + return true;
> +}
> +
> static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> @@ -3582,11 +3620,38 @@ static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
> svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
> }
>
> +static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> +
> + if (is_vnmi_enabled(svm))
> + return svm->vmcb->control.int_ctl & V_NMI_MASK;
> + else
> + return svm->nmi_masked;
> +}
> +
> +static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> +
> + if (is_vnmi_enabled(svm)) {
> + if (masked)
> + svm->vmcb->control.int_ctl |= V_NMI_MASK;
> + else
> + svm->vmcb->control.int_ctl &= ~V_NMI_MASK;
> + } else {
> + svm->nmi_masked = masked;
> + if (masked)
> + svm_enable_iret_interception(svm);
> + else
> + svm_disable_iret_interception(svm);
> + }
> +}
> +
> bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> struct vmcb *vmcb = svm->vmcb;
> - bool ret;
>
> if (!gif_set(svm))
> return true;
> @@ -3594,10 +3659,10 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
> if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
> return false;
>
> - ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
> - (svm->nmi_masked);
> + if (svm_get_nmi_mask(vcpu))
> + return true;
>
> - return ret;
> + return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
> }
>
> static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
> @@ -3615,24 +3680,6 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
> return 1;
> }
>
> -static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
> -{
> - return to_svm(vcpu)->nmi_masked;
> -}
> -
> -static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
> -{
> - struct vcpu_svm *svm = to_svm(vcpu);
> -
> - if (masked) {
> - svm->nmi_masked = true;
> - svm_enable_iret_interception(svm);
> - } else {
> - svm->nmi_masked = false;
> - svm_disable_iret_interception(svm);
> - }
> -}
> -
> bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> @@ -3725,10 +3772,16 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
> /*
> * Something prevents NMI from been injected. Single step over possible
> * problem (IRET or exception injection or interrupt shadow)
> + *
> + * With vNMI we should never need an NMI window
> + * (we can always inject vNMI either by setting VNMI_PENDING or by EVENTINJ)
> */
> + if (WARN_ON_ONCE(is_vnmi_enabled(svm)))
> + return;
> +
> svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
> - svm->nmi_singlestep = true;
> svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
> + svm->nmi_singlestep = true;
> }
>
> static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
> @@ -4770,6 +4823,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
> .patch_hypercall = svm_patch_hypercall,
> .inject_irq = svm_inject_irq,
> .inject_nmi = svm_inject_nmi,
> + .get_hw_nmi_pending = svm_get_hw_nmi_pending,
> + .set_hw_nmi_pending = svm_set_hw_nmi_pending,
> .inject_exception = svm_inject_exception,
> .cancel_injection = svm_cancel_injection,
> .interrupt_allowed = svm_interrupt_allowed,
> @@ -5058,6 +5113,16 @@ static __init int svm_hardware_setup(void)
> pr_info("Virtual GIF supported\n");
> }
>
> +
> + vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_AMD_VNMI);
> + if (vnmi)
> + pr_info("Virtual NMI enabled\n");
> +
> + if (!vnmi) {
> + svm_x86_ops.get_hw_nmi_pending = NULL;
> + svm_x86_ops.set_hw_nmi_pending = NULL;
> + }
> +
> if (lbrv) {
> if (!boot_cpu_has(X86_FEATURE_LBRV))
> lbrv = false;
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 587ddc150f9f34..0b7e1790fadde1 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -35,6 +35,7 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
> extern bool npt_enabled;
> extern int vgif;
> extern bool intercept_smi;
> +extern bool vnmi;
>
> enum avic_modes {
> AVIC_MODE_NONE = 0,
> @@ -553,6 +554,15 @@ static inline bool is_x2apic_msrpm_offset(u32 offset)
> (msr < (APIC_BASE_MSR + 0x100));
> }
>
> +static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
> +{
> + /* L1's vNMI is inhibited while nested guest is running */
> + if (is_guest_mode(&svm->vcpu))
> + return false;
> +
> + return !!(svm->vmcb01.ptr->control.int_ctl & V_NMI_ENABLE);
> +}
> +
> /* svm.c */
> #define MSR_INVALID 0xffffffffU
>