Re: [PATCH v2 05/11] KVM: SVM: Re-inject INT3/INTO instead of retrying the instruction

From: Maxim Levitsky
Date: Thu Apr 28 2022 - 05:49:13 EST


On Sat, 2022-04-23 at 02:14 +0000, Sean Christopherson wrote:
> Re-inject INT3/INTO instead of retrying the instruction if the CPU
> encountered an intercepted exception while vectoring the software
> exception, e.g. if vectoring INT3 encounters a #PF and KVM is using
> shadow paging. Retrying the instruction is architecturally wrong, e.g.
> will result in a spurious #DB if there's a code breakpoint on the INT3/O,
> and lack of re-injection also breaks nested virtualization, e.g. if L1
> injects a software exception and vectoring the injected exception
> encounters an exception that is intercepted by L0 but not L1.
>
> Due to, ahem, deficiencies in the SVM architecture, acquiring the next
> RIP may require flowing through the emulator even if NRIPS is supported,
> as the CPU clears next_rip if the VM-Exit is due to an exception other
> than "exceptions caused by the INT3, INTO, and BOUND instructions". To
> deal with this, "skip" the instruction to calculate next_rip (if it's
> not already known), and then unwind the RIP write and any side effects
> (RFLAGS updates).
>
> Save the computed next_rip and use it to re-stuff next_rip if injection
> doesn't complete. This allows KVM to do the right thing if next_rip was
> known prior to injection, e.g. if L1 injects a soft event into L2, and
> there is no backing INTn instruction, e.g. if L1 is injecting an
> arbitrary event.
>
> Note, it's impossible to guarantee architectural correctness given SVM's
> architectural flaws. E.g. if the guest executes INTn (no KVM injection),
> an exit occurs while vectoring the INTn, and the guest modifies the code
> stream while the exit is being handled, KVM will compute the incorrect
> next_rip due to "skipping" the wrong instruction. A future enhancement
> to make this less awful would be for KVM to detect that the decoded
> instruction is not the correct INTn and drop the to-be-injected soft
> event (retrying is a lesser evil compared to shoving the wrong RIP on the
> exception stack).
>
> Reported-by: Maciej S. Szmigiero <maciej.szmigiero@xxxxxxxxxx>
> Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> ---
>  arch/x86/kvm/svm/nested.c |  28 +++++++-
>  arch/x86/kvm/svm/svm.c    | 140 +++++++++++++++++++++++++++-----------
>  arch/x86/kvm/svm/svm.h    |   6 +-
>  3 files changed, 130 insertions(+), 44 deletions(-)
>
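Purely to check my own reading of the skip/unwind + re-stuff flow described in the changelog above, here is a tiny userspace-only sketch of the bookkeeping (not kernel code; only the soft_int_* names mirror the patch, every other name and the toy_* helpers are made up, and the "next_rip is cleared on exit" behaviour is only modelled implicitly):

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for the per-vCPU state the patch adds. */
struct toy_vcpu {
	unsigned long rip;		/* guest RIP */
	unsigned long cs_base;		/* CS base, for linear-RIP checks */
	unsigned long vmcb_next_rip;	/* models vmcb->control.next_rip */
	bool nrips;			/* NextRIP supported/enabled */

	/* bookkeeping mirroring the soft_int_* fields in struct vcpu_svm */
	bool soft_int_injected;
	unsigned long soft_int_csbase;
	unsigned long soft_int_old_rip;
	unsigned long soft_int_next_rip;
};

/* Injection side: stash old/next RIP so a failed injection can be fixed up. */
static void toy_queue_soft_exception(struct toy_vcpu *v, unsigned long insn_len)
{
	unsigned long old_rip = v->rip;
	unsigned long next_rip = old_rip + insn_len;	/* result of the "skip" */

	v->soft_int_injected = true;
	v->soft_int_csbase   = v->cs_base;
	v->soft_int_old_rip  = old_rip;
	v->soft_int_next_rip = next_rip;

	if (v->nrips)
		v->vmcb_next_rip = next_rip;	/* hardware advances RIP */
	else
		v->rip = next_rip;		/* KVM advances RIP manually */
}

/* Completion side: vectoring didn't finish, prepare to re-inject the event. */
static void toy_complete_soft_exception(struct toy_vcpu *v)
{
	if (!v->soft_int_injected)
		return;
	v->soft_int_injected = false;

	if (v->nrips) {
		/* next_rip was lost on the exit; restore the snapshot. */
		if (v->rip + v->cs_base == v->soft_int_old_rip + v->soft_int_csbase)
			v->vmcb_next_rip = v->soft_int_next_rip;
	} else {
		/* unwind the manual RIP advancement done at injection time */
		if (v->rip + v->cs_base == v->soft_int_next_rip + v->soft_int_csbase)
			v->rip = v->soft_int_old_rip;
	}
}

int main(void)
{
	struct toy_vcpu v = { .rip = 0x1000, .cs_base = 0, .nrips = false };

	toy_queue_soft_exception(&v, 1);	/* e.g. INT3 is one byte */
	toy_complete_soft_exception(&v);	/* exit occurred while vectoring */
	printf("rip after unwind: 0x%lx\n", v.rip);	/* back to 0x1000 */
	return 0;
}

With .nrips = false this prints 0x1000, i.e. the manual advancement is unwound; with .nrips = true the saved next_rip is re-stuffed instead. The real code additionally gates the fix-up on the exit_int_info type/vector, which the toy leaves out.
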
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index 461c5f247801..0163238aa198 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -609,6 +609,21 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
>          }
>  }
>
> +static inline bool is_evtinj_soft(u32 evtinj)
> +{
> +        u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
> +        u8 vector = evtinj & SVM_EVTINJ_VEC_MASK;
> +
> +        if (!(evtinj & SVM_EVTINJ_VALID))
> +                return false;
> +
> +        /*
> +         * Intentionally return false for SOFT events, SVM doesn't yet support
> +         * re-injecting soft interrupts.
> +         */
> +        return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector);
> +}
> +
>  static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
>                                            unsigned long vmcb12_rip)
>  {
> @@ -677,6 +692,16 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
>          else if (boot_cpu_has(X86_FEATURE_NRIPS))
>                  vmcb02->control.next_rip = vmcb12_rip;
>
> +        if (is_evtinj_soft(vmcb02->control.event_inj)) {
> +                svm->soft_int_injected = true;
> +                svm->soft_int_csbase = svm->vmcb->save.cs.base;
> +                svm->soft_int_old_rip = vmcb12_rip;
> +                if (svm->nrips_enabled)
> +                        svm->soft_int_next_rip = svm->nested.ctl.next_rip;
> +                else
> +                        svm->soft_int_next_rip = vmcb12_rip;
> +        }
> +
>          vmcb02->control.virt_ext = vmcb01->control.virt_ext &
>                                     LBR_CTL_ENABLE_MASK;
>          if (svm->lbrv_enabled)
> @@ -849,6 +874,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
>
>  out_exit_err:
>          svm->nested.nested_run_pending = 0;
> +        svm->soft_int_injected = false;
>
>          svm->vmcb->control.exit_code = SVM_EXIT_ERR;
>          svm->vmcb->control.exit_code_hi = 0;
> @@ -1618,7 +1644,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
>          nested_copy_vmcb_control_to_cache(svm, ctl);
>
>          svm_switch_vmcb(svm, &svm->nested.vmcb02);
> -        nested_vmcb02_prepare_control(svm, save->rip);
> +        nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip);

Is this change intentional?

>
>          /*
>           * While the nested guest CR3 is already checked and set by
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 14bc4e87437b..8321f9ce5e35 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -342,9 +342,11 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
>
>  }
>
> -static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
> +static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
> +                                           bool commit_side_effects)
>  {
>          struct vcpu_svm *svm = to_svm(vcpu);
> +        unsigned long old_rflags;
>
>          /*
>           * SEV-ES does not expose the next RIP. The RIP update is controlled by
> @@ -359,18 +361,75 @@ static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
>          }
>
>          if (!svm->next_rip) {
> +                if (unlikely(!commit_side_effects))
> +                        old_rflags = svm->vmcb->save.rflags;
> +
>                  if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
>                          return 0;
> +
> +                if (unlikely(!commit_side_effects))
> +                        svm->vmcb->save.rflags = old_rflags;
>          } else {
>                  kvm_rip_write(vcpu, svm->next_rip);
>          }
>
>  done:
> -        svm_set_interrupt_shadow(vcpu, 0);
> +        if (likely(commit_side_effects))
> +                svm_set_interrupt_shadow(vcpu, 0);
>
>          return 1;
>  }
>
> +static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
> +{
> +        return __svm_skip_emulated_instruction(vcpu, true);
> +}
> +
> +static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
> +{
> +        unsigned long rip, old_rip = kvm_rip_read(vcpu);
> +        struct vcpu_svm *svm = to_svm(vcpu);
> +
> +        /*
> +         * Due to architectural shortcomings, the CPU doesn't always provide
> +         * NextRIP, e.g. if KVM intercepted an exception that occurred while
> +         * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip
> +         * the instruction even if NextRIP is supported to acquire the next
> +         * RIP so that it can be shoved into the NextRIP field, otherwise
> +         * hardware will fail to advance guest RIP during event injection.
> +         * Drop the exception/interrupt if emulation fails and effectively
> +         * retry the instruction, it's the least awful option. If NRIPS is
> +         * in use, the skip must not commit any side effects such as clearing
> +         * the interrupt shadow or RFLAGS.RF.
> +         */
> +        if (!__svm_skip_emulated_instruction(vcpu, !nrips))
> +                return -EIO;
> +
> +        rip = kvm_rip_read(vcpu);
> +
> +        /*
> +         * Save the injection information, even when using next_rip, as the
> +         * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
> +         * doesn't complete due to a VM-Exit occurring while the CPU is
> +         * vectoring the event. Decoding the instruction isn't guaranteed to
> +         * work as there may be no backing instruction, e.g. if the event is
> +         * being injected by L1 for L2, or if the guest is patching INT3 into
> +         * a different instruction.
> +         */
> +        svm->soft_int_injected = true;
> +        svm->soft_int_csbase = svm->vmcb->save.cs.base;
> +        svm->soft_int_old_rip = old_rip;
> +        svm->soft_int_next_rip = rip;
> +
> +        if (nrips)
> +                kvm_rip_write(vcpu, old_rip);
> +
> +        if (static_cpu_has(X86_FEATURE_NRIPS))
> +                svm->vmcb->control.next_rip = rip;
> +
> +        return 0;
> +}
> +
>  static void svm_queue_exception(struct kvm_vcpu *vcpu)
>  {
>          struct vcpu_svm *svm = to_svm(vcpu);
> @@ -380,25 +439,9 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
>
>          kvm_deliver_exception_payload(vcpu);
>
> -        if (nr == BP_VECTOR && !nrips) {
> -                unsigned long rip, old_rip = kvm_rip_read(vcpu);
> -
> -                /*
> -                 * For guest debugging where we have to reinject #BP if some
> -                 * INT3 is guest-owned:
> -                 * Emulate nRIP by moving RIP forward. Will fail if injection
> -                 * raises a fault that is not intercepted. Still better than
> -                 * failing in all cases.
> -                 */
> -                (void)svm_skip_emulated_instruction(vcpu);
> -                rip = kvm_rip_read(vcpu);
> -
> -                if (boot_cpu_has(X86_FEATURE_NRIPS))
> -                        svm->vmcb->control.next_rip = rip;
> -
> -                svm->int3_rip = rip + svm->vmcb->save.cs.base;
> -                svm->int3_injected = rip - old_rip;
> -        }
> +        if (kvm_exception_is_soft(nr) &&
> +            svm_update_soft_interrupt_rip(vcpu))
> +                return;
>
>          svm->vmcb->control.event_inj = nr
>                  | SVM_EVTINJ_VALID
> @@ -3671,15 +3714,46 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
>          svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
>  }
>
> +static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
> +                                        int type)
> +{
> +        struct vcpu_svm *svm = to_svm(vcpu);
> +
> +        /*
> +         * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
> +         * associated with the original soft exception/interrupt. next_rip is
> +         * cleared on all exits that can occur while vectoring an event, so KVM
> +         * needs to manually set next_rip for re-injection. Unlike the !nrips
> +         * case below, this needs to be done if and only if KVM is re-injecting
> +         * the same event, i.e. if the event is a soft exception/interrupt,
> +         * otherwise next_rip is unused on VMRUN.
> +         */
> +        if (nrips && type == SVM_EXITINTINFO_TYPE_EXEPT &&
> +            kvm_exception_is_soft(vector) &&
> +            kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
> +                svm->vmcb->control.next_rip = svm->soft_int_next_rip;
> +        /*
> +         * If NRIPS isn't enabled, KVM must manually advance RIP prior to
> +         * injecting the soft exception/interrupt. That advancement needs to
> +         * be unwound if vectoring didn't complete. Note, the new event may
> +         * not be the injected event, e.g. if KVM injected an INTn, the INTn
> +         * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
> +         * be the reported vectored event, but RIP still needs to be unwound.
> +         */
> +        else if (!nrips && type == SVM_EXITINTINFO_TYPE_EXEPT &&
> +                 kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
> +                kvm_rip_write(vcpu, svm->soft_int_old_rip);
> +}
> +
>  static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
>  {
>          struct vcpu_svm *svm = to_svm(vcpu);
>          u8 vector;
>          int type;
>          u32 exitintinfo = svm->vmcb->control.exit_int_info;
> -        unsigned int3_injected = svm->int3_injected;
> +        bool soft_int_injected = svm->soft_int_injected;
>
> -        svm->int3_injected = 0;
> +        svm->soft_int_injected = false;
>
>          /*
>           * If we've made progress since setting HF_IRET_MASK, we've
> @@ -3704,17 +3778,8 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
>          vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
>          type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
>
> -        /*
> -         * If NextRIP isn't enabled, KVM must manually advance RIP prior to
> -         * injecting the soft exception/interrupt. That advancement needs to
> -         * be unwound if vectoring didn't complete. Note, the new event may
> -         * not be the injected event, e.g. if KVM injected an INTn, the INTn
> -         * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
> -         * be the reported vectored event, but RIP still needs to be unwound.
> -         */
> -        if (int3_injected && type == SVM_EXITINTINFO_TYPE_EXEPT &&
> -            kvm_is_linear_rip(vcpu, svm->int3_rip))
> -                kvm_rip_write(vcpu, kvm_rip_read(vcpu) - int3_injected);
> +        if (soft_int_injected)
> +                svm_complete_soft_interrupt(vcpu, vector, type);
>
>          switch (type) {
>          case SVM_EXITINTINFO_TYPE_NMI:
> @@ -3727,13 +3792,6 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
>                  if (vector == X86_TRAP_VC)
>                          break;
>
> -                /*
> -                 * In case of software exceptions, do not reinject the vector,
> -                 * but re-execute the instruction instead.
> -                 */
> -                if (kvm_exception_is_soft(vector))
> -                        break;
> -
>                  if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
>                          u32 err = svm->vmcb->control.exit_int_info_err;
>                          kvm_requeue_exception_e(vcpu, vector, err);
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 7d97e4d18c8b..6acb494e3598 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -230,8 +230,10 @@ struct vcpu_svm {
>          bool nmi_singlestep;
>          u64 nmi_singlestep_guest_rflags;
>
> -        unsigned int3_injected;
> -        unsigned long int3_rip;
> +        unsigned long soft_int_csbase;
> +        unsigned long soft_int_old_rip;
> +        unsigned long soft_int_next_rip;
> +        bool soft_int_injected;
>
>          /* optional nested SVM features that are enabled for this guest */
>          bool nrips_enabled : 1;


Overall this looks good, but I need to give it a bit more thought to be sure;
I'll wait for the next version.

Best regards,
Maxim Levitsky