RE: [PATCH v2 5/5] KVM: nVMX: Enable nested posted interrupt processing.

From: Zhang, Yang Z
Date: Wed Jan 21 2015 - 03:10:01 EST


Wincy Van wrote on 2015-01-20:
> If a vcpu has an interrupt pending while in VMX non-root mode, we kick
> that vcpu to inject the interrupt in a timely manner. With posted
> interrupt processing, the kick is not needed and interrupts are fully
> taken care of by hardware.
>
> In nested vmx, this feature avoids far more vmexits than it does in
> non-nested vmx.
>
> This patch uses L0's POSTED_INTR_NV to avoid an unexpected interrupt if
> L1's vector is different from L0's. If the vcpu is in hardware non-root
> mode, we use a physical IPI to deliver posted interrupts; otherwise we
> deliver the interrupt to L1 and kick that vcpu out of nested non-root mode.
>
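For reference, the delivery path described above reduces to the usual posted-interrupt sequence: set the vector bit in the descriptor's 256-bit PIR, set the outstanding-notification (ON) bit, and send the physical notification IPI so the CPU injects the interrupt while staying in non-root mode. The following is a simplified, standalone sketch under illustrative names (the abbreviated struct pi_desc, send_notification_ipi() and test_and_set_bit32() are stand-ins), not the actual KVM code:

#include <stdbool.h>
#include <stdint.h>

/* Abbreviated posted-interrupt descriptor: 256-bit PIR plus a control word. */
struct pi_desc {
	uint32_t pir[8];   /* posted-interrupt requests, one bit per vector */
	uint32_t control;  /* bit 0: outstanding notification (ON) */
};

/* Stand-in for sending the physical notification IPI to the target CPU. */
static void send_notification_ipi(int cpu, uint8_t notification_vector)
{
	(void)cpu;
	(void)notification_vector;
}

/* Atomically set a bit and report whether it was already set. */
static bool test_and_set_bit32(uint32_t *word, unsigned int bit)
{
	uint32_t mask = 1u << bit;
	return __atomic_fetch_or(word, mask, __ATOMIC_SEQ_CST) & mask;
}

static void deliver_posted_interrupt(struct pi_desc *pi, int cpu,
				     uint8_t vector, uint8_t notification_vector)
{
	/* Record the pending vector in the PIR. */
	test_and_set_bit32(&pi->pir[vector / 32], vector % 32);

	/*
	 * Notify only if ON was not already set; a CPU running in non-root
	 * mode then injects the interrupt without a VM exit.
	 */
	if (!test_and_set_bit32(&pi->control, 0))
		send_notification_ipi(cpu, notification_vector);
}

When the target vcpu is not running in non-root mode the notification IPI
cannot help, which is why the patch falls back to injecting the interrupt
into L1 and kicking the vcpu instead.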
> Signed-off-by: Wincy Van <fanwenyi0529@xxxxxxxxx>
> ---
>  arch/x86/kvm/vmx.c |  136 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 132 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index ea56e9f..cda9133 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -215,6 +215,7 @@ struct __packed vmcs12 {
>  	u64 tsc_offset;
>  	u64 virtual_apic_page_addr;
>  	u64 apic_access_addr;
> +	u64 posted_intr_desc_addr;
>  	u64 ept_pointer;
>  	u64 eoi_exit_bitmap0;
>  	u64 eoi_exit_bitmap1;
> @@ -334,6 +335,7 @@ struct __packed vmcs12 {
>  	u32 vmx_preemption_timer_value;
>  	u32 padding32[7]; /* room for future expansion */
>  	u16 virtual_processor_id;
> +	u16 posted_intr_nv;
>  	u16 guest_es_selector;
>  	u16 guest_cs_selector;
>  	u16 guest_ss_selector;
> @@ -387,6 +389,7 @@ struct nested_vmx {
>  	/* The host-usable pointer to the above */
>  	struct page *current_vmcs12_page;
>  	struct vmcs12 *current_vmcs12;
> +	spinlock_t vmcs12_lock;
>  	struct vmcs *current_shadow_vmcs;
>  	/*
>  	 * Indicates if the shadow vmcs must be updated with the
> @@ -406,6 +409,8 @@ struct nested_vmx {
>  	 */
>  	struct page *apic_access_page;
>  	struct page *virtual_apic_page;
> +	struct page *pi_desc_page;
> +	struct pi_desc *pi_desc;
>  	u64 msr_ia32_feature_control;
>
>  	struct hrtimer preemption_timer;
> @@ -621,6 +626,7 @@ static int max_shadow_read_write_fields =
>
>  static const unsigned short vmcs_field_to_offset_table[] = {
>  	FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
> +	FIELD(POSTED_INTR_NV, posted_intr_nv),
>  	FIELD(GUEST_ES_SELECTOR, guest_es_selector),
>  	FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
>  	FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
> @@ -646,6 +652,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
>  	FIELD64(TSC_OFFSET, tsc_offset),
>  	FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
>  	FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
> +	FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
>  	FIELD64(EPT_POINTER, ept_pointer),
>  	FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
>  	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
> @@ -798,6 +805,7 @@ static void kvm_cpu_vmxon(u64 addr);
>  static void kvm_cpu_vmxoff(void);
>  static bool vmx_mpx_supported(void);
>  static bool vmx_xsaves_supported(void);
> +static int vmx_vm_has_apicv(struct kvm *kvm);
>  static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
>  static void vmx_set_segment(struct kvm_vcpu *vcpu,
>  			    struct kvm_segment *var, int seg);
> @@ -1159,6 +1167,11 @@ static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
>  	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
>  }
>
> +static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
> +{
> +	return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
> +}
> +
>  static inline bool is_exception(u32 intr_info)
>  {
>  	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
> @@ -2362,6 +2375,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>  	vmx->nested.nested_vmx_pinbased_ctls_high |=
>  		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
>  		PIN_BASED_VMX_PREEMPTION_TIMER;
> +	if (vmx_vm_has_apicv(vmx->vcpu.kvm))
> +		vmx->nested.nested_vmx_pinbased_ctls_high |=
> +			PIN_BASED_POSTED_INTR;
>
>  	/* exit controls */
>  	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
> @@ -4267,6 +4283,46 @@ static int vmx_vm_has_apicv(struct kvm *kvm)
>  	return enable_apicv && irqchip_in_kernel(kvm);
>  }
>
> +static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
> +					       int vector)
> +{
> +	int r = 0;
> +	struct vmcs12 *vmcs12;
> +
> +	/*
> +	 * Since posted intr delivery is async,
> +	 * we must aquire a spin-lock to avoid
> +	 * the race of vmcs12.
> +	 */
> +	spin_lock(&to_vmx(vcpu)->nested.vmcs12_lock);
> +	vmcs12 = get_vmcs12(vcpu);
> +	if (!is_guest_mode(vcpu) || !vmcs12) {
> +		r = -1;
> +		goto out;
> +	}
> +	if (vector == vmcs12->posted_intr_nv &&
> +	    nested_cpu_has_posted_intr(vmcs12)) {
> +		if (vcpu->mode == IN_GUEST_MODE)
> +			apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
> +				POSTED_INTR_VECTOR);
> +		else {
> +			r = -1;
> +			goto out;
> +		}
> +
> +		/*
> +		 * if posted intr is done by hardware, the
> +		 * corresponding eoi was sent to L0. Thus
> +		 * we should send eoi to L1 manually.
> +		 */
> +		kvm_apic_set_eoi_accelerated(vcpu,
> +			vmcs12->posted_intr_nv);

Why is this necessary? As your comment says, it is done by hardware, not by L1, so why should L1 be aware of it?

Best regards,
Yang