[PATCH v3] KVM: x86/vmx: Suppress posted interrupt notification when CPU is in host

From: Chao Gao
Date: Wed Sep 28 2022 - 07:16:37 EST


PIN (Posted interrupt notification) is useless to host as KVM always syncs
pending guest interrupts in PID to guest's vAPIC before each VM entry. In
fact, Sending PINs to a CPU running in host will lead to additional
overhead due to interrupt handling.

Currently, software path, vmx_deliver_posted_interrupt(), is optimized to
issue PINs only if target vCPU is in IN_GUEST_MODE. But hardware paths
(VT-d and Intel IPI virtualization) aren't optimized.

Suppress notification unless notification is necessary (when vCPU is
blocked or running in guest). Specifically, set PID.SN when kvm sets
OUTSIDE_GUEST_MODE and vCPUs wake up from blocking, and clear PID.SN when
kvm sets IN_GUEST_MODE and vCPUs starts to block. Also, don't toggle PID.SN
at any other places, e.g., vmx_vcpu_pi_load().

To allow x86 common code to toggle SN bit when kvm sets vcpu->mode,
a new kvm_x86_ops is added.

When IPI virtualization is enabled, this patch increases "perf bench" [*]
by 8.10%, and PIN count in 1 second drops from tens of thousands to
tens. But cpuid loop test shows this patch causes 0.94% overhead in
VM-exit round-trip latency.

[*] test cmd: perf bench sched pipe -T. Note that we change the source
code to pin two threads to two different vCPUs so that it can reproduce
stable results.

Signed-off-by: Chao Gao <chao.gao@xxxxxxxxx>
---
Changes in v3:
- add a dedicated hook to toggle SN
- clear/set SN when kvm sets IN/OUTSIDE_GUEST_MODE
- use kvm_arch_vcpu_{un}blocking instead of vcpu_put/load to set/clear SN
for blocked vCPUs
- recollect perf data and update the figures in commit message

arch/x86/include/asm/kvm-x86-ops.h | 1 +
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/vmx/posted_intr.c | 63 +++++++++++++++++-------------
arch/x86/kvm/vmx/posted_intr.h | 1 +
arch/x86/kvm/vmx/vmx.c | 22 ++++++++++-
arch/x86/kvm/x86.c | 20 ++++++++++
6 files changed, 80 insertions(+), 28 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 51f777071584..a2d77302d7a8 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -86,6 +86,7 @@ KVM_X86_OP_OPTIONAL(set_virtual_apic_mode)
KVM_X86_OP_OPTIONAL(set_apic_access_page_addr)
KVM_X86_OP(deliver_interrupt)
KVM_X86_OP_OPTIONAL(sync_pir_to_irr)
+KVM_X86_OP_OPTIONAL(pi_suppress_notification)
KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2c96c43c313a..66d67b198021 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1543,6 +1543,7 @@ struct kvm_x86_ops {
void (*deliver_interrupt)(struct kvm_lapic *apic, int delivery_mode,
int trig_mode, int vector);
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
+ void (*pi_suppress_notification)(struct kvm_vcpu *vcpu, bool suppress);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
u8 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 1b56c5e5c9fb..ae1794a213d7 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -48,6 +48,38 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new)
return 0;
}

+/*
+ * Toggle Suppress Notification (SN) bit in PID.
+ *
+ * Note that after clearing SN, the check of PIR and setting ON bit
+ * ensure pending IRQs in PIR are not ignored.
+ */
+void vmx_pi_suppress_notification(struct kvm_vcpu *vcpu, bool suppress)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (suppress) {
+ pi_set_sn(pi_desc);
+ } else {
+ pi_clear_sn(pi_desc);
+ /*
+ * Clear SN before reading the bitmap. The VT-d firmware
+ * writes the bitmap and reads SN atomically (5.2.3 in the
+ * spec), so it doesn't really have a memory barrier that
+ * pairs with this, but we cannot do that and we need one.
+ */
+ smp_mb__after_atomic();
+
+ /*
+ * ON may be out of sync with PIR. Set ON if there are
+ * pending IRQs in PIR to ensure subsequent
+ * ->sync_pir_to_irr() won't leave the pending IRQs behind.
+ */
+ if (!pi_test_on(pi_desc) && !pi_is_pir_empty(pi_desc))
+ pi_set_on(pi_desc);
+ }
+}
+
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
@@ -69,15 +101,8 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
* full update can be skipped as neither the vector nor the destination
* needs to be changed.
*/
- if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
- /*
- * Clear SN if it was set due to being preempted. Again, do
- * this even if there is no assigned device for simplicity.
- */
- if (pi_test_and_clear_sn(pi_desc))
- goto after_clear_sn;
+ if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu)
return;
- }

local_irq_save(flags);

@@ -101,11 +126,10 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
new.control = old.control;

/*
- * Clear SN (as above) and refresh the destination APIC ID to
- * handle task migration (@cpu != vcpu->cpu).
+ * Refresh the destination APIC ID to handle task migration
+ * (@cpu != vcpu->cpu).
*/
new.ndst = dest;
- new.sn = 0;

/*
* Restore the notification vector; in the blocking case, the
@@ -115,19 +139,6 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
} while (pi_try_set_control(pi_desc, &old.control, new.control));

local_irq_restore(flags);
-
-after_clear_sn:
-
- /*
- * Clear SN before reading the bitmap. The VT-d firmware
- * writes the bitmap and reads SN atomically (5.2.3 in the
- * spec), so it doesn't really have a memory barrier that
- * pairs with this, but we cannot do that and we need one.
- */
- smp_mb__after_atomic();
-
- if (!pi_is_pir_empty(pi_desc))
- pi_set_on(pi_desc);
}

static bool vmx_can_use_vtd_pi(struct kvm *kvm)
@@ -193,8 +204,6 @@ static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu)

void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
if (!vmx_needs_pi_wakeup(vcpu))
return;

@@ -207,7 +216,7 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
* its wait state and manually scheduling out.
*/
if (vcpu->preempted)
- pi_set_sn(pi_desc);
+ vmx_pi_suppress_notification(vcpu, true);
}

/*
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 26992076552e..9232b87bf63e 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -102,5 +102,6 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
void vmx_pi_start_assignment(struct kvm *kvm);
+void vmx_pi_suppress_notification(struct kvm_vcpu *vcpu, bool suppress);

#endif /* __KVM_X86_VMX_POSTED_INTR_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c9b49a09e6b5..084894c50b64 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6697,6 +6697,16 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
vmx_set_rvi(max_irr);
}

+static void vmx_vcpu_blocking(struct kvm_vcpu *vcpu)
+{
+ vmx_pi_suppress_notification(vcpu, false);
+}
+
+static void vmx_vcpu_unblocking(struct kvm_vcpu *vcpu)
+{
+ vmx_pi_suppress_notification(vcpu, true);
+}
+
static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6706,7 +6716,14 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
return -EIO;

- if (pi_test_on(&vmx->pi_desc)) {
+ /*
+ * Check PIR if ON=1 || SN=1. This is required to avoid breaking
+ * halt-polling (and other call sites with SN=1). Notification
+ * during halt-polling is undesired. But VT-d engine and IPI
+ * virtualization don't set ON if SN=1. To ensure KVM can detect
+ * pending IRQs in time, check PIR as well if SN=1.
+ */
+ if (pi_test_on(&vmx->pi_desc) || pi_test_sn(&vmx->pi_desc)) {
pi_clear_on(&vmx->pi_desc);
/*
* IOMMU can write to PID.ON, so the barrier matters even on UP.
@@ -8030,6 +8047,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.prepare_switch_to_guest = vmx_prepare_switch_to_guest,
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
+ .vcpu_blocking = vmx_vcpu_blocking,
+ .vcpu_unblocking = vmx_vcpu_unblocking,

.update_exception_bitmap = vmx_update_exception_bitmap,
.get_msr_feature = vmx_get_msr_feature,
@@ -8089,6 +8108,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.hwapic_isr_update = vmx_hwapic_isr_update,
.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
+ .pi_suppress_notification = vmx_pi_suppress_notification,
.deliver_interrupt = vmx_deliver_interrupt,
.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d7374d768296..ce744e840423 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10426,6 +10426,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)

/* Store vcpu->apicv_active before vcpu->mode. */
smp_store_release(&vcpu->mode, IN_GUEST_MODE);
+ /*
+ * Notification are necessary to trigger the delivery of posted
+ * IRQs when vCPU is running in guest. Enable notification before
+ * guest entry.
+ *
+ * Do this even if apicv is disabled for simplicity.
+ */
+ if (kvm_lapic_enabled(vcpu))
+ static_call_cond(kvm_x86_pi_suppress_notification)(vcpu, false);

kvm_vcpu_srcu_read_unlock(vcpu);

@@ -10538,6 +10547,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)

vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
+ /*
+ * Suppress notification when CPU is OUTSIDE_GUEST_MODE to avoid
+ * wasting time on handling interrupts. A notification to host/kvm
+ * just indicates some interrupts are posted for a vCPU. Since KVM
+ * always syncs pending interrupts in PIR to vAPIC IRR before guest
+ * entry (in ->sync_pir_to_irr()), notification isn't needed.
+ *
+ * Do this even if apicv is disabled for simplicity.
+ */
+ if (kvm_lapic_enabled(vcpu))
+ static_call_cond(kvm_x86_pi_suppress_notification)(vcpu, true);

/*
* Sync xfd before calling handle_exit_irqoff() which may

base-commit: 372d07084593dc7a399bf9bee815711b1fb1bcf2
--
2.25.1