[RFC] KVM: arm/arm64: optimize vSGI injection performance

From: Xu Zhao
Date: Fri Aug 18 2023 - 06:48:37 EST


In the worst case, vgic_v3_dispatch_sgi() iterates over every vCPU in the VM to inject a single
SGI. However, the ICC_SGI1R_EL1 register already carries affinity routing information, and this
series explores using it to reduce the number of iterations from the total number of vCPUs to at
most 16 (the length of the TargetList), or even 8; a decoding sketch follows below.
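For reference, below is a small standalone sketch (illustration only, not from the kernel tree) of
how the routing fields of an ICC_SGI1R_EL1 write map to at most 16 Aff0 targets. The field offsets
follow the GICv3 architecture specification; the register value and the printed output are made up
for the example.

/*
 * Illustration only: decode the routing fields of an ICC_SGI1R_EL1 write
 * and print its (Aff3.Aff2.Aff1.Aff0) targets. When the IRM bit (bit 40)
 * is set the SGI is a broadcast and these fields are ignored.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Example write: SGI 5, Aff1 = 2, RS = 1, TargetList = 0b0101 */
	uint64_t reg = (5ULL << 24) | (2ULL << 16) | (1ULL << 44) | 0x5;

	uint16_t target_list = reg & 0xffff;	/* bits [15:0]  */
	int aff1 = (reg >> 16) & 0xff;		/* bits [23:16] */
	int aff2 = (reg >> 32) & 0xff;		/* bits [39:32] */
	int rs   = (reg >> 44) & 0xf;		/* bits [47:44] */
	int aff3 = (reg >> 48) & 0xff;		/* bits [55:48] */
	int n;

	/* TargetList bit n addresses the PE with Aff0 == RS * 16 + n */
	for (n = 0; n < 16; n++)
		if (target_list & (1u << n))
			printf("SGI target: %d.%d.%d.%d\n",
			       aff3, aff2, aff1, rs * 16 + n);
	return 0;
}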

This work is based on v5.4; here is the test data:
4 cores with vcpu pinning:
|                | ipi benchmark                          | vgic_v3_dispatch_sgi            |
|                | original     | with patch   | improved | original | with patch | improved |
| core0 -> core1 | 292610285 ns | 299856696 ns | -2.5%    | 1471 ns  | 1508 ns    | -2.5%    |
| core0 -> core3 | 333815742 ns | 327647989 ns | +1.8%    | 1578 ns  | 1532 ns    | +2.9%    |
| core0 -> all   | 439754104 ns | 433987192 ns | +1.3%    | 2970 ns  | 2875 ns    | +3.2%    |

32 cores with vcpu pinning:
|                 | ipi benchmark                            | vgic_v3_dispatch_sgi             |
|                 | original      | with patch    | improved | original | with patch | improved |
| core0 -> core1  | 269153219 ns  | 261636906 ns  | +2.8%    | 1743 ns  | 1706 ns    | +2.1%    |
| core0 -> core31 | 685199666 ns  | 355250664 ns  | +48.2%   | 4238 ns  | 1838 ns    | +56.6%   |
| core0 -> all    | 7281278980 ns | 3403240773 ns | +53.3%   | 30879 ns | 13843 ns   | +55.2%   |

Based on the test results, performance for a VM with fewer than 16 vCPUs remains essentially
unchanged, while a significant improvement is observed with more than 16 vCPUs.
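To show where the "16, or even 8" bound above comes from, here is a standalone sketch
(illustration only, not part of the patch) of the loop bounds used by the proposed
kvm_for_each_target_vcpus() macro: if the low byte of the 16-bit TargetList is empty the walk
starts at bit 8, and it stops once it has passed the highest set bit. The mask value is made up
for the example.

/*
 * Illustration only: the loop bounds behind kvm_for_each_target_vcpus().
 * With the low byte of the TargetList empty, the walk starts at bit 8,
 * and it stops once (1 << idx) exceeds the mask, so a unicast SGI never
 * needs more than 16 (often 8 or fewer) iterations.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t target_cpus = 0x4100;	/* bits 8 and 14 set, low byte empty */
	int idx, iterations = 0;

	for (idx = (target_cpus & 0xff) ? 0 : 8;
	     (1 << idx) <= target_cpus;
	     idx++) {
		iterations++;
		if (target_cpus & (1 << idx))
			printf("signal TargetList entry %d\n", idx);
	}
	printf("iterations: %d\n", iterations);	/* 7 for this example mask */
	return 0;
}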

Signed-off-by: Xu Zhao <zhaoxu.35@xxxxxxxxxxxxx>
---
include/linux/kvm_host.h | 5 ++
virt/kvm/arm/vgic/vgic-mmio-v3.c | 136 +++++++++++++++----------------
2 files changed, 73 insertions(+), 68 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 027e155daf8c..efc7b96946c1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -580,6 +580,11 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
(vcpup = kvm_get_vcpu(kvm, idx)) != NULL; \
idx++)

+#define kvm_for_each_target_vcpus(idx, target_cpus) \
+ for (idx = (target_cpus & 0xff) ? 0 : (ICC_SGI1R_AFFINITY_1_SHIFT >> 1); \
+ (1 << idx) <= target_cpus; \
+ idx++)
+
static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id)
{
struct kvm_vcpu *vcpu = NULL;
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index 24011280c4b1..bb64148ab75b 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -852,44 +852,60 @@ int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)

return 0;
}
+
/*
- * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI
- * generation register ICC_SGI1R_EL1) with a given VCPU.
- * If the VCPU's MPIDR matches, return the level0 affinity, otherwise
- * return -1.
+ * Get the affinity routing index from an ICC_SGI1R_EL1 value.
+ * Format:
+ * |   aff3   |   aff2   |   aff1   |        aff0        |
+ * |- 8 bits -|- 8 bits -|- 8 bits -|- 4 bits or 8 bits -|
*/
-static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
+static u64 sgi_to_affinity(unsigned long reg)
{
- unsigned long affinity;
- int level0;
+ u64 aff;

- /*
- * Split the current VCPU's MPIDR into affinity level 0 and the
- * rest as this is what we have to compare against.
- */
- affinity = kvm_vcpu_get_mpidr_aff(vcpu);
- level0 = MPIDR_AFFINITY_LEVEL(affinity, 0);
- affinity &= ~MPIDR_LEVEL_MASK;
+ /* aff3 - aff1 */
+ aff = (((reg) & ICC_SGI1R_AFFINITY_3_MASK) >> ICC_SGI1R_AFFINITY_3_SHIFT) << 16 |
+ (((reg) & ICC_SGI1R_AFFINITY_2_MASK) >> ICC_SGI1R_AFFINITY_2_SHIFT) << 8 |
+ (((reg) & ICC_SGI1R_AFFINITY_1_MASK) >> ICC_SGI1R_AFFINITY_1_SHIFT);

- /* bail out if the upper three levels don't match */
- if (sgi_aff != affinity)
- return -1;
+ /* if the range selector is used, TargetList[n] represents aff0 value ((RS * 16) + n) */
+ if ((reg) & ICC_SGI1R_RS_MASK) {
+ aff <<= 4;
+ aff |= ((reg) & ICC_SGI1R_RS_MASK) >> ICC_SGI1R_RS_SHIFT;
+ }

- /* Is this VCPU's bit set in the mask ? */
- if (!(sgi_cpu_mask & BIT(level0)))
- return -1;
+ /* aff0: the TargetList in the SGI register has 16 entries, i.e. 4 bits */
+ aff <<= 4;

- return level0;
+ return aff;
}

/*
- * The ICC_SGI* registers encode the affinity differently from the MPIDR,
- * so provide a wrapper to use the existing defines to isolate a certain
- * affinity level.
+ * Inject an SGI into a single vCPU
*/
-#define SGI_AFFINITY_LEVEL(reg, level) \
- ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
- >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
+static inline void vgic_v3_inject_sgi(struct kvm_vcpu *vcpu, int sgi, bool allow_group1)
+{
+ struct vgic_irq *irq;
+ unsigned long flags;
+
+ irq = vgic_get_irq(vcpu->kvm, vcpu, sgi);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+ /*
+ * An access targeting Group0 SGIs can only generate
+ * those, while an access targeting Group1 SGIs can
+ * generate interrupts of either group.
+ */
+ if (!irq->group || allow_group1) {
+ irq->pending_latch = true;
+ vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+ } else {
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ }
+
+ vgic_put_irq(vcpu->kvm, irq);
+}

/**
* vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
@@ -910,64 +926,48 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1)
struct kvm *kvm = vcpu->kvm;
struct kvm_vcpu *c_vcpu;
u16 target_cpus;
- u64 mpidr;
int sgi, c;
int vcpu_id = vcpu->vcpu_id;
bool broadcast;
- unsigned long flags;
+ u64 aff_index;

sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
- mpidr = SGI_AFFINITY_LEVEL(reg, 3);
- mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
- mpidr |= SGI_AFFINITY_LEVEL(reg, 1);

/*
- * We iterate over all VCPUs to find the MPIDRs matching the request.
- * If we have handled one CPU, we clear its bit to detect early
- * if we are already finished. This avoids iterating through all
- * VCPUs when most of the times we just signal a single VCPU.
+ * Writing the IRM bit is infrequent, so split SGI injection into two paths.
+ * If the request is not a broadcast, compute the affinity routing index
+ * first, then walk the TargetList to find the target VCPUs.
+ * Otherwise, inject the SGI into all VCPUs except the calling one.
*/
- kvm_for_each_vcpu(c, c_vcpu, kvm) {
- struct vgic_irq *irq;
-
- /* Exit early if we have dealt with all requested CPUs */
- if (!broadcast && target_cpus == 0)
- break;
-
- /* Don't signal the calling VCPU */
- if (broadcast && c == vcpu_id)
- continue;
+ if (likely(!broadcast)) {
+ /* compute affinity routing index */
+ aff_index = sgi_to_affinity(reg);

- if (!broadcast) {
- int level0;
+ /* Bail out if the affinity value is out of range */
+ if (aff_index >= atomic_read(&kvm->online_vcpus))
+ return;

- level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
- if (level0 == -1)
+ /* Iterate target list */
+ kvm_for_each_target_vcpus(c, target_cpus) {
+ if (!(target_cpus & (1 << c)))
continue;

- /* remove this matching VCPU from the mask */
- target_cpus &= ~BIT(level0);
- }
-
- irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi);
-
- raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ c_vcpu = kvm_get_vcpu_by_id(kvm, aff_index + c);
+ if (!c_vcpu)
+ break;

- /*
- * An access targetting Group0 SGIs can only generate
- * those, while an access targetting Group1 SGIs can
- * generate interrupts of either group.
- */
- if (!irq->group || allow_group1) {
- irq->pending_latch = true;
- vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
- } else {
- raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_v3_inject_sgi(c_vcpu, sgi, allow_group1);
}
+ } else {
+ kvm_for_each_vcpu(c, c_vcpu, kvm) {
+ /* Don't signal the calling VCPU */
+ if (c_vcpu->vcpu_id == vcpu_id)
+ continue;

- vgic_put_irq(vcpu->kvm, irq);
+ vgic_v3_inject_sgi(c_vcpu, sgi, allow_group1);
+ }
}
}

--
2.20.1