Re: [PATCH] KVM: SVM: Update destination when updating pi irte

From: Sean Christopherson
Date: Thu Jun 29 2023 - 18:35:41 EST


On Thu, May 18, 2023, Joao Martins wrote:
>
> On 18/05/2023 09:19, Maxim Levitsky wrote:
> > I think that we do need to a flag indicating if the vCPU is currently
> > running and if yes, then use svm->vcpu.cpu (or put -1 to it when it not
> > running or something) (currently the vcpu->cpu remains set when vCPU is
> > put)
> >
> > In other words if a vCPU is running, then avic_pi_update_irte should put
> > correct pCPU number, and if it raced with vCPU put/load, then later should
> > win and put the correct value. This can be done either with a lock or
> > barriers.
> >
> If this could be done, it could remove cost from other places and avoid this
> little dance of the galog (and avoid its usage as it's not the greatest design
> aspect of the IOMMU). We anyways already need to do IRT flushes in all these
> things with regards to updating any piece of the IRTE, but we need some care
> there two to avoid invalidating too much (which is just as expensive and per-VCPU).

...

> But still quite expensive (as many IPIs as vCPUs updated), but it works as
> intended and guest will immediately see the right vcpu affinity. But I honestly
> prefer going towards your suggestion (via vcpu.pcpu) if we can have some
> insurance that vcpu.cpu is safe to use in pi_update_irte if protected against
> preemption/blocking of the VCPU.

I think we have all the necessary info, and even a handy dandy spinlock to ensure
ordering. Disclaimers: compile tested only, I know almost nothing about the IOMMU
side of things, and I don't know if I understood the needs for the !IsRunning cases.

Disclaimers aside, this should point the IOMMU at the right pCPU when the target
vCPU changes and the new vCPU is actively running.

---
arch/x86/kvm/svm/avic.c | 44 +++++++++++++++++++++++++++++++++--------
1 file changed, 36 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index cfc8ab773025..703ad9af73eb 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -791,6 +791,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
int ret = 0;
unsigned long flags;
struct amd_svm_iommu_ir *ir;
+ u64 entry;

/**
* In some cases, the existing irte is updated and re-set,
@@ -824,6 +825,18 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
ir->data = pi->ir_data;

spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ /*
+ * Update the target pCPU for IOMMU doorbells if the vCPU is running.
+ * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM
+ * will update the pCPU info when the vCPU awkened and/or scheduled in.
+ * See also avic_vcpu_load().
+ */
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+ amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
+ true, pi->ir_data);
+
list_add(&ir->node, &svm->ir_list);
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
out:
@@ -986,10 +999,11 @@ static inline int
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
{
int ret = 0;
- unsigned long flags;
struct amd_svm_iommu_ir *ir;
struct vcpu_svm *svm = to_svm(vcpu);

+ lockdep_assert_held(&svm->ir_list_lock);
+
if (!kvm_arch_has_assigned_device(vcpu->kvm))
return 0;

@@ -997,19 +1011,15 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
* Here, we go through the per-vcpu ir_list to update all existing
* interrupt remapping table entry targeting this vcpu.
*/
- spin_lock_irqsave(&svm->ir_list_lock, flags);
-
if (list_empty(&svm->ir_list))
- goto out;
+ return 0;

list_for_each_entry(ir, &svm->ir_list, node) {
ret = amd_iommu_update_ga(cpu, r, ir->data);
if (ret)
- break;
+ return ret;
}
-out:
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
- return ret;
+ return 0;
}

void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1017,6 +1027,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
u64 entry;
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long flags;

lockdep_assert_preemption_disabled();

@@ -1033,6 +1044,15 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (kvm_vcpu_is_blocking(vcpu))
return;

+ /*
+ * Grab the per-vCPU interrupt remapping lock even if the VM doesn't
+ * _currently_ have assigned devices, as that can change. Holding
+ * ir_list_lock ensures that either svm_ir_list_add() will consume
+ * up-to-date entry information, or that this task will wait until
+ * svm_ir_list_add() completes to set the new target pCPU.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
entry = READ_ONCE(*(svm->avic_physical_id_cache));
WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);

@@ -1042,12 +1062,15 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)

WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
+
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}

void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
u64 entry;
struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long flags;

lockdep_assert_preemption_disabled();

@@ -1057,10 +1080,15 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
return;

+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
avic_update_iommu_vcpu_affinity(vcpu, -1, 0);

entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+
}

void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)

base-commit: 88bb466c9dec4f70d682cf38c685324e7b1b3d60
--