[PATCH v3 4/4] x86: KVM: SVM: allow optionally to disable AVIC's IPI virtualization

From: Maxim Levitsky
Date: Mon Oct 02 2023 - 08:03:59 EST


On Zen2 (and likely on Zen1 as well), AVIC doesn't reliably detect a change
in the 'is_running' bit during ICR write emulation and might skip a
VM exit if that bit was recently cleared.

The absence of the VM exit leads to KVM not waking up / not triggering a
nested VM exit on the target(s) of the IPI, which can, in some cases,
lead to unbounded delays in guest execution.

As I recently discovered, a reasonable workaround exists: make KVM
never set the is_running bit, which in essence disables the
IPI virtualization portion of AVIC, making it equivalent to APICv without
IPI virtualization.

This workaround ensures that (*) all ICR writes always cause a VM exit
and are therefore correctly emulated, at the expense of never enjoying
VM-exit-less ICR write emulation.

To let the user control the workaround, add a new kvm_amd module parameter,
'enable_ipiv', using the same name as VMX's IPI virtualization parameter.

However, unlike on VMX, this parameter is tri-state: 0, 1, -1.
-1 is the default value and instructs KVM to choose the behavior based
on the CPU model.

(*) More precisely, all ICR writes except those using the 'Self'
destination shorthand:

In that case AVIC skips reading the physical ID table and just sets bits
in the local APIC's IRR. Thankfully, the erratum cannot trigger in this
case, so no extra workaround is needed.
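
For reference, a 'Self' IPI from the guest's point of view is simply an
ICR write with the destination-shorthand field (bits 19:18) set to 01b.
A minimal guest-side sketch in x2APIC mode, purely for illustration and
not part of this patch, might look like this:

	/* Illustrative only: a guest-side "Self" IPI in x2APIC mode.
	 * The ICR is MSR 0x830; destination shorthand 01b means "Self",
	 * so AVIC can deliver it without consulting the physical ID table.
	 */
	#include <stdint.h>

	#define X2APIC_ICR_MSR			0x830
	#define ICR_DEST_SHORTHAND_SELF		(1ull << 18)	/* 01b in bits 19:18 */

	static inline void wrmsr(uint32_t msr, uint64_t val)
	{
		asm volatile("wrmsr" :: "c"(msr),
			     "a"((uint32_t)val), "d"((uint32_t)(val >> 32)));
	}

	static void send_self_ipi(uint8_t vector)
	{
		/* Fixed delivery mode (000b), shorthand = Self. */
		wrmsr(X2APIC_ICR_MSR, ICR_DEST_SHORTHAND_SELF | vector);
	}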

Signed-off-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx>
---
arch/x86/kvm/svm/avic.c | 51 +++++++++++++++++++++++++++++++----------
1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index bdab28005ad3405..b3ec693083cc883 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -62,6 +62,9 @@ static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);
static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);

+static int enable_ipiv = -1;
+module_param(enable_ipiv, int, 0444);
+
/* Note:
* This hash table is used to map VM_ID to a struct kvm_svm,
* when handling AMD IOMMU GALOG notification to schedule in
@@ -1024,7 +1027,6 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)

void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- u64 entry;
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long flags;
@@ -1053,14 +1055,22 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
*/
spin_lock_irqsave(&svm->ir_list_lock, flags);

- entry = READ_ONCE(*(svm->avic_physical_id_cache));
- WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+ /*
+ * Do not update the actual physical id table entry if the IPI
+ * virtualization portion of AVIC is not enabled.
+ * In this case all ICR writes except Self IPIs will be intercepted.
+ */
+
+ if (enable_ipiv) {
+ u64 entry = READ_ONCE(*svm->avic_physical_id_cache);

- entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
- entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
- entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
+ entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ }

- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);

spin_unlock_irqrestore(&svm->ir_list_lock, flags);
@@ -1068,7 +1078,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)

void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
- u64 entry;
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long flags;

@@ -1093,11 +1102,17 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)

avic_update_iommu_vcpu_affinity(vcpu, -1, 0);

- entry = READ_ONCE(*(svm->avic_physical_id_cache));
- WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
+ /*
+ * Do not update the actual physical id table entry if the IPI
+ * virtualization is disabled. See explanation in avic_vcpu_load().
+ */
+ if (enable_ipiv) {
+ u64 entry = READ_ONCE(*svm->avic_physical_id_cache);

- entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ }

spin_unlock_irqrestore(&svm->ir_list_lock, flags);

@@ -1211,5 +1226,17 @@ bool avic_hardware_setup(void)

amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);

+ if (enable_ipiv == -1) {
+ enable_ipiv = 1;
+ /* Assume that Zen1 and Zen2 have errata #1235 */
+ if (boot_cpu_data.x86 == 0x17) {
+ pr_info("AVIC's IPI virtualization disabled due to errata #1235\n");
+ enable_ipiv = 0;
+ }
+ }
+
+ if (enable_ipiv)
+ pr_info("AVIC's IPI virtualization enabled\n");
+
return true;
}
--
2.26.3