[PATCH 05/13] KVM: Update IRTE according to guest interrupt configuration changes

From: Feng Wu
Date: Mon Nov 10 2014 - 01:36:39 EST


When guest changes its interrupt configuration (such as, vector, etc.)
for direct-assigned devices, we need to update the associated IRTE
with the new guest vector, so external interrupts from the assigned
devices can be injected to guests without VM-Exit.

The current method of handling guest lowest priority interrtups
is to use a counter 'apic_arb_prio' for each VCPU, we choose the
VCPU with smallest 'apic_arb_prio' and then increase it by 1.
However, for VT-d PI, we cannot re-use this, since we no longer
have control to 'apic_arb_prio' with posted interrupt direct
delivery by Hardware.

Here, we introduce a similiar way with 'apic_arb_prio' to handle
guest lowest priority interrtups when VT-d PI is used. Here is the
ideas:
- Each VCPU has a counter 'round_robin_counter'.
- When guests sets an interrupts to lowest priority, we choose
the VCPU with smallest 'round_robin_counter' as the destination,
then increase it.

Signed-off-by: Feng Wu <feng.wu@xxxxxxxxx>
---
arch/x86/include/asm/irq_remapping.h | 6 ++
arch/x86/include/asm/kvm_host.h | 2 +
arch/x86/kvm/vmx.c | 12 +++
arch/x86/kvm/x86.c | 11 +++
drivers/iommu/amd_iommu.c | 6 ++
drivers/iommu/intel_irq_remapping.c | 28 +++++++
drivers/iommu/irq_remapping.c | 9 ++
drivers/iommu/irq_remapping.h | 3 +
include/linux/dmar.h | 26 ++++++
include/linux/kvm_host.h | 22 +++++
include/uapi/linux/kvm.h | 1 +
virt/kvm/assigned-dev.c | 141 ++++++++++++++++++++++++++++++++++
virt/kvm/irq_comm.c | 4 +-
virt/kvm/irqchip.c | 11 ---
14 files changed, 269 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index a3cc437..32d6cc4 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -51,6 +51,7 @@ extern void compose_remapped_msi_msg(struct pci_dev *pdev,
unsigned int irq, unsigned int dest,
struct msi_msg *msg, u8 hpet_id);
extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id);
+extern int update_pi_irte(unsigned int irq, u64 pi_desc_addr, u32 vector);
extern void panic_if_irq_remap(const char *msg);
extern bool setup_remapped_irq(int irq,
struct irq_cfg *cfg,
@@ -88,6 +89,11 @@ static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id)
return -ENODEV;
}

+static inline int update_pi_irte(unsigned int irq, u64 pi_desc_addr, u32 vector)
+{
+ return -ENODEV;
+}
+
static inline void panic_if_irq_remap(const char *msg)
{
}
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6ed0c30..0630161 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -358,6 +358,7 @@ struct kvm_vcpu_arch {
struct kvm_lapic *apic; /* kernel irqchip context */
unsigned long apic_attention;
int32_t apic_arb_prio;
+ int32_t round_robin_counter;
int mp_state;
u64 ia32_misc_enable_msr;
bool tpr_access_reporting;
@@ -771,6 +772,7 @@ struct kvm_x86_ops {
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);

void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
+ u64 (*get_pi_desc_addr)(struct kvm_vcpu *vcpu);
};

struct kvm_arch_async_pf {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a4670d3..ae91b72 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -544,6 +544,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
return container_of(vcpu, struct vcpu_vmx, vcpu);
}

+struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+{
+ return &(to_vmx(vcpu)->pi_desc);
+}
+
#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
#define FIELD(number, name) [number] = VMCS12_OFFSET(name)
#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
@@ -4280,6 +4285,11 @@ static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu)
return;
}

+static u64 vmx_get_pi_desc_addr(struct kvm_vcpu *vcpu)
+{
+ return __pa((u64)vcpu_to_pi_desc(vcpu));
+}
+
/*
* Set up the vmcs's constant host-state fields, i.e., host-state fields that
* will not change in the lifetime of the guest.
@@ -9232,6 +9242,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.check_nested_events = vmx_check_nested_events,

.sched_in = vmx_sched_in,
+
+ .get_pi_desc_addr = vmx_get_pi_desc_addr,
};

static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b447a98..0c19d15 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7735,6 +7735,17 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);

+int kvm_update_pi_irte_common(struct kvm *kvm, struct kvm_vcpu *vcpu,
+ u32 guest_vector, int host_irq)
+{
+ u64 pi_desc_addr = kvm_x86_ops->get_pi_desc_addr(vcpu);
+
+ if (update_pi_irte(host_irq, pi_desc_addr, guest_vector))
+ return -1;
+
+ return 0;
+}
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 505a9ad..a36fdc7 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -4280,6 +4280,11 @@ static int alloc_hpet_msi(unsigned int irq, unsigned int id)
return 0;
}

+static int dummy_update_pi_irte(int irq, u64 pi_desc_addr, u32 vector)
+{
+ return -EINVAL;
+}
+
struct irq_remap_ops amd_iommu_irq_ops = {
.supported = amd_iommu_supported,
.prepare = amd_iommu_prepare,
@@ -4294,5 +4299,6 @@ struct irq_remap_ops amd_iommu_irq_ops = {
.msi_alloc_irq = msi_alloc_irq,
.msi_setup_irq = msi_setup_irq,
.alloc_hpet_msi = alloc_hpet_msi,
+ .update_pi_irte = dummy_update_pi_irte,
};
#endif
diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c
index 776da10..87c02fe 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -1172,6 +1172,33 @@ static int intel_alloc_hpet_msi(unsigned int irq, unsigned int id)
return ret;
}

+static int intel_update_pi_irte(int irq, u64 pi_desc_addr, u32 vector)
+{
+ struct irte irte;
+
+ if (get_irte(irq, &irte))
+ return -1;
+
+ irte.irq_post_low.urg = 0;
+ irte.irq_post_low.vector = vector;
+ irte.irq_post_low.pda_l = (pi_desc_addr >> (32 - PDA_LOW_BIT)) &
+ ~(-1UL << PDA_LOW_BIT);
+ irte.irq_post_high.pda_h = (pi_desc_addr >> 32) &
+ ~(-1UL << PDA_HIGH_BIT);
+
+ irte.irq_post_low.__reserved_1 = 0;
+ irte.irq_post_low.__reserved_2 = 0;
+ irte.irq_post_low.__reserved_3 = 0;
+ irte.irq_post_high.__reserved_4 = 0;
+
+ irte.irq_post_low.pst = 1;
+
+ if (modify_irte(irq, &irte))
+ return -1;
+
+ return 0;
+}
+
struct irq_remap_ops intel_irq_remap_ops = {
.supported = intel_irq_remapping_supported,
.prepare = dmar_table_init,
@@ -1186,4 +1213,5 @@ struct irq_remap_ops intel_irq_remap_ops = {
.msi_alloc_irq = intel_msi_alloc_irq,
.msi_setup_irq = intel_msi_setup_irq,
.alloc_hpet_msi = intel_alloc_hpet_msi,
+ .update_pi_irte = intel_update_pi_irte,
};
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index 2f8ee00..0e36860 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -362,6 +362,15 @@ int setup_hpet_msi_remapped(unsigned int irq, unsigned int id)
return default_setup_hpet_msi(irq, id);
}

+int update_pi_irte(unsigned int irq, u64 pi_desc_addr, u32 vector)
+{
+ if (!remap_ops || !remap_ops->update_pi_irte)
+ return -ENODEV;
+
+ return remap_ops->update_pi_irte(irq, pi_desc_addr, vector);
+}
+EXPORT_SYMBOL_GPL(update_pi_irte);
+
void panic_if_irq_remap(const char *msg)
{
if (irq_remapping_enabled)
diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h
index 7bb5913..2d8f740 100644
--- a/drivers/iommu/irq_remapping.h
+++ b/drivers/iommu/irq_remapping.h
@@ -84,6 +84,9 @@ struct irq_remap_ops {

/* Setup interrupt remapping for an HPET MSI */
int (*alloc_hpet_msi)(unsigned int, unsigned int);
+
+ /* Update IRTE for posted-interrupt */
+ int (*update_pi_irte)(int irq, u64 pi_desc_addr, u32 vector);
};

extern struct irq_remap_ops intel_irq_remap_ops;
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 8be5d42..e1ff4f7 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -160,6 +160,20 @@ struct irte {
__reserved_2 : 8,
dest_id : 32;
} irq_remap_low;
+
+ struct {
+ __u64 present : 1,
+ fpd : 1,
+ __reserved_1 : 6,
+ avail : 4,
+ __reserved_2 : 2,
+ urg : 1,
+ pst : 1,
+ vector : 8,
+ __reserved_3 : 14,
+ pda_l : 26;
+ } irq_post_low;
+
__u64 low;
};

@@ -170,10 +184,22 @@ struct irte {
svt : 2,
__reserved_3 : 44;
} irq_remap_high;
+
+ struct {
+ __u64 sid: 16,
+ sq: 2,
+ svt: 2,
+ __reserved_4: 12,
+ pda_h: 32;
+ } irq_post_high;
+
__u64 high;
};
};

+#define PDA_LOW_BIT 26
+#define PDA_HIGH_BIT 32
+
enum {
IRQ_REMAP_XAPIC_MODE,
IRQ_REMAP_X2APIC_MODE,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ea53b04..6bb8287 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -335,6 +335,25 @@ struct kvm_kernel_irq_routing_entry {
struct hlist_node link;
};

+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
+
+struct kvm_irq_routing_table {
+ int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
+ struct kvm_kernel_irq_routing_entry *rt_entries;
+ u32 nr_rt_entries;
+ /*
+ * Array indexed by gsi. Each entry contains list of irq chips
+ * the gsi is connected to.
+ */
+ struct hlist_head map[0];
+};
+
+#else
+
+struct kvm_irq_routing_table {};
+
+#endif
+
#ifndef KVM_PRIVATE_MEM_SLOTS
#define KVM_PRIVATE_MEM_SLOTS 0
#endif
@@ -766,6 +785,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian);
int kvm_request_irq_source_id(struct kvm *kvm);
void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
+void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq);
+bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq);

#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 7593c52..509223a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1027,6 +1027,7 @@ struct kvm_s390_ucas_mapping {
#define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config)
#define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data)
#define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data)
+#define KVM_ASSIGN_DEV_PI_UPDATE _IOR(KVMIO, 0x7d, __u32)
/* Available with KVM_CAP_PIT_STATE2 */
#define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2)
#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2)
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index e05000e..e154009 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -326,6 +326,135 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
}
}

+int __weak kvm_update_pi_irte_common(struct kvm *kvm, struct kvm_vcpu *vcpu,
+ u32 guest_vector, int host_irq)
+{
+ return 0;
+}
+
+int kvm_compare_rr_counter(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
+{
+ return vcpu1->arch.round_robin_counter -
+ vcpu2->arch.round_robin_counter;
+}
+
+bool kvm_pi_find_dest_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ int i, r = 0;
+ struct kvm_vcpu *vcpu, *dest = NULL;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (!kvm_is_dm_lowest_prio(irq)) {
+ r++;
+ *dest_vcpu = vcpu;
+ } else if (kvm_lapic_enabled(vcpu)) {
+ if (!dest)
+ dest = vcpu;
+ else if (kvm_compare_rr_counter(vcpu, dest) < 0)
+ dest = vcpu;
+ }
+ }
+
+ if (dest) {
+ dest->arch.round_robin_counter++;
+ *dest_vcpu = dest;
+ return true;
+ } else if (r == 1)
+ return true;
+
+ return false;
+}
+
+static int __kvm_update_pi_irte(struct kvm *kvm, int host_irq, int guest_irq)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu;
+ int idx, ret = -EINVAL;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ ASSERT(guest_irq < irq_rt->nr_rt_entries);
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+ /*
+ * VT-d posted-interrupt has the following
+ * limitations:
+ * - No support for posting multicast/broadcast
+ * interrupts to a VCPU
+ * Still use interrupt remapping for these
+ * kind of interrupts
+ */
+
+ kvm_set_msi_irq(e, &irq);
+ if (!kvm_pi_find_dest_vcpu(kvm, &irq, &vcpu)) {
+ printk(KERN_INFO "%s: can not find the target VCPU\n",
+ __func__);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (kvm_update_pi_irte_common(kvm, vcpu, irq.vector,
+ host_irq)) {
+ printk(KERN_INFO "%s: failed to update PI IRTE\n",
+ __func__);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
+int kvm_update_pi_irte(struct kvm *kvm, u32 dev_id)
+{
+ int i, rc = -1;
+ struct kvm_assigned_dev_kernel *dev;
+
+ mutex_lock(&kvm->lock);
+ dev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, dev_id);
+ if (!dev) {
+ printk(KERN_INFO "%s: cannot find the assigned dev.\n",
+ __func__);
+ rc = -1;
+ goto out;
+ }
+
+ BUG_ON(dev->irq_requested_type == 0);
+
+ if ((dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) &&
+ (dev->dev->msi_enabled == 1)) {
+ __kvm_update_pi_irte(kvm,
+ dev->host_irq, dev->guest_irq);
+ } else if ((dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) &&
+ (dev->dev->msix_enabled == 1)) {
+ for (i = 0; i < dev->entries_nr; i++) {
+ __kvm_update_pi_irte(kvm,
+ dev->host_msix_entries[i].vector,
+ dev->guest_msix_entries[i].vector);
+ }
+ }
+
+out:
+ rc = 0;
+ mutex_unlock(&kvm->lock);
+ return rc;
+}
+
static int assigned_device_enable_host_intx(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev)
{
@@ -1017,6 +1146,18 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
break;
}
+ case KVM_ASSIGN_DEV_PI_UPDATE: {
+ u32 dev_id;
+
+ r = -EFAULT;
+ if (copy_from_user(&dev_id, argp, sizeof(dev_id)))
+ goto out;
+ r = kvm_update_pi_irte(kvm, dev_id);
+ if (r)
+ goto out;
+ break;
+
+ }
default:
r = -ENOTTY;
break;
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 963b899..f51aed3 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -55,7 +55,7 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
line_status);
}

-inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
+bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
{
#ifdef CONFIG_IA64
return irq->delivery_mode ==
@@ -106,7 +106,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
return r;
}

-static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm_lapic_irq *irq)
{
trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 7f256f3..cdf29a6 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -31,17 +31,6 @@
#include <trace/events/kvm.h>
#include "irq.h"

-struct kvm_irq_routing_table {
- int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
- struct kvm_kernel_irq_routing_entry *rt_entries;
- u32 nr_rt_entries;
- /*
- * Array indexed by gsi. Each entry contains list of irq chips
- * the gsi is connected to.
- */
- struct hlist_head map[0];
-};
-
int kvm_irq_map_gsi(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *entries, int gsi)
{
--
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/