[PATCH] Add support for multiple MSI on x86

From: Micha Nelissen
Date: Sun Feb 13 2011 - 15:36:59 EST


This patch is based on an earlier patch from Matthew Wilcox.
---
arch/x86/kernel/apic/io_apic.c | 310 ++++++++++++++++++++++++++++++++++------
arch/x86/kernel/hpet.c | 2 +-
drivers/pci/htirq.c | 2 +-
include/linux/irq.h | 3 +-
4 files changed, 271 insertions(+), 46 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index fadcd74..5e9decc 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -249,11 +249,6 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
return cfg;
}

-static int alloc_irq_from(unsigned int from, int node)
-{
- return irq_alloc_desc_from(from, node);
-}
-
static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
{
free_irq_cfg(at, cfg);
@@ -1037,6 +1032,39 @@ void unlock_vector_lock(void)
raw_spin_unlock(&vector_lock);
}

+/*
+ * The P6 family and Pentium processors (presumably also earlier processors),
+ * can queue no more than two interrupts per priority level, and will ignore
+ * other interrupts that are received within the same priority level (the
+ * priority level is the vector number shifted right by 4), so we try to
+ * spread these out a bit to avoid this happening.
+ *
+ * Pentium 4, Xeon and later processors do not have this limitation.
+ * It is unknown what limitations AMD, Cyrix, Transmeta, VIA, IDT and
+ * other manufacturers have.
+ */
+static int many_vectors_per_prio(void)
+{
+ struct cpuinfo_x86 *c;
+ static char init, result;
+ if (init)
+ return result;
+
+ c = &boot_cpu_data;
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ if (c->x86 > 6 ||
+ ((c->x86 == 6) && (c->x86_model >= 13)))
+ result = 1;
+ break;
+ default:
+ break;
+ }
+
+ init = 1;
+ return result;
+}
+
static int
__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
{
@@ -1117,13 +1145,110 @@ next:
return err;
}

+static int __assign_irq_vector_block(int irq, unsigned count, const struct cpumask *mask)
+{
+ static int current_vector = FIRST_EXTERNAL_VECTOR;
+ unsigned int old_vector;
+ unsigned i, cpu;
+ int err;
+ struct irq_cfg *cfg;
+ cpumask_var_t tmp_mask;
+
+ BUG_ON(irq + count > NR_IRQS);
+ BUG_ON(count & (count - 1));
+
+ for (i = 0; i < count; i++) {
+ cfg = irq_cfg(irq + i);
+ if (cfg->move_in_progress)
+ return -EBUSY;
+ }
+
+ if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+ return -ENOMEM;
+
+ cfg = irq_cfg(irq);
+ old_vector = cfg->vector;
+ if (old_vector) {
+ err = 0;
+ cpumask_and(tmp_mask, mask, cpu_online_mask);
+ cpumask_and(tmp_mask, cfg->domain, tmp_mask);
+ if (!cpumask_empty(tmp_mask))
+ goto out;
+ }
+
+ /* Only try and allocate irqs on cpus that are present */
+ err = -ENOSPC;
+ for_each_cpu_and(cpu, mask, cpu_online_mask) {
+ int new_cpu;
+ int vector;
+
+ apic->vector_allocation_domain(cpu, tmp_mask);
+
+ vector = current_vector & ~(count - 1);
+next:
+ vector += count;
+ if (vector + count >= first_system_vector) {
+ vector = FIRST_EXTERNAL_VECTOR & ~(count - 1);
+ if (vector < FIRST_EXTERNAL_VECTOR)
+ vector += count;
+ }
+ if (unlikely((current_vector & ~(count - 1)) == vector))
+ continue;
+
+ for (i = 0; i < count; i++)
+ if (test_bit(vector + i, used_vectors))
+ goto next;
+
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
+ for (i = 0; i < count; i++) {
+ if (per_cpu(vector_irq, new_cpu)[vector + i] != -1)
+ goto next;
+ }
+ }
+ /* Found one! */
+ current_vector = vector + count - 1;
+ for (i = 0; i < count; i++) {
+ cfg = irq_cfg(irq + i);
+ if (old_vector) {
+ cfg->move_in_progress = 1;
+ cpumask_copy(cfg->old_domain, cfg->domain);
+ }
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+ per_cpu(vector_irq, new_cpu)[vector + i] = irq + i;
+ cfg->vector = vector + i;
+ cpumask_copy(cfg->domain, tmp_mask);
+ }
+ err = 0;
+ break;
+ }
+out:
+ free_cpumask_var(tmp_mask);
+ return err;
+}
+
int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
{
int err;
unsigned long flags;

raw_spin_lock_irqsave(&vector_lock, flags);
- err = __assign_irq_vector(irq, cfg, mask);
+ if (many_vectors_per_prio())
+ err = __assign_irq_vector_block(irq, 1, mask);
+ else
+ err = __assign_irq_vector(irq, cfg, mask);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return err;
+}
+
+/* Assumes that count is a power of two and aligns to that power of two */
+static int
+assign_irq_vector_block(int irq, unsigned count, const struct cpumask *mask)
+{
+ int err;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ err = __assign_irq_vector_block(irq, count, mask);
raw_spin_unlock_irqrestore(&vector_lock, flags);
return err;
}
@@ -2200,14 +2325,34 @@ int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
unsigned int *dest_id)
{
struct irq_cfg *cfg = data->chip_data;
+ unsigned irq;

if (!cpumask_intersects(mask, cpu_online_mask))
return -1;

- if (assign_irq_vector(data->irq, data->chip_data, mask))
- return -1;
+ irq = data->irq;
+ cfg = data->chip_data;

- cpumask_copy(data->affinity, mask);
+ if (many_vectors_per_prio()) {
+ struct msi_desc *msi_desc = data->msi_desc;
+ unsigned i, count = 1;
+
+ if (msi_desc)
+ count = 1 << msi_desc->msi_attrib.multiple;
+
+ /* Multiple MSIs all go to the same destination */
+ if (assign_irq_vector_block(irq, count, mask))
+ return -1;
+ for (i = 0; i < count; i++) {
+ data = &irq_to_desc(irq + i)->irq_data;
+ cpumask_copy(data->affinity, mask);
+ }
+ } else {
+ if (assign_irq_vector(irq, cfg, mask))
+ return BAD_APICID;
+
+ cpumask_copy(data->affinity, mask);
+ }

*dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
return 0;
@@ -3053,7 +3198,7 @@ device_initcall(ioapic_init_sysfs);
/*
* Dynamic irq allocate and deallocation
*/
-unsigned int create_irq_nr(unsigned int from, int node)
+unsigned int create_irq_nr(unsigned int from, unsigned count, int node)
{
struct irq_cfg *cfg;
unsigned long flags;
@@ -3063,25 +3208,31 @@ unsigned int create_irq_nr(unsigned int from, int node)
if (from < nr_irqs_gsi)
from = nr_irqs_gsi;

- irq = alloc_irq_from(from, node);
+ irq = irq_alloc_descs(-1, from, count, node);
if (irq < 0)
return 0;
cfg = alloc_irq_cfg(irq, node);
if (!cfg) {
- free_irq_at(irq, NULL);
+ irq_free_descs(irq, count);
return 0;
}

raw_spin_lock_irqsave(&vector_lock, flags);
- if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
- ret = irq;
+ if (many_vectors_per_prio()) {
+ if (!__assign_irq_vector_block(irq, count, apic->target_cpus()))
+ ret = irq;
+ } else {
+ if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
+ ret = irq;
+ }
raw_spin_unlock_irqrestore(&vector_lock, flags);

if (ret) {
set_irq_chip_data(irq, cfg);
irq_clear_status_flags(irq, IRQ_NOREQUEST);
} else {
- free_irq_at(irq, cfg);
+ free_irq_cfg(irq, cfg);
+ irq_free_descs(irq, count);
}
return ret;
}
@@ -3093,7 +3244,7 @@ int create_irq(void)
int irq;

irq_want = nr_irqs_gsi;
- irq = create_irq_nr(irq_want, node);
+ irq = create_irq_nr(irq_want, 1, node);

if (irq == 0)
irq = -1;
@@ -3121,7 +3272,7 @@ void destroy_irq(unsigned int irq)
*/
#ifdef CONFIG_PCI_MSI
static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
- struct msi_msg *msg, u8 hpet_id)
+ unsigned count, struct msi_msg *msg, u8 hpet_id)
{
struct irq_cfg *cfg;
int err;
@@ -3131,7 +3282,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
return -ENXIO;

cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, apic->target_cpus());
+ if (count == 1)
+ err = assign_irq_vector(irq, cfg, apic->target_cpus());
+ else
+ err = assign_irq_vector_block(irq, count, apic->target_cpus());
if (err)
return err;

@@ -3307,47 +3461,99 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
return index;
}

-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+ unsigned count, int base_irq)
{
struct msi_msg msg;
+ unsigned irq;
int ret;

- ret = msi_compose_msg(dev, irq, &msg, -1);
+ ret = msi_compose_msg(dev, base_irq, count, &msg, -1);
if (ret < 0)
return ret;

- set_irq_msi(irq, msidesc);
- write_msi_msg(irq, &msg);
+ msidesc->msi_attrib.multiple = order_base_2(count);

- if (irq_remapped(get_irq_chip_data(irq))) {
- irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
- set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
- } else
- set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+ /* perform loop backwards, so first irq has msidesc set */
+ for (irq = base_irq + count - 1; irq >= base_irq; irq--) {
+ set_irq_msi(irq, msidesc);
+ if (irq_remapped(get_irq_chip_data(irq))) {
+ irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
+ set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
+ } else
+ set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+ }

- dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
+ write_msi_msg(base_irq, &msg);
+ dev_printk(KERN_DEBUG, &dev->dev, "irq %d-%d for MSI/MSI-X\n",
+ base_irq, base_irq + count - 1);

return 0;
}

-int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+static int setup_msi_irqs(struct pci_dev *dev, int nvec)
+{
+ unsigned base_irq, alloc, i;
+ int ret, node;
+ struct msi_desc *msidesc = list_first_entry(&dev->msi_list,
+ struct msi_desc, list);
+ struct intel_iommu *iommu = map_dev_to_ir(dev);
+
+ if (intr_remapping_enabled && !iommu)
+ return -ENOENT;
+ if (nvec > 1 && !many_vectors_per_prio())
+ return 1;
+
+ /*
+ * MSI only lets you program the device with nvec that is a power
+ * of two. We could possibly trust the device driver that it'll
+ * only use the number it asked for, but to be safe, let's reserve
+ * all the interrupts we're telling the device it can use.
+ */
+ alloc = roundup_pow_of_two(nvec);
+ node = dev_to_node(&dev->dev);
+ base_irq = create_irq_nr(nr_irqs_gsi, alloc, node);
+ if (base_irq == 0)
+ return (alloc > 1) ? alloc / 2 : -ENOSPC;
+
+ if (intr_remapping_enabled) {
+ ret = msi_alloc_irte(dev, base_irq, alloc);
+ if (ret < 0)
+ goto error;
+
+ for (i = 1; i < alloc; i++)
+ set_irte_irq(base_irq + i, iommu, ret, i);
+ }
+
+ ret = setup_msi_irq(dev, msidesc, alloc, base_irq);
+ if (ret < 0)
+ goto error;
+
+ return 0;
+
+error:
+ for (i = 0; i < alloc; i++)
+ destroy_irq(base_irq + i);
+ return ret;
+}
+
+static int setup_msix_irqs(struct pci_dev *dev, int nvec)
{
int node, ret, sub_handle, index = 0;
+ struct intel_iommu *iommu = map_dev_to_ir(dev);
unsigned int irq, irq_want;
struct msi_desc *msidesc;
- struct intel_iommu *iommu = NULL;

- /* x86 doesn't support multiple MSI yet */
- if (type == PCI_CAP_ID_MSI && nvec > 1)
- return 1;
+ if (intr_remapping_enabled && !iommu)
+ return -ENOENT;

node = dev_to_node(&dev->dev);
irq_want = nr_irqs_gsi;
sub_handle = 0;
list_for_each_entry(msidesc, &dev->msi_list, list) {
- irq = create_irq_nr(irq_want, node);
+ irq = create_irq_nr(irq_want, 1, node);
if (irq == 0)
- return -1;
+ return -ENOSPC;
irq_want = irq + 1;
if (!intr_remapping_enabled)
goto no_ir;
@@ -3363,11 +3569,6 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
goto error;
}
} else {
- iommu = map_dev_to_ir(dev);
- if (!iommu) {
- ret = -ENOENT;
- goto error;
- }
/*
* setup the mapping between the irq and the IRTE
* base index, the sub_handle pointing to the
@@ -3376,7 +3577,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
set_irte_irq(irq, iommu, index, sub_handle);
}
no_ir:
- ret = setup_msi_irq(dev, msidesc, irq);
+ ret = setup_msi_irq(dev, msidesc, 1, irq);
if (ret < 0)
goto error;
sub_handle++;
@@ -3388,11 +3589,34 @@ error:
return ret;
}

+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ if (type == PCI_CAP_ID_MSI) {
+ return setup_msi_irqs(dev, nvec);
+ } else {
+ return setup_msix_irqs(dev, nvec);
+ }
+}
+
void native_teardown_msi_irq(unsigned int irq)
{
destroy_irq(irq);
}

+void native_teardown_msi_irqs(struct pci_dev *dev)
+{
+ struct msi_desc *desc;
+ unsigned i;
+
+ list_for_each_entry(desc, &dev->msi_list, list) {
+ if (desc->irq == 0)
+ continue;
+ for (i = 0; i < (1 << desc->msi_attrib.multiple); i++) {
+ destroy_irq(desc->irq + i);
+ }
+ }
+}
+
#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
#ifdef CONFIG_SMP
static int
@@ -3437,7 +3661,7 @@ int arch_setup_dmar_msi(unsigned int irq)
int ret;
struct msi_msg msg;

- ret = msi_compose_msg(NULL, irq, &msg, -1);
+ ret = msi_compose_msg(NULL, irq, 1, &msg, -1);
if (ret < 0)
return ret;
dmar_msi_write(irq, &msg);
@@ -3515,7 +3739,7 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
return -1;
}

- ret = msi_compose_msg(NULL, irq, &msg, id);
+ ret = msi_compose_msg(NULL, irq, 1, &msg, id);
if (ret < 0)
return ret;

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 4ff5968..cce3afd 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -499,7 +499,7 @@ static int hpet_assign_irq(struct hpet_dev *dev)
{
unsigned int irq;

- irq = create_irq_nr(0, -1);
+ irq = create_irq_nr(0, 1, -1);
if (!irq)
return -EINVAL;

diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c
index 834842a..2b48cc3 100644
--- a/drivers/pci/htirq.c
+++ b/drivers/pci/htirq.c
@@ -120,7 +120,7 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
cfg->msg.address_hi = 0xffffffff;

node = dev_to_node(&dev->dev);
- irq = create_irq_nr(0, node);
+ irq = create_irq_nr(0, 1, node);

if (irq <= 0) {
kfree(cfg);
diff --git a/include/linux/irq.h b/include/linux/irq.h
index abde252..842a8c4 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -322,7 +322,8 @@ static inline void set_irq_probe(unsigned int irq)
}

/* Handle dynamic irq creation and destruction */
-extern unsigned int create_irq_nr(unsigned int irq_want, int node);
+extern unsigned int create_irq_nr(unsigned int irq_want, unsigned count,
+ int node);
extern int create_irq(void);
extern void destroy_irq(unsigned int irq);

--
1.5.6.5