[PATCH RFC 09/13] x86/irq: Install posted MSI notification handler

From: Jacob Pan
Date: Sat Nov 11 2023 - 23:12:45 EST


All MSI vectors are multiplexed into a single notification vector when
posted MSI is enabled. It is the responsibility of the notification
vector handler to demultiplex MSI vectors. In this handler, for each
pending bit, MSI vector handlers are dispatched without IDT delivery.

For example, the interrupt flow will change as follows:
(3 MSIs of different vectors arrive in a a high frequency burst)

BEFORE:
interrupt(MSI)
irq_enter()
handler() /* EOI */
irq_exit()
process_softirq()
interrupt(MSI)
irq_enter()
handler() /* EOI */
irq_exit()
process_softirq()
interrupt(MSI)
irq_enter()
handler() /* EOI */
irq_exit()
process_softirq()

AFTER:
interrupt /* Posted MSI notification vector */
irq_enter()
atomic_xchg(PIR)
handler()
handler()
handler()
pi_clear_on()
apic_eoi()
irq_exit()
process_softirq()

Except for the leading MSI, CPU notifications are skipped/coalesced.

For MSIs arrive at a low frequency, the demultiplexing loop does not
wait for more interrupts to coalesce. Therefore, there's no additional
latency other than the processing time.

Signed-off-by: Jacob Pan <jacob.jun.pan@xxxxxxxxxxxxxxx>
---
arch/x86/include/asm/hardirq.h | 3 ++
arch/x86/include/asm/idtentry.h | 3 ++
arch/x86/kernel/idt.c | 3 ++
arch/x86/kernel/irq.c | 91 +++++++++++++++++++++++++++++++++
4 files changed, 100 insertions(+)

diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 72c6a084dba3..6c8daa7518eb 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -44,6 +44,9 @@ typedef struct {
unsigned int irq_hv_reenlightenment_count;
unsigned int hyperv_stimer0_count;
#endif
+#ifdef CONFIG_X86_POSTED_MSI
+ unsigned int posted_msi_notification_count;
+#endif
} ____cacheline_aligned irq_cpustat_t;

DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 05fd175cec7d..f756e761e7c0 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -644,6 +644,9 @@ DECLARE_IDTENTRY_SYSVEC(ERROR_APIC_VECTOR, sysvec_error_interrupt);
DECLARE_IDTENTRY_SYSVEC(SPURIOUS_APIC_VECTOR, sysvec_spurious_apic_interrupt);
DECLARE_IDTENTRY_SYSVEC(LOCAL_TIMER_VECTOR, sysvec_apic_timer_interrupt);
DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI_VECTOR, sysvec_x86_platform_ipi);
+# ifdef CONFIG_X86_POSTED_MSI
+DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification);
+# endif
#endif

#ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index b786d48f5a0f..d5840d777469 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -159,6 +159,9 @@ static const __initconst struct idt_data apic_idts[] = {
# endif
INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt),
INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt),
+# ifdef CONFIG_X86_POSTED_MSI
+ INTG(POSTED_MSI_NOTIFICATION_VECTOR, asm_sysvec_posted_msi_notification),
+# endif
#endif
};

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 0bffe8152385..786c2c8330f4 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -183,6 +183,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%10u ",
irq_stats(j)->kvm_posted_intr_wakeup_ipis);
seq_puts(p, " Posted-interrupt wakeup event\n");
+#endif
+#ifdef CONFIG_X86_POSTED_MSI
+ seq_printf(p, "%*s: ", prec, "PMN");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->posted_msi_notification_count);
+ seq_puts(p, " Posted MSI notification event\n");
#endif
return 0;
}
@@ -351,6 +358,90 @@ void intel_posted_msi_init(void)
this_cpu_write(posted_interrupt_desc.nv, POSTED_MSI_NOTIFICATION_VECTOR);
this_cpu_write(posted_interrupt_desc.ndst, this_cpu_read(x86_cpu_to_apicid));
}
+
+static __always_inline inline void handle_pending_pir(struct pi_desc *pid, struct pt_regs *regs)
+{
+ int i, vec = FIRST_EXTERNAL_VECTOR;
+ u64 pir_copy[4];
+
+ /*
+ * Make a copy of PIR which contains IRQ pending bits for vectors,
+ * then invoke IRQ handlers for each pending vector.
+ * If any new interrupts were posted while we are processing, will
+ * do again before allowing new notifications. The idea is to
+ * minimize the number of the expensive notifications if IRQs come
+ * in a high frequency burst.
+ */
+ for (i = 0; i < 4; i++)
+ pir_copy[i] = raw_atomic64_xchg((atomic64_t *)&pid->pir_l[i], 0);
+
+ /*
+ * Ideally, we should start from the high order bits set in the PIR
+ * since each bit represents a vector. Higher order bit position means
+ * the vector has higher priority. But external vectors are allocated
+ * based on availability not priority.
+ *
+ * EOI is included in the IRQ handlers call to apic_ack_irq, which
+ * allows higher priority system interrupt to get in between.
+ */
+ for_each_set_bit_from(vec, (unsigned long *)&pir_copy[0], 256)
+ call_irq_handler(vec, regs);
+
+}
+
+/*
+ * Performance data shows that 3 is good enough to harvest 90+% of the benefit
+ * on high IRQ rate workload.
+ * Alternatively, could make this tunable, use 3 as default.
+ */
+#define MAX_POSTED_MSI_COALESCING_LOOP 3
+
+/*
+ * For MSIs that are delivered as posted interrupts, the CPU notifications
+ * can be coalesced if the MSIs arrive in high frequency bursts.
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ struct pi_desc *pid;
+ int i = 0;
+
+ pid = this_cpu_ptr(&posted_interrupt_desc);
+
+ inc_irq_stat(posted_msi_notification_count);
+ irq_enter();
+
+ while (i++ < MAX_POSTED_MSI_COALESCING_LOOP) {
+ handle_pending_pir(pid, regs);
+
+ /*
+ * If there are new interrupts posted in PIR, do again. If
+ * nothing pending, no need to wait for more interrupts.
+ */
+ if (is_pir_pending(pid))
+ continue;
+ else
+ break;
+ }
+
+ /*
+ * Clear outstanding notification bit to allow new IRQ notifications,
+ * do this last to maximize the window of interrupt coalescing.
+ */
+ pi_clear_on(pid);
+
+ /*
+ * There could be a race of PI notification and the clearing of ON bit,
+ * process PIR bits one last time such that handling the new interrupts
+ * are not delayed until the next IRQ.
+ */
+ if (unlikely(is_pir_pending(pid)))
+ handle_pending_pir(pid, regs);
+
+ apic_eoi();
+ irq_exit();
+ set_irq_regs(old_regs);
+}
#endif /* X86_POSTED_MSI */

#ifdef CONFIG_HOTPLUG_CPU
--
2.25.1