Re: [PATCH v12 3/7] genirq: Add mechanism to multiplex a single HW IPI

From: Anup Patel
Date: Mon Nov 28 2022 - 06:13:46 EST


On Mon, Nov 28, 2022 at 4:04 PM Marc Zyngier <maz@xxxxxxxxxx> wrote:
>
> On Sat, 26 Nov 2022 17:34:49 +0000,
> Anup Patel <apatel@xxxxxxxxxxxxxxxx> wrote:
> >
> > All RISC-V platforms have a single HW IPI provided by the INTC local
> > interrupt controller. The HW method to trigger the INTC IPI can be
> > through an external irqchip (e.g. RISC-V AIA), through a platform
> > specific device (e.g. SiFive CLINT timer), or through firmware (e.g.
> > SBI IPI call).
> >
> > To support multiple IPIs on RISC-V, we add a generic IPI multiplexing
> > mechanism which helps us create multiple virtual IPIs using a single
> > HW IPI. This generic IPI multiplexing is inspired by the Apple AIC
> > irqchip driver and is shared by various RISC-V irqchip drivers.
> >
> > Signed-off-by: Anup Patel <apatel@xxxxxxxxxxxxxxxx>
> > ---
> > include/linux/irq.h | 4 +
> > kernel/irq/Kconfig | 5 ++
> > kernel/irq/Makefile | 1 +
> > kernel/irq/ipi-mux.c | 210 +++++++++++++++++++++++++++++++++++++++++++
> > 4 files changed, 220 insertions(+)
> > create mode 100644 kernel/irq/ipi-mux.c
> >
> > diff --git a/include/linux/irq.h b/include/linux/irq.h
> > index c3eb89606c2b..6024e1ee1257 100644
> > --- a/include/linux/irq.h
> > +++ b/include/linux/irq.h
> > @@ -1266,6 +1266,10 @@ int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest);
> > int ipi_send_single(unsigned int virq, unsigned int cpu);
> > int ipi_send_mask(unsigned int virq, const struct cpumask *dest);
> >
> > +void ipi_mux_process(void);
> > +int ipi_mux_create(unsigned int nr_ipi,
> > + void (*mux_send)(const struct cpumask *));
> > +
> > #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER
> > /*
> > * Registers a generic IRQ handling function as the top-level IRQ handler in
> > diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
> > index db3d174c53d4..df17dbc54b02 100644
> > --- a/kernel/irq/Kconfig
> > +++ b/kernel/irq/Kconfig
> > @@ -86,6 +86,11 @@ config GENERIC_IRQ_IPI
> > depends on SMP
> > select IRQ_DOMAIN_HIERARCHY
> >
> > +# Generic IRQ IPI Mux support
> > +config GENERIC_IRQ_IPI_MUX
> > + bool
> > + depends on SMP
> > +
> > # Generic MSI interrupt support
> > config GENERIC_MSI_IRQ
> > bool
> > diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
> > index b4f53717d143..f19d3080bf11 100644
> > --- a/kernel/irq/Makefile
> > +++ b/kernel/irq/Makefile
> > @@ -15,6 +15,7 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
> > obj-$(CONFIG_PM_SLEEP) += pm.o
> > obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
> > obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
> > +obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
> > obj-$(CONFIG_SMP) += affinity.o
> > obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
> > obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
> > diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c
> > new file mode 100644
> > index 000000000000..366d8cd5320b
> > --- /dev/null
> > +++ b/kernel/irq/ipi-mux.c
> > @@ -0,0 +1,210 @@
> > +// SPDX-License-Identifier: GPL-2.0-or-later
> > +/*
> > + * Multiplex several virtual IPIs over a single HW IPI.
> > + *
> > + * Copyright The Asahi Linux Contributors
> > + * Copyright (c) 2022 Ventana Micro Systems Inc.
> > + */
> > +
> > +#define pr_fmt(fmt) "ipi-mux: " fmt
> > +#include <linux/cpu.h>
> > +#include <linux/init.h>
> > +#include <linux/irq.h>
> > +#include <linux/irqchip.h>
> > +#include <linux/irqchip/chained_irq.h>
> > +#include <linux/irqdomain.h>
> > +#include <linux/jump_label.h>
> > +#include <linux/percpu.h>
> > +#include <linux/smp.h>
> > +
> > +struct ipi_mux_cpu {
> > + atomic_t enable;
> > + atomic_t bits;
> > + struct cpumask send_mask;
> > +};
> > +
> > +static struct ipi_mux_cpu __percpu *ipi_mux_pcpu;
> > +static struct irq_domain *ipi_mux_domain;
> > +static void (*ipi_mux_send)(const struct cpumask *mask);
> > +
> > +static void ipi_mux_mask(struct irq_data *d)
> > +{
> > + struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> > +
> > + atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable);
> > +}
> > +
> > +static void ipi_mux_unmask(struct irq_data *d)
> > +{
> > + u32 ibit = BIT(irqd_to_hwirq(d));
> > + struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> > +
> > + atomic_or(ibit, &icpu->enable);
> > +
> > + /*
> > + * The atomic_or() above must complete before the atomic_read()
> > + * below to avoid racing with ipi_mux_send_mask().
> > + */
> > + smp_mb__after_atomic();
> > +
> > + /* If a pending IPI was unmasked, raise a parent IPI immediately. */
> > + if (atomic_read(&icpu->bits) & ibit)
> > + ipi_mux_send(cpumask_of(smp_processor_id()));
> > +}
> > +
> > +static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
> > +{
> > + u32 ibit = BIT(irqd_to_hwirq(d));
> > + struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> > + struct cpumask *send_mask = &icpu->send_mask;
> > + unsigned long flags;
> > + int cpu;
> > +
> > + /*
> > + * We use send_mask as a per-CPU variable, so disable local
> > + * interrupts to avoid being preempted.
> > + */
> > + local_irq_save(flags);
>
> The correct way to avoid preemption is to use preempt_disable(), which
> is a lot cheaper than disabling interrupts on most architectures.

Okay, I will update.
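
i.e. something like this (untested sketch), replacing the
local_irq_save()/local_irq_restore() pair:

	preempt_disable();

	cpumask_clear(send_mask);
	/* ... mark pending bits and accumulate targets as before ... */
	ipi_mux_send(send_mask);

	preempt_enable();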

>
> > +
> > + cpumask_clear(send_mask);
>
> This thing is likely to be unnecessarily expensive on very large
> systems, as it is proportional to the number of CPUs.
>
> > +
> > + for_each_cpu(cpu, mask) {
> > + icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);
> > + atomic_or(ibit, &icpu->bits);
>
> The original code had an atomic_fetch_or_release() to allow eliding
> the IPI if the target interrupt was already pending. Why is that code
> gone? This is a pretty cheap and efficient optimisation.

That optimization is causing RCU stalls on the QEMU RISC-V virt
machine with a large number of CPUs.
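
For reference, the dropped check was roughly the following (as in the
AIC driver this code was lifted from):

	pending = atomic_fetch_or_release(ibit, &icpu->bits);

	/* Elide the parent IPI if the virtual IPI is already pending. */
	if (!(pending & ibit) && (atomic_read(&icpu->enable) & ibit))
		cpumask_set_cpu(cpu, send_mask);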

>
> > +
> > + /*
> > + * The atomic_or() above must complete before
> > + * the atomic_read() below to avoid racing with
> > + * ipi_mux_unmask().
> > + */
> > + smp_mb__after_atomic();
> > +
> > + if (atomic_read(&icpu->enable) & ibit)
> > + cpumask_set_cpu(cpu, send_mask);
> > + }
> > +
> > + /* Trigger the parent IPI */
> > + ipi_mux_send(send_mask);
>
> IPIs are very rarely made pending on more than a single CPU at a
> time. The overwhelming majority of them are targeting a single CPU. So
> accumulating bits to avoid doing two or more "send" actions only
> penalises the generic case.
>
> My conclusion is that this "send_mask" can probably be removed,
> together with the preemption fiddling.

So we should call ipi_mux_send() for one target CPU at a time?
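
i.e. something like this (untested)?

	static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
	{
		u32 ibit = BIT(irqd_to_hwirq(d));
		struct ipi_mux_cpu *icpu;
		int cpu;

		for_each_cpu(cpu, mask) {
			icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);

			/* Mark the virtual IPI pending on the target CPU. */
			atomic_or(ibit, &icpu->bits);

			/*
			 * The atomic_or() above must complete before the
			 * atomic_read() below to avoid racing with
			 * ipi_mux_unmask().
			 */
			smp_mb__after_atomic();

			/* Trigger the parent IPI for this CPU only. */
			if (atomic_read(&icpu->enable) & ibit)
				ipi_mux_send(cpumask_of(cpu));
		}
	}

This would also drop send_mask from struct ipi_mux_cpu and the
preemption fiddling entirely.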

>
> > +
> > + local_irq_restore(flags);
> > +}
> > +
> > +static const struct irq_chip ipi_mux_chip = {
> > + .name = "IPI Mux",
> > + .irq_mask = ipi_mux_mask,
> > + .irq_unmask = ipi_mux_unmask,
> > + .ipi_send_mask = ipi_mux_send_mask,
> > +};
>
> OK, you have now dropped the superfluous pre/post handlers. But the
> need still exists. Case in point, the aic_handle_ipi() prologue and
> epilogue to the interrupt handling. I have suggested last time that
> the driver could provide the actual struct irq_chip in order to
> provide the callbacks it requires.

aic_handle_ipi() can simply call ipi_mux_process() between its
prologue and epilogue.
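
i.e. roughly (a sketch only; aic_ipi_ack() and aic_ipi_unmask() are
hypothetical helpers standing in for the driver's existing register
programming):

	static void aic_handle_ipi(struct pt_regs *regs)
	{
		/* Existing prologue: ack the HW IPI. */
		aic_ipi_ack();

		/* Demultiplex and handle the pending virtual IPIs. */
		ipi_mux_process();

		/* Existing epilogue: re-enable the HW IPI. */
		aic_ipi_unmask();
	}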

>
> Please realise that I will not take this patch if this cannot be made
> to work with the single existing in-tree instance of an IPI MUX. 90%
> of the code having been lifted from there, I think this is a pretty
> fair ask.

Only the muxing part of AIC has been factored out. All the register
programming of AIC will remain in the AIC irqchip driver without any
change in sequence.

>
> > +
> > +static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq,
> > + unsigned int nr_irqs, void *arg)
> > +{
> > + int i;
> > +
> > + for (i = 0; i < nr_irqs; i++) {
> > + irq_set_percpu_devid(virq + i);
> > + irq_domain_set_info(d, virq + i, i,
> > + &ipi_mux_chip, d->host_data,
>
> What does d->host_data represent here?

It's always NULL, so we don't need to pass it. I will update.
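
i.e.:

	irq_domain_set_info(d, virq + i, i, &ipi_mux_chip, NULL,
			    handle_percpu_devid_irq, NULL, NULL);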

>
> > + handle_percpu_devid_irq, NULL, NULL);
> > + }
> > +
> > + return 0;
> > +}
> > +
> > +static const struct irq_domain_ops ipi_mux_domain_ops = {
> > + .alloc = ipi_mux_domain_alloc,
> > + .free = irq_domain_free_irqs_top,
> > +};
> > +
> > +/**
> > + * ipi_mux_process - Process multiplexed virtual IPIs
> > + */
> > +void ipi_mux_process(void)
> > +{
> > + struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> > + irq_hw_number_t hwirq;
> > + unsigned long ipis;
> > + unsigned int en;
> > +
> > + /*
> > + * Reading the enable mask does not need to be ordered, as long as
> > + * this function is called from the interrupt handler, because only
> > + * the CPU itself can change its own enable mask.
> > + */
> > + en = atomic_read(&icpu->enable);
> > +
> > + /*
> > + * Clear the IPIs we are about to handle. This pairs with the
> > + * atomic_or() in ipi_mux_send_mask().
> > + */
> > + ipis = atomic_fetch_andnot(en, &icpu->bits) & en;
> > +
> > + for_each_set_bit(hwirq, &ipis, BITS_PER_LONG)
>
> BITS_PER_LONG...

Argh, I should have used BITS_PER_TYPE(int) here. I will update.
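
i.e.:

	for_each_set_bit(hwirq, &ipis, BITS_PER_TYPE(int))
		generic_handle_domain_irq(ipi_mux_domain, hwirq);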

>
> > + generic_handle_domain_irq(ipi_mux_domain, hwirq);
> > +}
> > +
> > +/**
> > + * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
> > + * parent IPI.
> > + * @nr_ipi: number of virtual IPIs to create. This should
> > + * be <= BITS_PER_TYPE(int)
> > + * @mux_send: callback to trigger parent IPI
> > + *
> > + * Returns the first virq of the newly created virtual IPIs upon
> > + * success, or <= 0 upon failure.
> > + */
> > +int ipi_mux_create(unsigned int nr_ipi,
> > + void (*mux_send)(const struct cpumask *))
> > +{
> > + struct fwnode_handle *fwnode;
> > + struct irq_domain *domain;
> > + int rc;
> > +
> > + if (ipi_mux_domain)
> > + return -EEXIST;
> > +
> > + if (BITS_PER_TYPE(int) < nr_ipi || !mux_send)
>
> ... vs BITS_PER_TYPE(int) ...
>
> M.
>
> --
> Without deviation from the norm, progress is not possible.

Regards,
Anup