Re: [PATCH v2 4/7] vfio/pci: Create persistent INTx handler

From: Eric Auger
Date: Mon Mar 11 2024 - 05:16:44 EST

On 3/9/24 00:05, Alex Williamson wrote:
> A vulnerability exists where the eventfd for INTx signaling can be
> deconfigured, which unregisters the IRQ handler but still allows
> eventfds to be signaled with a NULL context through the SET_IRQS ioctl
> or through unmask irqfd if the device interrupt is pending.
>
> Ideally this could be solved with some additional locking; the igate
> mutex serializes the ioctl and config space accesses, and the interrupt
> handler is unregistered relative to the trigger, but the irqfd path
> runs asynchronously to those. The igate mutex cannot be acquired from the
> atomic context of the eventfd wake function. Disabling the irqfd
> relative to the eventfd registration is potentially incompatible with
> existing userspace.
>
> As a result, the solution implemented here moves configuration of the
> INTx interrupt handler to track the lifetime of the INTx context object
> and irq_type configuration, rather than registration of a particular
> trigger eventfd. Synchronization is added between the ioctl path and
> eventfd_signal() wrapper such that the eventfd trigger can be
> dynamically updated relative to in-flight interrupts or irqfd callbacks.
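
For readers skimming the diff, the synchronization described above boils
down to a publish/consume pairing on ctx->trigger. Condensed from the
hunks below (a sketch, not a complete function):

	/* hot path: INTx hard-IRQ handler or unmask irqfd callback */
	trigger = READ_ONCE(ctx->trigger);
	if (likely(trigger))
		eventfd_signal(trigger);

	/* ioctl path: swap the trigger, then drain in-flight users */
	old = ctx->trigger;
	WRITE_ONCE(ctx->trigger, trigger);
	if (old) {
		synchronize_irq(pdev->irq);
		vfio_virqfd_flush_thread(&ctx->unmask);
		eventfd_ctx_put(old);
	}

A signaler therefore either observes NULL and does nothing, or observes a
trigger whose reference is held until all in-flight users have drained.
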
>
> Cc: stable@xxxxxxxxxxxxxxx
> Fixes: 89e1f7d4c66d ("vfio: Add PCI device driver")
> Reported-by: Reinette Chatre <reinette.chatre@xxxxxxxxx>
> Reviewed-by: Kevin Tian <kevin.tian@xxxxxxxxx>
> Reviewed-by: Reinette Chatre <reinette.chatre@xxxxxxxxx>
> Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
Reviewed-by: Eric Auger <eric.auger@xxxxxxxxxx>

Eric
> ---
> drivers/vfio/pci/vfio_pci_intrs.c | 145 ++++++++++++++++--------------
> 1 file changed, 78 insertions(+), 67 deletions(-)
>
> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
> index 75c85eec21b3..fb5392b749ff 100644
> --- a/drivers/vfio/pci/vfio_pci_intrs.c
> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> @@ -90,11 +90,15 @@ static void vfio_send_intx_eventfd(void *opaque, void *unused)
>
> if (likely(is_intx(vdev) && !vdev->virq_disabled)) {
> struct vfio_pci_irq_ctx *ctx;
> + struct eventfd_ctx *trigger;
>
> ctx = vfio_irq_ctx_get(vdev, 0);
> if (WARN_ON_ONCE(!ctx))
> return;
> - eventfd_signal(ctx->trigger);
> +
> + trigger = READ_ONCE(ctx->trigger);
> + if (likely(trigger))
> + eventfd_signal(trigger);
> }
> }
>
> @@ -253,100 +257,100 @@ static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
> return ret;
> }
>
> -static int vfio_intx_enable(struct vfio_pci_core_device *vdev)
> +static int vfio_intx_enable(struct vfio_pci_core_device *vdev,
> + struct eventfd_ctx *trigger)
> {
> + struct pci_dev *pdev = vdev->pdev;
> struct vfio_pci_irq_ctx *ctx;
> + unsigned long irqflags;
> + char *name;
> + int ret;
>
> if (!is_irq_none(vdev))
> return -EINVAL;
>
> - if (!vdev->pdev->irq)
> + if (!pdev->irq)
> return -ENODEV;
>
> + name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", pci_name(pdev));
> + if (!name)
> + return -ENOMEM;
> +
> ctx = vfio_irq_ctx_alloc(vdev, 0);
> if (!ctx)
> return -ENOMEM;
>
> + ctx->name = name;
> + ctx->trigger = trigger;
> +
> /*
> - * If the virtual interrupt is masked, restore it. Devices
> - * supporting DisINTx can be masked at the hardware level
> - * here, non-PCI-2.3 devices will have to wait until the
> - * interrupt is enabled.
> + * Fill the initial masked state based on virq_disabled. After
> + * enable, changing the DisINTx bit in vconfig directly changes INTx
> + * masking. igate prevents races during setup; once running, masked
> + * is protected via irqlock.
> + *
> + * Devices supporting DisINTx also reflect the current mask state in
> + * the physical DisINTx bit, which is not affected during IRQ setup.
> + *
> + * Devices without DisINTx support require an exclusive interrupt.
> + * IRQ masking is performed at the IRQ chip. Again, igate protects
> + * against races during setup, and IRQ handlers and irqfds are not
> + * yet active, therefore masked is stable and can be used to
> + * conditionally auto-enable the IRQ.
> + *
> + * irq_type must be stable while the IRQ handler is registered,
> + * therefore it must be set before request_irq().
> */
> ctx->masked = vdev->virq_disabled;
> - if (vdev->pci_2_3)
> - pci_intx(vdev->pdev, !ctx->masked);
> + if (vdev->pci_2_3) {
> + pci_intx(pdev, !ctx->masked);
> + irqflags = IRQF_SHARED;
> + } else {
> + irqflags = ctx->masked ? IRQF_NO_AUTOEN : 0;
> + }
>
> vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX;
>
> + ret = request_irq(pdev->irq, vfio_intx_handler,
> + irqflags, ctx->name, vdev);
> + if (ret) {
> + vdev->irq_type = VFIO_PCI_NUM_IRQS;
> + kfree(name);
> + vfio_irq_ctx_free(vdev, ctx, 0);
> + return ret;
> + }
> +
> return 0;
> }
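
A subtlety worth noting here: with IRQF_NO_AUTOEN the line stays disabled
after request_irq() until an explicit enable_irq(), so for non-PCI-2.3
devices the initial mask state is established atomically at request time.
That is what lets the old post-request enable_irq() under irqlock (removed
below) go away; a later unmask enables the line explicitly under
vdev->irqlock.
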
>
> -static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd)
> +static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev,
> + struct eventfd_ctx *trigger)
> {
> struct pci_dev *pdev = vdev->pdev;
> - unsigned long irqflags = IRQF_SHARED;
> struct vfio_pci_irq_ctx *ctx;
> - struct eventfd_ctx *trigger;
> - unsigned long flags;
> - int ret;
> + struct eventfd_ctx *old;
>
> ctx = vfio_irq_ctx_get(vdev, 0);
> if (WARN_ON_ONCE(!ctx))
> return -EINVAL;
>
> - if (ctx->trigger) {
> - free_irq(pdev->irq, vdev);
> - kfree(ctx->name);
> - eventfd_ctx_put(ctx->trigger);
> - ctx->trigger = NULL;
> - }
> -
> - if (fd < 0) /* Disable only */
> - return 0;
> -
> - ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)",
> - pci_name(pdev));
> - if (!ctx->name)
> - return -ENOMEM;
> -
> - trigger = eventfd_ctx_fdget(fd);
> - if (IS_ERR(trigger)) {
> - kfree(ctx->name);
> - return PTR_ERR(trigger);
> - }
> + old = ctx->trigger;
>
> - ctx->trigger = trigger;
> + WRITE_ONCE(ctx->trigger, trigger);
>
> - /*
> - * Devices without DisINTx support require an exclusive interrupt,
> - * IRQ masking is performed at the IRQ chip. The masked status is
> - * protected by vdev->irqlock. Setup the IRQ without auto-enable and
> - * unmask as necessary below under lock. DisINTx is unmodified by
> - * the IRQ configuration and may therefore use auto-enable.
> - */
> - if (!vdev->pci_2_3)
> - irqflags = IRQF_NO_AUTOEN;
> -
> - ret = request_irq(pdev->irq, vfio_intx_handler,
> - irqflags, ctx->name, vdev);
> - if (ret) {
> - ctx->trigger = NULL;
> - kfree(ctx->name);
> - eventfd_ctx_put(trigger);
> - return ret;
> + /* Releasing an old ctx requires synchronizing in-flight users */
> + if (old) {
> + synchronize_irq(pdev->irq);
> + vfio_virqfd_flush_thread(&ctx->unmask);
> + eventfd_ctx_put(old);
> }
>
> - spin_lock_irqsave(&vdev->irqlock, flags);
> - if (!vdev->pci_2_3 && !ctx->masked)
> - enable_irq(pdev->irq);
> - spin_unlock_irqrestore(&vdev->irqlock, flags);
> -
> return 0;
> }
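
Note the ordering in the replace/disable path: the new trigger is published
with WRITE_ONCE() before synchronize_irq(), so a hard-IRQ handler that raced
with the update either completes against the old trigger before
eventfd_ctx_put() drops its reference, or re-reads and sees the new value.
vfio_virqfd_flush_thread() extends the same guarantee to the unmask-irqfd
path, which signals the trigger from outside hard-IRQ context.
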
>
> static void vfio_intx_disable(struct vfio_pci_core_device *vdev)
> {
> + struct pci_dev *pdev = vdev->pdev;
> struct vfio_pci_irq_ctx *ctx;
>
> ctx = vfio_irq_ctx_get(vdev, 0);
> @@ -354,10 +358,13 @@ static void vfio_intx_disable(struct vfio_pci_core_device *vdev)
> if (ctx) {
> vfio_virqfd_disable(&ctx->unmask);
> vfio_virqfd_disable(&ctx->mask);
> + free_irq(pdev->irq, vdev);
> + if (ctx->trigger)
> + eventfd_ctx_put(ctx->trigger);
> + kfree(ctx->name);
> + vfio_irq_ctx_free(vdev, ctx, 0);
> }
> - vfio_intx_set_signal(vdev, -1);
> vdev->irq_type = VFIO_PCI_NUM_IRQS;
> - vfio_irq_ctx_free(vdev, ctx, 0);
> }
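
The teardown order mirrors that: the mask/unmask irqfds are shut down first,
then the handler is freed, and only then is the trigger reference dropped,
so nothing that can still signal the eventfd outlives the context object.
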
>
> /*
> @@ -641,19 +648,23 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
> return -EINVAL;
>
> if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
> + struct eventfd_ctx *trigger = NULL;
> int32_t fd = *(int32_t *)data;
> int ret;
>
> - if (is_intx(vdev))
> - return vfio_intx_set_signal(vdev, fd);
> + if (fd >= 0) {
> + trigger = eventfd_ctx_fdget(fd);
> + if (IS_ERR(trigger))
> + return PTR_ERR(trigger);
> + }
>
> - ret = vfio_intx_enable(vdev);
> - if (ret)
> - return ret;
> + if (is_intx(vdev))
> + ret = vfio_intx_set_signal(vdev, trigger);
> + else
> + ret = vfio_intx_enable(vdev, trigger);
>
> - ret = vfio_intx_set_signal(vdev, fd);
> - if (ret)
> - vfio_intx_disable(vdev);
> + if (ret && trigger)
> + eventfd_ctx_put(trigger);
>
> return ret;
> }
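
For completeness, the userspace path that exercises this: a trigger is
installed (or replaced) through VFIO_DEVICE_SET_IRQS with an eventfd, and
deconfigured by passing fd = -1, which now leaves the persistent handler
in place with a NULL trigger. A minimal sketch, error handling omitted and
assuming an already-open VFIO device fd:

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	static int set_intx_trigger(int device, int32_t fd)
	{
		char buf[sizeof(struct vfio_irq_set) + sizeof(int32_t)];
		struct vfio_irq_set *irq_set = (struct vfio_irq_set *)buf;

		memset(buf, 0, sizeof(buf));
		irq_set->argsz = sizeof(buf);
		irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
				 VFIO_IRQ_SET_ACTION_TRIGGER;
		irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
		irq_set->start = 0;
		irq_set->count = 1;
		*(int32_t *)irq_set->data = fd;	/* fd = -1 deconfigures */

		return ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set);
	}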