Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops

From: Jason Gunthorpe
Date: Thu Sep 01 2022 - 16:37:32 EST


On Thu, Sep 01, 2022 at 12:14:24PM -0400, Matthew Rosato wrote:
> On 9/1/22 6:25 AM, Robin Murphy wrote:
> > On 2022-08-31 21:12, Matthew Rosato wrote:
> >> With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev
> >> calls") s390-iommu is supposed to handle dynamic switching between IOMMU
> >> domains and the DMA API handling.  However, this commit does not
> >> sufficiently handle the case where the device is released via a call
> >> to the release_device op as it may occur at the same time as an opposing
> >> attach_dev or detach_dev since the group mutex is not held over
> >> release_device.  This was observed when the device is deconfigured within a
> >> small window during vfio-pci initialization, and can result in WARNs and
> >> potential kernel panics.
> >
> > Hmm, the more I think about it, something doesn't sit right about this whole situation... release_device is called via the notifier from device_del() after the device has been removed from its parent bus and largely dismantled; it should definitely not still have a driver bound by that point, so how is VFIO doing things that manage to race at all?
> >
> > Robin.
>
> So, I generally have seen the issue manifest as one of the calls
> into the iommu core from __vfio_group_unset_container
> (e.g. iommu_detach_group via vfio_iommu_type1) failing with a WARN.
> This happens when the vfio group fd is released, which could be
> coming e.g. from a userspace ioctl VFIO_GROUP_UNSET_CONTAINER.
> AFAICT there's nothing serializing the notion of calling into the
> iommu core here against a device that is simultaneously going
> through release_device (because we don't enter release_device with
> the group mutex held), resulting in unpredictable behavior between
> the dueling attach_dev/detach_dev and the release_device for
> s390-iommu at least.

Oh, this is a vfio bug.

dev->iommu_group is only a valid pointer as long as a driver is attached
to the device.

vfio copies dev->iommu_group into struct vfio_group during probe()
but then lets the vfio_group live independently. In particular, the
driver can be remove()'d and the vfio_group keeps on going.

Thus it is possible to UAF the iommu_group by referencing it through
the vfio_group.

We must wait during remove for all the vfio_groups to stop
referencing iommu_group.
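
Roughly the interleaving Matthew describes, as I read it (just a sketch
of the shape of the race, not the exact call chains):

   device teardown                       group fd teardown
   ---------------                       -----------------
   vfio-pci remove()                     close() / VFIO_GROUP_UNSET_CONTAINER
     vfio_unregister_group_dev()           __vfio_group_unset_container()
   device_del() continues                    iommu_detach_group(...,
     ...release_device()                        group->iommu_group)

Nothing makes the right hand side wait for, or serialize against, the
left, so group->iommu_group can be stale by the time it is used.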

Something like this or so:

diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index eb714a484662fc..d8f40b22c98ddb 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -65,7 +65,15 @@ struct vfio_container {
struct vfio_group {
struct device dev;
struct cdev cdev;
+ /*
+ * When drivers is non-zero a driver is attached to the struct device
+ * that provided the iommu_group and thus the iommu_group is a valid
+ * pointer. When drivers is 0 the driver is being detached. Once users
+ * reaches 0 then the iommu_group is invalid.
+ */
+ refcount_t drivers;
refcount_t users;
+ struct completion comp;
unsigned int container_users;
struct iommu_group *iommu_group;
struct vfio_container *container;
@@ -276,8 +284,6 @@ void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);

-static void vfio_group_get(struct vfio_group *group);
-
/*
* Container objects - containers are created when /dev/vfio/vfio is
* opened, but their lifecycle extends until the last user is done, so
@@ -305,16 +311,21 @@ static void vfio_container_put(struct vfio_container *container)
/*
* Group objects - create, release, get, put, search
*/
+
+ /*
+ * This returns a driver reference. It can only be used in the probe function
+ * of a device_driver, eg as part of the internal implementation of
+ * __vfio_register_dev().
+ */
static struct vfio_group *
__vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
struct vfio_group *group;

list_for_each_entry(group, &vfio.group_list, vfio_next) {
- if (group->iommu_group == iommu_group) {
- vfio_group_get(group);
+ if (group->iommu_group == iommu_group &&
+ refcount_inc_not_zero(&group->drivers))
return group;
- }
}
return NULL;
}
@@ -364,6 +375,8 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
group->cdev.owner = THIS_MODULE;

refcount_set(&group->users, 1);
+ refcount_set(&group->drivers, 1);
+ init_completion(&group->comp);
init_rwsem(&group->group_rwsem);
INIT_LIST_HEAD(&group->device_list);
mutex_init(&group->device_lock);
@@ -422,8 +435,28 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,

static void vfio_group_put(struct vfio_group *group)
{
- if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
- return;
+ if (refcount_dec_and_test(&group->users))
+ complete(&group->comp);
+}
+
+/*
+ * When the drivers count reaches 0 then the group must be destroyed
+ * immediately. A zero driver group is a zombie awaiting destruction.
+ */
+static void vfio_group_remove(struct vfio_group *group)
+{
+ /* Matches the get from vfio_group_alloc() */
+ vfio_group_put(group);
+
+ cdev_device_del(&group->cdev, &group->dev);
+
+ /*
+ * Before we allow the last driver in the group to be unplugged the
+ * group must be sanitized so nothing else is or can reference it. This
+ * is because the group->iommu_group pointer is only valid so long as a
+ * VFIO device driver is attached to a device belonging to the group.
+ */
+ wait_for_completion(&group->comp);

/*
* These data structures all have paired operations that can only be
@@ -434,19 +467,15 @@ static void vfio_group_put(struct vfio_group *group)
WARN_ON(!list_empty(&group->device_list));
WARN_ON(group->container || group->container_users);
WARN_ON(group->notifier.head);
+ group->iommu_group = NULL;

+ mutex_lock(&vfio.group_lock);
list_del(&group->vfio_next);
- cdev_device_del(&group->cdev, &group->dev);
mutex_unlock(&vfio.group_lock);

put_device(&group->dev);
}

-static void vfio_group_get(struct vfio_group *group)
-{
- refcount_inc(&group->users);
-}
-
/*
* Device objects - create, release, get, put, search
*/
@@ -462,22 +491,6 @@ static bool vfio_device_try_get(struct vfio_device *device)
return refcount_inc_not_zero(&device->refcount);
}

-static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
- struct device *dev)
-{
- struct vfio_device *device;
-
- mutex_lock(&group->device_lock);
- list_for_each_entry(device, &group->device_list, group_next) {
- if (device->dev == dev && vfio_device_try_get(device)) {
- mutex_unlock(&group->device_lock);
- return device;
- }
- }
- mutex_unlock(&group->device_lock);
- return NULL;
-}
-
/*
* VFIO driver API
*/
@@ -576,8 +589,10 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
static int __vfio_register_dev(struct vfio_device *device,
struct vfio_group *group)
{
- struct vfio_device *existing_device;
-
+ /*
+ * In all cases group is the output of one of the group allocation functions
+ * and we have group->drivers incremented for us.
+ */
if (IS_ERR(group))
return PTR_ERR(group);

@@ -588,18 +603,6 @@ static int __vfio_register_dev(struct vfio_device *device,
if (!device->dev_set)
vfio_assign_device_set(device, device);

- existing_device = vfio_group_get_device(group, device->dev);
- if (existing_device) {
- dev_WARN(device->dev, "Device already exists on group %d\n",
- iommu_group_id(group->iommu_group));
- vfio_device_put(existing_device);
- if (group->type == VFIO_NO_IOMMU ||
- group->type == VFIO_EMULATED_IOMMU)
- iommu_group_remove_device(device->dev);
- vfio_group_put(group);
- return -EBUSY;
- }
-
/* Our reference on group is moved to the device */
device->group = group;

@@ -702,8 +705,9 @@ void vfio_unregister_group_dev(struct vfio_device *device)
if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
iommu_group_remove_device(device->dev);

- /* Matches the get in vfio_register_group_dev() */
- vfio_group_put(group);
+ /* Matches the alloc get in vfio_register_group_dev() */
+ if (refcount_dec_and_test(&group->drivers))
+ vfio_group_remove(group);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
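
For completeness, the lifetime idiom the diff leans on, boiled down to a
self-contained sketch (made-up names, not the vfio structures): the last
"driver" to go away blocks until every remaining "user" is gone, so the
pointer the users borrow can never be dereferenced after it stops being
valid.

#include <linux/refcount.h>
#include <linux/completion.h>

struct obj {
	refcount_t drivers;	/* things that keep the borrowed pointer valid */
	refcount_t users;	/* open fds, lookups, etc. */
	struct completion comp;	/* signalled when users hits zero */
};

static void obj_init(struct obj *o)
{
	refcount_set(&o->drivers, 1);
	refcount_set(&o->users, 1);	/* the allocation's own reference */
	init_completion(&o->comp);
}

static void obj_put(struct obj *o)
{
	if (refcount_dec_and_test(&o->users))
		complete(&o->comp);
}

static void obj_driver_unregister(struct obj *o)
{
	if (!refcount_dec_and_test(&o->drivers))
		return;
	obj_put(o);			/* drop the allocation's reference */
	wait_for_completion(&o->comp);	/* wait out all remaining users */
	/* only now is it safe to invalidate what the users were borrowing */
}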