Re: [RFC] vfio/type1: Add IOVA_RANGE capability support

From: Alex Williamson
Date: Thu Dec 07 2017 - 11:08:20 EST


On Wed, 6 Dec 2017 16:07:36 +0000
Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx> wrote:

> This patch allows the user-space to retrieve the supported
> IOVA range(s), excluding any reserved regions. The implementation
> is based on capability chains, added to the VFIO_IOMMU_GET_INFO ioctl.
>
> This is following the discussions here[1] and is based on the RFC patch[2].
>
> ToDo:
> - This currently derives the default supported iova range from the first
> iommu domain. This needs to be changed to go through the domain_list
> instead.
> - Sync with Pierre's patch[3].
>
> 1.https://lists.gnu.org/archive/html/qemu-devel/2017-11/msg03651.html
> 2.https://lists.linuxfoundation.org/pipermail/iommu/2016-November/019002.html
> 3.https://patchwork.kernel.org/patch/10084655/
>
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx>
> ---
> drivers/vfio/vfio_iommu_type1.c | 172 +++++++++++++++++++++++++++++++++++++++-
> include/uapi/linux/vfio.h | 13 +++
> 2 files changed, 184 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index e30e29a..72ca78a 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -28,6 +28,7 @@
> #include <linux/device.h>
> #include <linux/fs.h>
> #include <linux/iommu.h>
> +#include <linux/list_sort.h>
> #include <linux/module.h>
> #include <linux/mm.h>
> #include <linux/rbtree.h>
> @@ -92,6 +93,12 @@ struct vfio_group {
> struct list_head next;
> };
>
> +struct vfio_iommu_iova {
> + struct list_head list;
> + phys_addr_t start;
> + phys_addr_t end;
> +};
> +
> /*
> * Guest RAM pinning working set or DMA target
> */
> @@ -1537,6 +1544,144 @@ static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
> return ret;
> }
>
> +static int vfio_add_iova_cap(struct vfio_info_cap *caps, u64 start, u64 end)
> +{
> + struct vfio_iommu_type1_info_cap_iova_range *cap;
> + struct vfio_info_cap_header *header;
> +
> + header = vfio_info_cap_add(caps, sizeof(*cap),
> + VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
> + if (IS_ERR(header))
> + return PTR_ERR(header);
> +
> + cap = container_of(header,
> + struct vfio_iommu_type1_info_cap_iova_range,
> + header);
> +
> + cap->start = start;
> + cap->end = end;
> +
> + return 0;
> +}
> +
> +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
> + struct list_head *head)
> +{
> + struct vfio_iommu_iova *region;
> +
> + region = kzalloc(sizeof(*region), GFP_KERNEL);

You're initializing every field, so a zero'd allocation is not
necessary here.

> + if (!region)
> + return -ENOMEM;
> +
> + INIT_LIST_HEAD(&region->list);
> + region->start = start;
> + region->end = end;
> +
> + list_add_tail(&region->list, head);
> + return 0;
> +}
> +
> +/*
> + * Check and update iova region list in case a reserved region
> + * overlaps the iommu iova range.
> + */
> +static int vfio_update_iommu_iova_range(phys_addr_t start, phys_addr_t end,
> + struct list_head *iova)
> +{
> + struct vfio_iommu_iova *node;
> + phys_addr_t a, b;
> + int ret = 0;
> +
> + if (list_empty(iova))
> + return -ENODEV;
> +
> + node = list_last_entry(iova, struct vfio_iommu_iova, list);
> + a = node->start;
> + b = node->end;
> +
> + /* No overlap */
> + if ((start > b) || (end < a))
> + return 0;
> +
> + if (start > a)
> + ret = vfio_insert_iova(a, start - 1, &node->list);
> + if (ret)
> + goto done;
> + if (end < b)
> + ret = vfio_insert_iova(end + 1, b, &node->list);
> +
> +done:
> + list_del(&node->list);
> + kfree(node);
> +
> + return ret;
> +}
> +
> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
> +{
> + struct iommu_resv_region *ra, *rb;
> +
> + ra = container_of(a, struct iommu_resv_region, list);
> + rb = container_of(b, struct iommu_resv_region, list);
> +
> + if (ra->start < rb->start)
> + return -1;
> + if (ra->start > rb->start)
> + return 1;
> + return 0;
> +}
> +
> +static int vfio_build_iommu_iova_caps(struct vfio_iommu *iommu,
> + struct vfio_info_cap *caps)
> +{
> + struct iommu_resv_region *resv, *resv_next;
> + struct vfio_iommu_iova *iova, *iova_next;
> + struct list_head group_resv_regions, vfio_iova_regions;
> + struct vfio_domain *domain;
> + struct vfio_group *g;
> + phys_addr_t start, end;
> + int ret = 0;
> +
> + domain = list_first_entry(&iommu->domain_list,
> + struct vfio_domain, next);

How do you resolve that we can have multiple domains in a container and
each my provide different apertures? Eric noted that the attach group
function attempts to do compatibility checks, so we need to figure out
how we determine IOVA apertures are compatible. The most obvious
answer seems to be that we should look through the dma_list on the
vfio_iommu and determine if there are existing mappings that are
incompatible with the new domain. That also suggests that we should
maintain and update a list of valid iova ranges as we go such that we
can reject mappings outside of those valid ranges.

> + /* Get the default iova range supported */
> + start = domain->domain->geometry.aperture_start;
> + end = domain->domain->geometry.aperture_end;

There's an IOMMU API for this, iommu_domain_get_attr() with
DOMAIN_ATTR_GEOMETRY.

> + INIT_LIST_HEAD(&vfio_iova_regions);
> + vfio_insert_iova(start, end, &vfio_iova_regions);
> +
> + /* Get reserved regions if any */
> + INIT_LIST_HEAD(&group_resv_regions);
> + list_for_each_entry(g, &domain->group_list, next)
> + iommu_get_group_resv_regions(g->iommu_group,
> + &group_resv_regions);

Reserved ranges also need to be accounted for as groups are added to a
domain. Again, if dma list includes any mappings overlapping reserved
ranges (ie. holes in the iova space), the group attach should fail.

> + list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
> +
> + /* Update iova range excluding reserved regions */
> + list_for_each_entry(resv, &group_resv_regions, list) {
> + ret = vfio_update_iommu_iova_range(resv->start,
> + resv->start + resv->length - 1,
> + &vfio_iova_regions);
> + if (ret)
> + goto done;
> + }
> +
> + list_for_each_entry(iova, &vfio_iova_regions, list) {
> + ret = vfio_add_iova_cap(caps, iova->start, iova->end);

It seems like a fair bit of overhead and nuisance for the user to have
each iova range added as a separate capability. I think I'd prefer to
see the capability size be based on a number of entries field.

> + if (ret)
> + goto done;
> + }
> +
> +done:
> + list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> + kfree(resv);
> +
> + list_for_each_entry_safe(iova, iova_next, &vfio_iova_regions, list)
> + kfree(iova);
> +
> + return ret;
> +}
> +
> static long vfio_iommu_type1_ioctl(void *iommu_data,
> unsigned int cmd, unsigned long arg)
> {
> @@ -1558,8 +1703,10 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> }
> } else if (cmd == VFIO_IOMMU_GET_INFO) {
> struct vfio_iommu_type1_info info;
> + struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
> + int ret;
>
> - minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
> + minsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);

This is incompatible, it will break existing userspace. See
include/uapi/linux/vfio.h:

/*
* Callers of INFO ioctls passing insufficiently sized buffers will see
* the capability chain flag bit set, a zero value for the first capability
* offset (if available within the provided argsz), and argsz will be
* updated to report the necessary buffer size. For compatibility, the
* INFO ioctl will not report error in this case, but the capability chain
* will not be available.
*/

>
> if (copy_from_user(&info, (void __user *)arg, minsz))
> return -EFAULT;
> @@ -1571,6 +1718,29 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>
> info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
>
> + ret = vfio_build_iommu_iova_caps(iommu, &caps);
> + if (ret)
> + return ret;
> +
> + if (caps.size) {
> + info.flags |= VFIO_IOMMU_INFO_CAPS;
> + if (info.argsz < sizeof(info) + caps.size) {
> + info.argsz = sizeof(info) + caps.size;
> + info.cap_offset = 0;
> + } else {
> + vfio_info_cap_shift(&caps, sizeof(info));
> + if (copy_to_user((void __user *)arg +
> + sizeof(info), caps.buf,
> + caps.size)) {
> + kfree(caps.buf);
> + return -EFAULT;
> + }
> + info.cap_offset = sizeof(info);
> + }
> +
> + kfree(caps.buf);
> + }
> +
> return copy_to_user((void __user *)arg, &info, minsz) ?
> -EFAULT : 0;
>
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index e3301db..c4e338b 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -517,7 +517,20 @@ struct vfio_iommu_type1_info {
> __u32 argsz;
> __u32 flags;
> #define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */
> +#define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */
> __u64 iova_pgsizes; /* Bitmap of supported page sizes */
> + __u32 cap_offset; /* Offset within info struct of first cap */
> + __u32 __resv;

Hmm, I'm not sure why the additional reserved field here is necessary.
I guess we need 8-byte alignment of this iova range capability, but
that should probably be accounted for explicitly as the capability is
constructed in the buffer rather than implicitly by the ending offset
of the static structure.

> +};
> +
> +/*
> + * The IOVA_RANGE capability allows to report the IOVA range(s),
> + */
> +#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE 1
> +struct vfio_iommu_type1_info_cap_iova_range {
> + struct vfio_info_cap_header header;
> + __u64 start;
> + __u64 end;
> };
>

It should be noted that this is version 1 of this structure.

> #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)

Thanks,
Alex