Re: [RFC] vfio/type1: Add IOVA_RANGE capability support

From: Pierre Morel
Date: Thu Dec 07 2017 - 07:55:47 EST


On 06/12/2017 17:15, Shameerali Kolothum Thodi wrote:
Hi Pierre,

-----Original Message-----
From: Shameerali Kolothum Thodi
Sent: Wednesday, December 06, 2017 4:08 PM
To: alex.williamson@xxxxxxxxxx; eric.auger@xxxxxxxxxx;
pmorel@xxxxxxxxxxxxxxxxxx
Cc: kvm@xxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; Linuxarm
<linuxarm@xxxxxxxxxx>; Shameerali Kolothum Thodi
<shameerali.kolothum.thodi@xxxxxxxxxx>
Subject: [RFC] vfio/type1: Add IOVA_RANGE capability support

This patch allows the user-space to retrieve the supported
IOVA range(s), excluding any reserved regions. The implementation
is based on capability chains, added to the VFIO_IOMMU_GET_INFO ioctl.

This is following the discussions here[1] and is based on the RFC patch[2].

ToDo:
- This currently derives the default supported iova range from the first
iommu domain. This needs to be changed to go through the domain_list
instead.
- Sync with Pierre's patch[3].

Thanks to Eric[1], came to know that you have posted a patch to retrieve the
iommu aperture info. This RFC does a similar thing but try to take care of
any reserved regions and adds to the capability chain.

Please take a look and if there is a possibility to sync up your next revision
and this, please let me know.

Hi Shameer,

Indeed it is close to what I was developing.

I have a single concern, the aperture is strongly hardware related while reserved areas are much more flexible.

As a consequence, I think it would be better if they were handled in separate capabilities and let the user space decide if it wants to know about one or the other.

For my immediate needs, this patch would be OK since we (s390x) do not use reserved regions.

@Alex: what do you prefer
If we need two capabilities, I will send the patch serie I made on the aperture capability for VFIO IOMMU.
If not I will use Shameer's patch.

Thanks,

Best regards,

Pierre


Thanks,
Shameer

1. https://patchwork.kernel.org/patch/10056967/

1.https://lists.gnu.org/archive/html/qemu-devel/2017-11/msg03651.html
2.https://lists.linuxfoundation.org/pipermail/iommu/2016-
November/019002.html
3.https://patchwork.kernel.org/patch/10084655/

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx>
---
drivers/vfio/vfio_iommu_type1.c | 172
+++++++++++++++++++++++++++++++++++++++-
include/uapi/linux/vfio.h | 13 +++
2 files changed, 184 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e30e29a..72ca78a 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -28,6 +28,7 @@
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/iommu.h>
+#include <linux/list_sort.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/rbtree.h>
@@ -92,6 +93,12 @@ struct vfio_group {
struct list_head next;
};

+struct vfio_iommu_iova {
+ struct list_head list;
+ phys_addr_t start;
+ phys_addr_t end;
+};
+
/*
* Guest RAM pinning working set or DMA target
*/
@@ -1537,6 +1544,144 @@ static int vfio_domains_have_iommu_cache(struct
vfio_iommu *iommu)
return ret;
}

+static int vfio_add_iova_cap(struct vfio_info_cap *caps, u64 start, u64 end)
+{
+ struct vfio_iommu_type1_info_cap_iova_range *cap;
+ struct vfio_info_cap_header *header;
+
+ header = vfio_info_cap_add(caps, sizeof(*cap),
+ VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
+ if (IS_ERR(header))
+ return PTR_ERR(header);
+
+ cap = container_of(header,
+ struct vfio_iommu_type1_info_cap_iova_range,
+ header);
+
+ cap->start = start;
+ cap->end = end;
+
+ return 0;
+}
+
+static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
+ struct list_head *head)
+{
+ struct vfio_iommu_iova *region;
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL);
+ if (!region)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&region->list);
+ region->start = start;
+ region->end = end;
+
+ list_add_tail(&region->list, head);
+ return 0;
+}
+
+/*
+ * Check and update iova region list in case a reserved region
+ * overlaps the iommu iova range.
+ */
+static int vfio_update_iommu_iova_range(phys_addr_t start, phys_addr_t end,
+ struct list_head *iova)
+{
+ struct vfio_iommu_iova *node;
+ phys_addr_t a, b;
+ int ret = 0;
+
+ if (list_empty(iova))
+ return -ENODEV;
+
+ node = list_last_entry(iova, struct vfio_iommu_iova, list);
+ a = node->start;
+ b = node->end;
+
+ /* No overlap */
+ if ((start > b) || (end < a))
+ return 0;
+
+ if (start > a)
+ ret = vfio_insert_iova(a, start - 1, &node->list);
+ if (ret)
+ goto done;
+ if (end < b)
+ ret = vfio_insert_iova(end + 1, b, &node->list);
+
+done:
+ list_del(&node->list);
+ kfree(node);
+
+ return ret;
+}
+
+static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct iommu_resv_region *ra, *rb;
+
+ ra = container_of(a, struct iommu_resv_region, list);
+ rb = container_of(b, struct iommu_resv_region, list);
+
+ if (ra->start < rb->start)
+ return -1;
+ if (ra->start > rb->start)
+ return 1;
+ return 0;
+}
+
+static int vfio_build_iommu_iova_caps(struct vfio_iommu *iommu,
+ struct vfio_info_cap *caps)
+{
+ struct iommu_resv_region *resv, *resv_next;
+ struct vfio_iommu_iova *iova, *iova_next;
+ struct list_head group_resv_regions, vfio_iova_regions;
+ struct vfio_domain *domain;
+ struct vfio_group *g;
+ phys_addr_t start, end;
+ int ret = 0;
+
+ domain = list_first_entry(&iommu->domain_list,
+ struct vfio_domain, next);
+ /* Get the default iova range supported */
+ start = domain->domain->geometry.aperture_start;
+ end = domain->domain->geometry.aperture_end;
+ INIT_LIST_HEAD(&vfio_iova_regions);
+ vfio_insert_iova(start, end, &vfio_iova_regions);
+
+ /* Get reserved regions if any */
+ INIT_LIST_HEAD(&group_resv_regions);
+ list_for_each_entry(g, &domain->group_list, next)
+ iommu_get_group_resv_regions(g->iommu_group,
+ &group_resv_regions);
+ list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
+
+ /* Update iova range excluding reserved regions */
+ list_for_each_entry(resv, &group_resv_regions, list) {
+ ret = vfio_update_iommu_iova_range(resv->start,
+ resv->start + resv->length - 1,
+ &vfio_iova_regions);
+ if (ret)
+ goto done;
+ }
+
+ list_for_each_entry(iova, &vfio_iova_regions, list) {
+ ret = vfio_add_iova_cap(caps, iova->start, iova->end);
+ if (ret)
+ goto done;
+ }
+
+done:
+ list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
+ kfree(resv);
+
+ list_for_each_entry_safe(iova, iova_next, &vfio_iova_regions, list)
+ kfree(iova);
+
+ return ret;
+}
+
static long vfio_iommu_type1_ioctl(void *iommu_data,
unsigned int cmd, unsigned long arg)
{
@@ -1558,8 +1703,10 @@ static long vfio_iommu_type1_ioctl(void
*iommu_data,
}
} else if (cmd == VFIO_IOMMU_GET_INFO) {
struct vfio_iommu_type1_info info;
+ struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+ int ret;

- minsz = offsetofend(struct vfio_iommu_type1_info,
iova_pgsizes);
+ minsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);

if (copy_from_user(&info, (void __user *)arg, minsz))
return -EFAULT;
@@ -1571,6 +1718,29 @@ static long vfio_iommu_type1_ioctl(void
*iommu_data,

info.iova_pgsizes = vfio_pgsize_bitmap(iommu);

+ ret = vfio_build_iommu_iova_caps(iommu, &caps);
+ if (ret)
+ return ret;
+
+ if (caps.size) {
+ info.flags |= VFIO_IOMMU_INFO_CAPS;
+ if (info.argsz < sizeof(info) + caps.size) {
+ info.argsz = sizeof(info) + caps.size;
+ info.cap_offset = 0;
+ } else {
+ vfio_info_cap_shift(&caps, sizeof(info));
+ if (copy_to_user((void __user *)arg +
+ sizeof(info), caps.buf,
+ caps.size)) {
+ kfree(caps.buf);
+ return -EFAULT;
+ }
+ info.cap_offset = sizeof(info);
+ }
+
+ kfree(caps.buf);
+ }
+
return copy_to_user((void __user *)arg, &info, minsz) ?
-EFAULT : 0;

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index e3301db..c4e338b 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -517,7 +517,20 @@ struct vfio_iommu_type1_info {
__u32 argsz;
__u32 flags;
#define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */
+#define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */
__u64 iova_pgsizes; /* Bitmap of supported page sizes */
+ __u32 cap_offset; /* Offset within info struct of first cap */
+ __u32 __resv;
+};
+
+/*
+ * The IOVA_RANGE capability allows to report the IOVA range(s),
+ */
+#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE 1
+struct vfio_iommu_type1_info_cap_iova_range {
+ struct vfio_info_cap_header header;
+ __u64 start;
+ __u64 end;
};

#define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
--
1.9.1




--
Pierre Morel
Linux/KVM/QEMU in BÃblingen - Germany