Re: [PATCH rc 3/8] iommu/vt-d: Add missing iotlb flush for parent domain

From: Jason Gunthorpe
Date: Thu Feb 22 2024 - 10:16:40 EST


On Thu, Feb 22, 2024 at 04:34:10PM +0800, Yi Liu wrote:
> > It doesn't mean that the S2 is globally shared across all the nesting
> > translations (like ARM does), and you still have to iterate over every
> > nesting DID.
> >
> > In light of that this design seems to have gone a bit off..
> >
> > A domain should have a list of places that need invalidation,
> > specifically a list of DIDs and ATCs that need an invalidation to be
> > issued.
> >
> > Instead we now somehow have 4 different lists in the domain the
> > invalidation code iterates over?
> >
> > So I would think this:
> >
> > struct dmar_domain {
> > struct xarray iommu_array; /* Attached IOMMU array */
> > struct list_head devices; /* all devices' list */
> > struct list_head dev_pasids; /* all attached pasids */
> > struct list_head s1_domains;
> >
> > Would make sense to be collapsed into one logical list of attached
> > devices:
> >
> > struct intel_iommu_domain_attachment {
> > unsigned int did;
> > ioasid_t pasid;
> > struct device_domain_info *info;
> > list_head item;
> > };
> >
> > When you attach a S1/S2 nest you allocate two of the above structs and
> > one is linked on the S1 and one is linked on the S2..
>
> yes, this shall work. ATC flushing should be fine. But there can be a
> drawback when flushing DIDs. VT-d side, if there are multiple devices
> behind the same iommu unit, just need one DID flushing is enough. But
> after the above change, the number of DID flushing would increase per
> the number of devices. Although no functional issue, but it submits
> duplicated invalidation.

At least the three server drivers all have this same problem, I would
like to seem some core code to help solve it, since it is tricky and
the RCU idea is good..

Collapsing invalidations can be done by sorting, I think this was
Robin's suggestion. But we could also easially maintain multiple list
threads hidden inside the API, or allocate a multi-level list.

Something very approximately like this:

Jason

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index d14413916f93a0..7b2de139e7c437 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -38,6 +38,8 @@

#include "iommu-sva.h"

+#include <linux/iommu-alist.h>
+
static struct kset *iommu_group_kset;
static DEFINE_IDA(iommu_group_ida);
static DEFINE_IDA(iommu_global_pasid_ida);
diff --git a/include/linux/iommu-alist.h b/include/linux/iommu-alist.h
new file mode 100644
index 00000000000000..7a9243f8b2b5f8
--- /dev/null
+++ b/include/linux/iommu-alist.h
@@ -0,0 +1,126 @@
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/iommu.h>
+
+/*
+ * Place in an iommu_domain to keep track of what devices have been attached to
+ * it.
+ */
+struct iommu_attach_list {
+ spinlock_t lock;
+ struct list_head all;
+};
+
+/*
+ * Allocate for every time a device is attached to a domain
+ */
+struct iommu_attach_elm {
+ /*
+ * Note that this pointer is accessed under RCU, the driver has to be
+ * careful to rcu free it.
+ */
+ struct iommu_device *iommu_device;
+ ioasid_t pasid;
+ u8 using_atc;
+ struct list_head all_item;
+
+ /* May not be accessed under RCU! */
+ struct device *device;
+};
+
+void iommu_attach_init(struct iommu_attach_list *alist)
+{
+ spin_lock_init(&alist->lock);
+}
+
+int iommu_attach_add(struct iommu_attach_list *alist,
+ struct iommu_attach_elm *elm)
+{
+ struct list_head *insert_after;
+
+ spin_lock(&alist->lock);
+ insert_after = list_find_sorted_insertion(alist, elm, cmp);
+ list_add_rcu(&elm->all_item, insert_after);
+ spin_unlock(&alist->lock);
+}
+
+void iommu_attach_del_free(struct iommu_attach_list *alist, struct iommu_attach_elm *elm)
+{
+ spin_lock(&alist->lock);
+ list_del_rcu(&elm->all_item);
+ spin_unlock(&alist->lock);
+ /* assumes 0 offset */
+ kfree_rcu_mightsleep(elm);
+}
+
+static struct iommu_attach_elm *
+__iommu_attach_next_iommu(struct iommu_attach_list *alist,
+ struct iommu_attach_elm *pos,
+ struct iommu_attach_elm *last)
+{
+ struct iommu_attach_elm *next;
+
+ do {
+ next = list_next_or_null_rcu(&alist->all, &pos->all_item,
+ struct iommu_attach_elm, all_item);
+ if (!next)
+ return NULL;
+ if (!last)
+ return next;
+ } while (pos->iommu_device == next->iommu_device);
+ return next;
+}
+
+/* assumes 0 offset */
+#define iommu_attach_next_iommu(alist, pos, last, member) \
+ container_of(__iommu_attach_next_iommu(alist, &(pos)->member, \
+ &(last)->member), \
+ typeof(*pos), member)
+
+#define iommu_attach_for_each_iommu(pos, last, alist, member) \
+ for (({ \
+ RCU_LOCKDEP_WARN( \
+ !rcu_read_lock_any_held(), \
+ "RCU-list traversed in non-reader section!"); \
+ }), \
+ pos = list_first_or_null_rcu(&(alist)->all, typeof(*pos), \
+ member.all_item), \
+ last = NULL; \
+ pos; pos = iommu_attach_next_iommu(alist, pos, last, member))
+
+/* Example */
+struct driver_domain {
+ struct iommu_domain domain;
+ struct iommu_attach_list alist;
+};
+
+struct driver_attach_elm {
+ struct iommu_attach_elm aelm;
+ unsigned int cache_tag;
+};
+
+static void example(struct driver_domain *domain)
+{
+ struct driver_attach_elm *elm;
+ struct driver_attach_elm *pos, *last;
+ bool need_atc;
+
+ iommu_attach_init(&domain->alist);
+
+ elm = kzalloc(sizeof(*elm), GFP_KERNEL);
+ iommu_attach_add(&domain->alist, &elm->aelm);
+
+ rcu_read_lock();
+ iommu_attach_for_each_iommu(pos, last, &domain->alist, aelm) {
+ invalidate_iommu(elm);
+ need_atc |= elm->aelm.using_atc;
+ }
+ if (need_atc) {
+ list_for_each_entry_rcu(pos, &domain->alist.all,
+ aelm.all_item) {
+ if (pos->aelm.using_atc)
+ invalidate_atc(elm);
+ }
+ }
+ rcu_read_unlock();
+}