Re: [PATCH v2 1/3] iommu/s390: Fix duplicate domain attachments

From: Jason Gunthorpe
Date: Mon Sep 26 2022 - 11:11:45 EST


On Mon, Sep 26, 2022 at 11:00:53AM +0200, Niklas Schnelle wrote:

> > > +out_unregister_restore:
> > > + zpci_unregister_ioat(zdev, 0);
> > > out_restore:
> > > - if (!zdev->s390_domain) {
> > > + zdev->dma_table = NULL;
> > > + if (prev_domain)
> > > + s390_iommu_attach_device(&prev_domain->domain,
> > > + dev);
> >
> > Huh. That is a surprising thing
> >
> > I think this function needs some re-ordering to avoid this condition
> >
> > The checks for aperture should be earlier, and they are not quite
> > right. The aperture is only allowed to grow. If it starts out as 0 and
> > then is set to something valid on first attach, a later attach cannot
> > then shrink it. There could already be mappings in the domain under
> > the now invalidated aperture and no caller is prepared to deal with
> > this.
>
> Ohh I think this is indeed broken. Let me rephrase to see if I
> understand correctly. You're saying that while we only allow exactly
> matching apertures on additional attaches, we do allow shrinking if
> there is temporarily no device attached to the domain. That part is
> then broken because there could still be mappings outside the new
> aperture stored in the translation tables?

Right, go from 0 -> sized apperture on first attach, and then once it
is sized it doesn't change again.

> > That leaves the only error case as zpci_register_ioat() - which seems
> > like it is the actual "attach" operation. Since
> > __s390_iommu_detach_device() is just internal accounting (and can't
> > fail) it should be moved after
> >
> > So the logic order should be
> >
> > 1) Attempt to widen the aperture, if this fails the domain is
> > incompatible bail immediately
>
> Question. If the widening succeeds but we fail later during the attach
> e.g. in 2) then the aperture remains widend or would that be rolled
> back?

I'd leave it widened.

IMHO I don't like this trick of setting the aperture on attach. It is
logically wrong. The aperture is part of the configuration of the page
table itself. The domain should know what page table format and thus
apterture it has the moment it is allocated. Usually this is the
related to the number of levels in the radix tree.

It seems to me that the issue here is trying to use the aperture when
the reserved region is the appropriate tool.

eg I see that s390_domain_alloc calls dma_alloc_cpu_table() which just
allocates a 3 level radix tree. This means it has a specific max
address that can be passed to dma_walk_cpu_trans(). So the aperture
should be fixed based on the radix tree parameters.

The device specific start/end should be represented as a reserved
regions per-device. See patch below..

This is meaningful because it effects when VFIO can share the domains
across devices. If devices have different reserved ranges we can still
share domains, so long as no mapping is placed in the union of the
reserved ranges. However if you vary the aperture, like is currently
happening, then the domains become unsharable.

diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index c898bcbbce118f..ba80325da76cd9 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -51,6 +51,12 @@ static bool s390_iommu_capable(enum iommu_cap cap)
}
}

+/*
+ * dma_alloc_cpu_table() creates a 3 level table, rtx, sx, px, so the address
+ * that can be passed to dma_walk_cpu_trans() must be less than this.
+ */
+#define MAX_DMA_TABLE_ADDR (ZPCI_TABLE_SIZE * ZPCI_TABLE_SIZE * ZPCI_PT_SIZE)
+
static struct iommu_domain *s390_domain_alloc(unsigned domain_type)
{
struct s390_domain *s390_domain;
@@ -68,6 +74,10 @@ static struct iommu_domain *s390_domain_alloc(unsigned domain_type)
return NULL;
}

+ s390_domain->domain.geometry.force_aperture = true;
+ s390_domain->domain.geometry.aperture_start = 0;
+ s390_domain->domain.geometry.aperture_end = MAX_DMA_TABLE_ADDR;
+
spin_lock_init(&s390_domain->dma_table_lock);
spin_lock_init(&s390_domain->list_lock);
INIT_LIST_HEAD(&s390_domain->devices);
@@ -119,18 +129,6 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
}

spin_lock_irqsave(&s390_domain->list_lock, flags);
- /* First device defines the DMA range limits */
- if (list_empty(&s390_domain->devices)) {
- domain->geometry.aperture_start = zdev->start_dma;
- domain->geometry.aperture_end = zdev->end_dma;
- domain->geometry.force_aperture = true;
- /* Allow only devices with identical DMA range limits */
- } else if (domain->geometry.aperture_start != zdev->start_dma ||
- domain->geometry.aperture_end != zdev->end_dma) {
- rc = -EINVAL;
- spin_unlock_irqrestore(&s390_domain->list_lock, flags);
- goto out_restore;
- }
domain_device->zdev = zdev;
zdev->s390_domain = s390_domain;
list_add(&domain_device->list, &s390_domain->devices);
@@ -152,6 +150,30 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
return rc;
}

+static void s390_iommu_get_resv_regions(struct device *dev,
+ struct list_head *list)
+{
+ struct zpci_dev *zdev = to_zpci_dev(dev);
+ struct iommu_resv_region *region;
+
+ if (zdev->start_dma) {
+ region = iommu_alloc_resv_region(0, zdev->start_dma, 0,
+ IOMMU_RESV_RESERVED);
+ if (!region)
+ return;
+ list_add_tail(&region->list, list);
+ }
+
+ if (zdev->end_dma < MAX_DMA_TABLE_ADDR) {
+ region = iommu_alloc_resv_region(
+ zdev->end_dma, MAX_DMA_TABLE_ADDR - zdev->end_dma, 0,
+ IOMMU_RESV_RESERVED);
+ if (!region)
+ return;
+ list_add_tail(&region->list, list);
+ }
+}
+
static void s390_iommu_detach_device(struct iommu_domain *domain,
struct device *dev)
{
@@ -376,6 +398,7 @@ static const struct iommu_ops s390_iommu_ops = {
.release_device = s390_iommu_release_device,
.device_group = generic_device_group,
.pgsize_bitmap = S390_IOMMU_PGSIZES,
+ .get_resv_regions = s390_iommu_get_resv_regions,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = s390_iommu_attach_device,
.detach_dev = s390_iommu_detach_device,