Re: [PATCH v2 4/7] iommu: Let iommu.strict override ops->def_domain_type

From: Jason Gunthorpe
Date: Tue Dec 06 2022 - 18:09:46 EST


On Mon, Dec 05, 2022 at 04:34:08PM +0100, Niklas Schnelle wrote:
> On Tue, 2022-11-29 at 16:09 -0400, Jason Gunthorpe wrote:
> > On Tue, Nov 29, 2022 at 06:41:22PM +0000, Robin Murphy wrote:
> > > On 2022-11-29 17:33, Jason Gunthorpe wrote:
> > > > On Mon, Nov 28, 2022 at 09:01:43PM +0000, Robin Murphy wrote:
> > > >
> > > > > I'm hardly an advocate for trying to save users from themselves, but I
> > > > > honestly can't see any justifiable reason for not having sysfs respect
> > > > > iommu_get_def_domain_type().
> > > >
> > > > We really need to rename this value if it is not actually just an
> > > > advisory "default" but a functional requirement ..
> > >
> > > It represents a required default domain type. As in, the type for the
> > > device's default domain. Not the default type for a domain. It's the
> > > iommu_def_domain_type variable that holds the *default* default domain type
> > > ;)
> >
> > I find the name "default domain" incredibly confusing at this point in
> > time.
>
> Yes it definitely confused me as evidenced by this patch.

I did some fiddling with what this rename could look like and came
up with this very sketchy sketch. I think it looks a lot clearer in
the end, but it seems a bit tricky to break things up into nice patches.
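
To make the shape easier to see before wading through the diff: the global
default stops being an IOMMU_DOMAIN_* value and becomes a DMA API policy,
and the per-device/per-group hooks report a mandatory policy instead of a
"default domain type". Condensed from the sketch below, same names, nothing
beyond what the diff itself introduces:

enum dma_api_policy {
	DMA_API_POLICY_IDENTITY,	/* passthrough, no translation for the DMA API */
	DMA_API_POLICY_STRICT,		/* translated, synchronous TLB invalidation */
	DMA_API_POLICY_LAZY,		/* translated, flush-queue (lazy) invalidation */
	DMA_API_POLICY_ANY,		/* no requirement, fall back to the system default */
};

static enum dma_api_policy dma_api_default_policy __read_mostly;

/* What a device or group *requires*, not what it would merely prefer */
static enum dma_api_policy iommu_get_dma_api_mandatory_policy(struct device *dev);
static enum dma_api_policy
iommu_get_dma_api_group_mandatory_policy(struct iommu_group *group);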

Jason

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index beb9f535d3d815..052caf21fee3dd 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -34,7 +34,12 @@
static struct kset *iommu_group_kset;
static DEFINE_IDA(iommu_group_ida);

-static unsigned int iommu_def_domain_type __read_mostly;
+static enum dma_api_policy {
+ DMA_API_POLICY_IDENTITY,
+ DMA_API_POLICY_STRICT,
+ DMA_API_POLICY_LAZY,
+ DMA_API_POLICY_ANY,
+} dma_api_default_policy __read_mostly;
static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_DMA_STRICT);
static u32 iommu_cmd_line __read_mostly;

@@ -82,7 +87,7 @@ static const char * const iommu_group_resv_type_string[] = {
static int iommu_bus_notifier(struct notifier_block *nb,
unsigned long action, void *data);
static int iommu_alloc_default_domain(struct iommu_group *group,
- struct device *dev);
+ enum dma_api_policy policy);
static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus,
unsigned type);
static int __iommu_attach_device(struct iommu_domain *domain,
@@ -97,6 +102,9 @@ static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
static ssize_t iommu_group_store_type(struct iommu_group *group,
const char *buf, size_t count);

+static enum dma_api_policy
+iommu_get_dma_api_mandatory_policy(struct device *dev);
+
#define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \
struct iommu_group_attribute iommu_group_attr_##_name = \
__ATTR(_name, _mode, _show, _store)
@@ -125,26 +133,6 @@ static struct bus_type * const iommu_buses[] = {
#endif
};

-/*
- * Use a function instead of an array here because the domain-type is a
- * bit-field, so an array would waste memory.
- */
-static const char *iommu_domain_type_str(unsigned int t)
-{
- switch (t) {
- case IOMMU_DOMAIN_BLOCKED:
- return "Blocked";
- case IOMMU_DOMAIN_IDENTITY:
- return "Passthrough";
- case IOMMU_DOMAIN_UNMANAGED:
- return "Unmanaged";
- case IOMMU_DOMAIN_DMA:
- return "Translated";
- default:
- return "Unknown";
- }
-}
-
static int __init iommu_subsys_init(void)
{
struct notifier_block *nb;
@@ -161,19 +149,27 @@ static int __init iommu_subsys_init(void)
}
}

- if (!iommu_default_passthrough() && !iommu_dma_strict)
- iommu_def_domain_type = IOMMU_DOMAIN_DMA_FQ;
+ if (dma_api_default_policy == DMA_API_POLICY_LAZY && iommu_dma_strict)
+ dma_api_default_policy = DMA_API_POLICY_STRICT;

- pr_info("Default domain type: %s %s\n",
- iommu_domain_type_str(iommu_def_domain_type),
- (iommu_cmd_line & IOMMU_CMD_LINE_DMA_API) ?
- "(set via kernel command line)" : "");
-
- if (!iommu_default_passthrough())
- pr_info("DMA domain TLB invalidation policy: %s mode %s\n",
+ switch (dma_api_default_policy) {
+ case DMA_API_POLICY_LAZY:
+ case DMA_API_POLICY_STRICT:
+ pr_info("DMA translated domain TLB invalidation policy: %s mode %s\n",
iommu_dma_strict ? "strict" : "lazy",
(iommu_cmd_line & IOMMU_CMD_LINE_STRICT) ?
- "(set via kernel command line)" : "");
+ "(set via kernel command line)" :
+ "");
+ break;
+ case DMA_API_POLICY_IDENTITY:
+ pr_info("Default domain type: Passthrough %s\n",
+ (iommu_cmd_line & IOMMU_CMD_LINE_DMA_API) ?
+ "(set via kernel command line)" :
+ "");
+ break;
+ default:
+ break;
+ }

nb = kcalloc(ARRAY_SIZE(iommu_buses), sizeof(*nb), GFP_KERNEL);
if (!nb)
@@ -353,7 +349,8 @@ int iommu_probe_device(struct device *dev)
* checked.
*/
mutex_lock(&group->mutex);
- iommu_alloc_default_domain(group, dev);
+ iommu_alloc_default_domain(group,
+ iommu_get_dma_api_mandatory_policy(dev));

/*
* If device joined an existing group which has been claimed, don't
@@ -436,8 +433,8 @@ early_param("iommu.strict", iommu_dma_setup);
void iommu_set_dma_strict(void)
{
iommu_dma_strict = true;
- if (iommu_def_domain_type == IOMMU_DOMAIN_DMA_FQ)
- iommu_def_domain_type = IOMMU_DOMAIN_DMA;
+ if (dma_api_default_policy == DMA_API_POLICY_LAZY)
+ dma_api_default_policy = DMA_API_POLICY_STRICT;
}

static ssize_t iommu_group_attr_show(struct kobject *kobj,
@@ -1557,53 +1554,100 @@ struct iommu_group *fsl_mc_device_group(struct device *dev)
}
EXPORT_SYMBOL_GPL(fsl_mc_device_group);

-static int iommu_get_def_domain_type(struct device *dev)
+static enum dma_api_policy
+iommu_get_dma_api_mandatory_policy(struct device *dev)
{
const struct iommu_ops *ops = dev_iommu_ops(dev);

if (dev_is_pci(dev) && to_pci_dev(dev)->untrusted)
- return IOMMU_DOMAIN_DMA;
+ return DMA_API_POLICY_STRICT;

if (ops->def_domain_type)
return ops->def_domain_type(dev);
-
- return 0;
+ return DMA_API_POLICY_ANY;
}

-static int iommu_group_alloc_default_domain(struct bus_type *bus,
- struct iommu_group *group,
- unsigned int type)
+static int __iommu_get_dma_api_group_mandatory_policy(struct device *dev,
+ void *data)
{
- struct iommu_domain *dom;
+ enum dma_api_policy *policy = data;
+ enum dma_api_policy dev_policy =
+ iommu_get_dma_api_mandatory_policy(dev);

- dom = __iommu_domain_alloc(bus, type);
- if (!dom && type != IOMMU_DOMAIN_DMA) {
- dom = __iommu_domain_alloc(bus, IOMMU_DOMAIN_DMA);
- if (dom)
- pr_warn("Failed to allocate default IOMMU domain of type %u for group %s - Falling back to IOMMU_DOMAIN_DMA",
- type, group->name);
+ if (dev_policy == DMA_API_POLICY_ANY || *policy == dev_policy)
+ return 0;
+ if (*policy == DMA_API_POLICY_ANY) {
+ *policy = dev_policy;
+ return 0;
}

- if (!dom)
- return -ENOMEM;
+ dev_warn(
+ dev,
+ "IOMMU driver is requesting different DMA API policies for devices in the same group %s\n",
+ dev->iommu_group->name);

- group->default_domain = dom;
- if (!group->domain)
- group->domain = dom;
+ /*
+ * Resolve the conflict by priority, the lower numbers in the enum are
+ * higher preference in case of driver problems.
+ */
+ if (dev_policy < *policy)
+ *policy = dev_policy;
return 0;
}

+static enum dma_api_policy
+iommu_get_dma_api_group_mandatory_policy(struct iommu_group *group)
+{
+ enum dma_api_policy policy = DMA_API_POLICY_ANY;
+
+ __iommu_group_for_each_dev(group, &policy,
+ __iommu_get_dma_api_group_mandatory_policy);
+ return policy;
+}
+
static int iommu_alloc_default_domain(struct iommu_group *group,
- struct device *dev)
+ enum dma_api_policy policy)
{
- unsigned int type;
+ struct iommu_domain *domain;
+ struct group_device *grp_dev =
+ list_first_entry(&group->devices, struct group_device, list);
+ struct bus_type *bus = grp_dev->dev->bus;
+
+ lockdep_assert_held(&group->mutex);

if (group->default_domain)
return 0;
+ if (policy == DMA_API_POLICY_ANY)
+ policy = dma_api_default_policy;
+ switch (policy) {
+ case DMA_API_POLICY_IDENTITY:
+ domain = __iommu_domain_alloc(bus, IOMMU_DOMAIN_IDENTITY);
+ break;
+
+ case DMA_API_POLICY_LAZY:
+ domain = __iommu_domain_alloc(bus, IOMMU_DOMAIN_DMA);
+ if (domain && !domain->ops->prefer_dma_api_fq) {
+ pr_warn("IOMMU driver does not support lazy invalidation for group %s - Falling back to strict",
+ group->name);
+ }
+ break;

- type = iommu_get_def_domain_type(dev) ? : iommu_def_domain_type;
+ case DMA_API_POLICY_STRICT:
+ domain = __iommu_domain_alloc(bus, IOMMU_DOMAIN_DMA);
+ break;

- return iommu_group_alloc_default_domain(dev->bus, group, type);
+ default:
+ WARN_ON(true);
+ domain = NULL;
+ }
+
+ if (!domain)
+ return -ENOMEM;
+
+ group->default_domain = domain;
+ if (!group->domain)
+ group->domain = domain;
+ return 0;
}

/**
@@ -1688,52 +1732,6 @@ static int iommu_bus_notifier(struct notifier_block *nb,
return 0;
}

-struct __group_domain_type {
- struct device *dev;
- unsigned int type;
-};
-
-static int probe_get_default_domain_type(struct device *dev, void *data)
-{
- struct __group_domain_type *gtype = data;
- unsigned int type = iommu_get_def_domain_type(dev);
-
- if (type) {
- if (gtype->type && gtype->type != type) {
- dev_warn(dev, "Device needs domain type %s, but device %s in the same iommu group requires type %s - using default\n",
- iommu_domain_type_str(type),
- dev_name(gtype->dev),
- iommu_domain_type_str(gtype->type));
- gtype->type = 0;
- }
-
- if (!gtype->dev) {
- gtype->dev = dev;
- gtype->type = type;
- }
- }
-
- return 0;
-}
-
-static void probe_alloc_default_domain(struct bus_type *bus,
- struct iommu_group *group)
-{
- struct __group_domain_type gtype;
-
- memset(&gtype, 0, sizeof(gtype));
-
- /* Ask for default domain requirements of all devices in the group */
- __iommu_group_for_each_dev(group, &gtype,
- probe_get_default_domain_type);
-
- if (!gtype.type)
- gtype.type = iommu_def_domain_type;
-
- iommu_group_alloc_default_domain(bus, group, gtype.type);
-
-}
-
static int iommu_group_do_dma_attach(struct device *dev, void *data)
{
struct iommu_domain *domain = data;
@@ -1804,7 +1802,8 @@ int bus_iommu_probe(struct bus_type *bus)
mutex_lock(&group->mutex);

/* Try to allocate default domain */
- probe_alloc_default_domain(bus, group);
+ iommu_alloc_default_domain(
+ group, iommu_get_dma_api_group_mandatory_policy(group));

if (!group->default_domain) {
mutex_unlock(&group->mutex);
@@ -2600,19 +2599,19 @@ void iommu_set_default_passthrough(bool cmd_line)
{
if (cmd_line)
iommu_cmd_line |= IOMMU_CMD_LINE_DMA_API;
- iommu_def_domain_type = IOMMU_DOMAIN_IDENTITY;
+ dma_api_default_policy = DMA_API_POLICY_IDENTITY;
}

void iommu_set_default_translated(bool cmd_line)
{
if (cmd_line)
iommu_cmd_line |= IOMMU_CMD_LINE_DMA_API;
- iommu_def_domain_type = IOMMU_DOMAIN_DMA;
+ dma_api_default_policy = DMA_API_POLICY_LAZY;
}

bool iommu_default_passthrough(void)
{
- return iommu_def_domain_type == IOMMU_DOMAIN_IDENTITY;
+ return dma_api_default_policy == DMA_API_POLICY_IDENTITY;
}
EXPORT_SYMBOL_GPL(iommu_default_passthrough);

@@ -2832,103 +2831,40 @@ EXPORT_SYMBOL_GPL(iommu_sva_get_pasid);
* group's default domain type through /sys/kernel/iommu_groups/<grp_id>/type
* Please take a closer look if intended to use for other purposes.
*/
-static int iommu_change_dev_def_domain(struct iommu_group *group,
- struct device *prev_dev, int type)
+static int iommu_change_group_dma_api_policy(struct iommu_group *group,
+ enum dma_api_policy policy)
{
- struct iommu_domain *prev_dom;
- struct group_device *grp_dev;
- int ret, dev_def_dom;
- struct device *dev;
-
- mutex_lock(&group->mutex);
-
- if (group->default_domain != group->domain) {
- dev_err_ratelimited(prev_dev, "Group not assigned to default domain\n");
- ret = -EBUSY;
- goto out;
- }
-
- /*
- * iommu group wasn't locked while acquiring device lock in
- * iommu_group_store_type(). So, make sure that the device count hasn't
- * changed while acquiring device lock.
- *
- * Changing default domain of an iommu group with two or more devices
- * isn't supported because there could be a potential deadlock. Consider
- * the following scenario. T1 is trying to acquire device locks of all
- * the devices in the group and before it could acquire all of them,
- * there could be another thread T2 (from different sub-system and use
- * case) that has already acquired some of the device locks and might be
- * waiting for T1 to release other device locks.
- */
- if (iommu_group_device_count(group) != 1) {
- dev_err_ratelimited(prev_dev, "Cannot change default domain: Group has more than one device\n");
- ret = -EINVAL;
- goto out;
- }
-
- /* Since group has only one device */
- grp_dev = list_first_entry(&group->devices, struct group_device, list);
- dev = grp_dev->dev;
-
- if (prev_dev != dev) {
- dev_err_ratelimited(prev_dev, "Cannot change default domain: Device has been changed\n");
- ret = -EBUSY;
- goto out;
- }
+ enum dma_api_policy mandatory =
+ iommu_get_dma_api_group_mandatory_policy(group);
+ struct iommu_domain *old_domain;
+ int ret;

- prev_dom = group->default_domain;
- if (!prev_dom) {
- ret = -EINVAL;
- goto out;
- }
+ if (mandatory != DMA_API_POLICY_ANY && policy != mandatory)
+ return -EINVAL;

- dev_def_dom = iommu_get_def_domain_type(dev);
- if (!type) {
- /*
- * If the user hasn't requested any specific type of domain and
- * if the device supports both the domains, then default to the
- * domain the device was booted with
- */
- type = dev_def_dom ? : iommu_def_domain_type;
- } else if (dev_def_dom && type != dev_def_dom) {
- dev_err_ratelimited(prev_dev, "Device cannot be in %s domain\n",
- iommu_domain_type_str(type));
- ret = -EINVAL;
- goto out;
- }
+ mutex_lock(&group->mutex);

- /*
- * Switch to a new domain only if the requested domain type is different
- * from the existing default domain type
- */
- if (prev_dom->type == type) {
- ret = 0;
- goto out;
+ /* Shortcut if FQ is being enabled */
+ if (group->default_domain->type == IOMMU_DOMAIN_DMA &&
+ policy == DMA_API_POLICY_LAZY) {
+ ret = iommu_dma_init_fq(group->default_domain);
+ if (!ret) {
+ group->default_domain->type = IOMMU_DOMAIN_DMA_FQ;
+ mutex_unlock(&group->mutex);
+ return 0;
+ }
}

- /* We can bring up a flush queue without tearing down the domain */
- if (type == IOMMU_DOMAIN_DMA_FQ && prev_dom->type == IOMMU_DOMAIN_DMA) {
- ret = iommu_dma_init_fq(prev_dom);
- if (!ret)
- prev_dom->type = IOMMU_DOMAIN_DMA_FQ;
- goto out;
+ /* Otherwise just replace the whole domain */
+ old_domain = group->default_domain;
+ group->default_domain = NULL;
+ ret = iommu_alloc_default_domain(group, policy);
+ if (ret) {
+ group->default_domain = old_domain;
+ mutex_unlock(&group->mutex);
+ return ret;
}
-
- /* Sets group->default_domain to the newly allocated domain */
- ret = iommu_group_alloc_default_domain(dev->bus, group, type);
- if (ret)
- goto out;
-
- ret = iommu_create_device_direct_mappings(group, dev);
- if (ret)
- goto free_new_domain;
-
- ret = __iommu_attach_device(group->default_domain, dev);
- if (ret)
- goto free_new_domain;
-
- group->domain = group->default_domain;
+ iommu_group_create_direct_mappings(group);

/*
* Release the mutex here because ops->probe_finalize() call-back of
@@ -2938,20 +2874,15 @@ static int iommu_change_dev_def_domain(struct iommu_group *group,
*/
mutex_unlock(&group->mutex);

+ /*
+ * FIXME: How does iommu_setup_dma_ops() get called with the new domain
+ * on ARM?
+ */
+
/* Make sure dma_ops is appropriatley set */
- iommu_group_do_probe_finalize(dev, group->default_domain);
- iommu_domain_free(prev_dom);
+ __iommu_group_dma_finalize(group);
+ iommu_domain_free(old_domain);
return 0;
-
-free_new_domain:
- iommu_domain_free(group->default_domain);
- group->default_domain = prev_dom;
- group->domain = prev_dom;
-
-out:
- mutex_unlock(&group->mutex);
-
- return ret;
}

/*
@@ -2966,9 +2897,8 @@ static int iommu_change_dev_def_domain(struct iommu_group *group,
static ssize_t iommu_group_store_type(struct iommu_group *group,
const char *buf, size_t count)
{
- struct group_device *grp_dev;
- struct device *dev;
- int ret, req_type;
+ enum dma_api_policy policy;
+ int ret;

if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO))
return -EACCES;
@@ -2977,77 +2907,30 @@ static ssize_t iommu_group_store_type(struct iommu_group *group,
return -EINVAL;

if (sysfs_streq(buf, "identity"))
- req_type = IOMMU_DOMAIN_IDENTITY;
+ policy = DMA_API_POLICY_IDENTITY;
else if (sysfs_streq(buf, "DMA"))
- req_type = IOMMU_DOMAIN_DMA;
+ policy = DMA_API_POLICY_STRICT;
else if (sysfs_streq(buf, "DMA-FQ"))
- req_type = IOMMU_DOMAIN_DMA_FQ;
+ policy = DMA_API_POLICY_LAZY;
else if (sysfs_streq(buf, "auto"))
- req_type = 0;
+ policy = DMA_API_POLICY_ANY;
else
return -EINVAL;

/*
- * Lock/Unlock the group mutex here before device lock to
- * 1. Make sure that the iommu group has only one device (this is a
- * prerequisite for step 2)
- * 2. Get struct *dev which is needed to lock device
- */
- mutex_lock(&group->mutex);
- if (iommu_group_device_count(group) != 1) {
- mutex_unlock(&group->mutex);
- pr_err_ratelimited("Cannot change default domain: Group has more than one device\n");
- return -EINVAL;
- }
-
- /* Since group has only one device */
- grp_dev = list_first_entry(&group->devices, struct group_device, list);
- dev = grp_dev->dev;
- get_device(dev);
-
- /*
- * Don't hold the group mutex because taking group mutex first and then
- * the device lock could potentially cause a deadlock as below. Assume
- * two threads T1 and T2. T1 is trying to change default domain of an
- * iommu group and T2 is trying to hot unplug a device or release [1] VF
- * of a PCIe device which is in the same iommu group. T1 takes group
- * mutex and before it could take device lock assume T2 has taken device
- * lock and is yet to take group mutex. Now, both the threads will be
- * waiting for the other thread to release lock. Below, lock order was
- * suggested.
- * device_lock(dev);
- * mutex_lock(&group->mutex);
- * iommu_change_dev_def_domain();
- * mutex_unlock(&group->mutex);
- * device_unlock(dev);
- *
- * [1] Typical device release path
- * device_lock() from device/driver core code
- * -> bus_notifier()
- * -> iommu_bus_notifier()
- * -> iommu_release_device()
- * -> ops->release_device() vendor driver calls back iommu core code
- * -> mutex_lock() from iommu core code
+ * Taking ownership disables the DMA API domain, prevents drivers from
+ * being attached, and switches to a blocking domain. Releasing
+ * ownership will put back the new or original DMA API domain.
*/
- mutex_unlock(&group->mutex);
-
- /* Check if the device in the group still has a driver bound to it */
- device_lock(dev);
- if (device_is_bound(dev) && !(req_type == IOMMU_DOMAIN_DMA_FQ &&
- group->default_domain->type == IOMMU_DOMAIN_DMA)) {
- pr_err_ratelimited("Device is still bound to driver\n");
- ret = -EBUSY;
- goto out;
- }
-
- ret = iommu_change_dev_def_domain(group, dev, req_type);
- ret = ret ?: count;
-
-out:
- device_unlock(dev);
- put_device(dev);
+ ret = iommu_group_claim_dma_owner(group, &ret);
+ if (ret)
+ return ret;

- return ret;
+ ret = iommu_change_group_dma_api_policy(group, policy);
+ iommu_group_release_dma_owner(group);
+ if (ret)
+ return ret;
+ return count;
}

static bool iommu_is_default_domain(struct iommu_group *group)