[PATCH v1 7/9] iommu/tegra: gart: Provide single domain and group for all devices

From: Dmitry Osipenko
Date: Fri May 11 2018 - 16:10:02 EST


On 11.05.2018 15:32, Robin Murphy wrote:
> On 08/05/18 19:16, Dmitry Osipenko wrote:
>> GART aperture is shared by all devices, hence there is a single IOMMU
>> domain and group shared by these devices. Allocation of a group per
>> device only wastes resources and allowance of having more than one domain
>> is simply wrong because IOMMU mappings made by the users of "different"
>> domains will stomp on each other.
>
> Strictly, that reasoning is a bit backwards - allocating multiple groups is the
> conceptually-wrong thing if the GART cannot differentiate between different
> devices, whereas having multiple domains *exist* is no real problem, it's merely
> that only one can be active at any point in time (which will inherently become
> the case once all devices are grouped together).

IIUC, an IOMMU domain represents an address space. In the case of GART there is
only one address space, the GART's aperture. So the GART not only doesn't
differentiate between different devices, it also doesn't differentiate between
different domains.

>> Signed-off-by: Dmitry Osipenko <digetx@xxxxxxxxx>
>> ---
>> Â drivers/iommu/tegra-gart.c | 107 +++++++++----------------------------
>> Â 1 file changed, 24 insertions(+), 83 deletions(-)
>>
>> diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
>> index 5b2d27620350..ebc105c201bd 100644
>> --- a/drivers/iommu/tegra-gart.c
>> +++ b/drivers/iommu/tegra-gart.c
>> @@ -19,7 +19,6 @@
>> Â Â #include <linux/io.h>
>> Â #include <linux/iommu.h>
>> -#include <linux/list.h>
>> Â #include <linux/module.h>
>> Â #include <linux/of_device.h>
>> Â #include <linux/slab.h>
>> @@ -44,22 +43,17 @@
>> Â #define GART_PAGE_MASKÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ \
>> ÂÂÂÂÂ (~(GART_PAGE_SIZE - 1) & ~GART_ENTRY_PHYS_ADDR_VALID)
>> Â -struct gart_client {
>> -ÂÂÂ struct deviceÂÂÂÂÂÂÂ *dev;
>> -ÂÂÂ struct list_headÂÂÂ list;
>> -};
>> -
>> Â struct gart_device {
>> ÂÂÂÂÂ void __iomemÂÂÂÂÂÂÂ *regs;
>> ÂÂÂÂÂ u32ÂÂÂÂÂÂÂÂÂÂÂ *savedata;
>> ÂÂÂÂÂ u32ÂÂÂÂÂÂÂÂÂÂÂ page_count;ÂÂÂ /* total remappable size */
>> ÂÂÂÂÂ dma_addr_tÂÂÂÂÂÂÂ iovmm_base;ÂÂÂ /* offset to vmm_area */
>> ÂÂÂÂÂ spinlock_tÂÂÂÂÂÂÂ pte_lock;ÂÂÂ /* for pagetable */
>> -ÂÂÂ struct list_headÂÂÂ client;
>> -ÂÂÂ spinlock_tÂÂÂÂÂÂÂ client_lock;ÂÂÂ /* for client list */
>> ÂÂÂÂÂ struct deviceÂÂÂÂÂÂÂ *dev;
>> Â ÂÂÂÂÂ struct iommu_deviceÂÂÂ iommu;ÂÂÂÂÂÂÂ /* IOMMU Core handle */
>> +ÂÂÂ struct iommu_groupÂÂÂ *group;ÂÂÂÂÂÂÂ /* Common IOMMU group */
>> +ÂÂÂ struct gart_domainÂÂÂ *domain;ÂÂÂ /* Unique IOMMU domain */
>> Â ÂÂÂÂÂ struct tegra_mc_gart_handle mc_gart_handle;
>> Â };
>> @@ -169,81 +163,31 @@ static inline bool gart_iova_range_valid(struct
>> gart_device *gart,
>> Â static int gart_iommu_attach_dev(struct iommu_domain *domain,
>> ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ struct device *dev)
>> Â {
>> -ÂÂÂ struct gart_domain *gart_domain = to_gart_domain(domain);
>> -ÂÂÂ struct gart_device *gart = gart_domain->gart;
>> -ÂÂÂ struct gart_client *client, *c;
>> -ÂÂÂ int err = 0;
>> -
>> -ÂÂÂ client = devm_kzalloc(gart->dev, sizeof(*c), GFP_KERNEL);
>> -ÂÂÂ if (!client)
>> -ÂÂÂÂÂÂÂ return -ENOMEM;
>> -ÂÂÂ client->dev = dev;
>> -
>> -ÂÂÂ spin_lock(&gart->client_lock);
>> -ÂÂÂ list_for_each_entry(c, &gart->client, list) {
>> -ÂÂÂÂÂÂÂ if (c->dev == dev) {
>> -ÂÂÂÂÂÂÂÂÂÂÂ dev_err(gart->dev,
>> -ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ "%s is already attached\n", dev_name(dev));
>> -ÂÂÂÂÂÂÂÂÂÂÂ err = -EINVAL;
>> -ÂÂÂÂÂÂÂÂÂÂÂ goto fail;
>> -ÂÂÂÂÂÂÂ }
>> -ÂÂÂ }
>> -ÂÂÂ list_add(&client->list, &gart->client);
>> -ÂÂÂ spin_unlock(&gart->client_lock);
>> -ÂÂÂ dev_dbg(gart->dev, "Attached %s\n", dev_name(dev));
>> ÂÂÂÂÂ return 0;
>> -
>> -fail:
>> -ÂÂÂ devm_kfree(gart->dev, client);
>> -ÂÂÂ spin_unlock(&gart->client_lock);
>> -ÂÂÂ return err;
>> Â }
>> Â Â static void gart_iommu_detach_dev(struct iommu_domain *domain,
>> ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ struct device *dev)
>> Â {
>> -ÂÂÂ struct gart_domain *gart_domain = to_gart_domain(domain);
>> -ÂÂÂ struct gart_device *gart = gart_domain->gart;
>> -ÂÂÂ struct gart_client *c;
>> -
>> -ÂÂÂ spin_lock(&gart->client_lock);
>> -
>> -ÂÂÂ list_for_each_entry(c, &gart->client, list) {
>> -ÂÂÂÂÂÂÂ if (c->dev == dev) {
>> -ÂÂÂÂÂÂÂÂÂÂÂ list_del(&c->list);
>> -ÂÂÂÂÂÂÂÂÂÂÂ devm_kfree(gart->dev, c);
>> -ÂÂÂÂÂÂÂÂÂÂÂ dev_dbg(gart->dev, "Detached %s\n", dev_name(dev));
>> -ÂÂÂÂÂÂÂÂÂÂÂ goto out;
>> -ÂÂÂÂÂÂÂ }
>> -ÂÂÂ }
>> -ÂÂÂ dev_err(gart->dev, "Couldn't find\n");
>> -out:
>> -ÂÂÂ spin_unlock(&gart->client_lock);
>> Â }
>
> The .detach_dev callback is optional in the core API now, so you can just remove
> the whole thing.

Good catch, thanks!

>
>> Â static struct iommu_domain *gart_iommu_domain_alloc(unsigned type)
>> Â {
>> -ÂÂÂ struct gart_domain *gart_domain;
>> -ÂÂÂ struct gart_device *gart;
>> -
>> -ÂÂÂ if (type != IOMMU_DOMAIN_UNMANAGED)
>> -ÂÂÂÂÂÂÂ return NULL;
>> +ÂÂÂ struct gart_device *gart = gart_handle;
>> Â -ÂÂÂ gart = gart_handle;
>> -ÂÂÂ if (!gart)
>> +ÂÂÂ if (type != IOMMU_DOMAIN_UNMANAGED || gart->domain)
>
> Singleton domains are a little unpleasant given the way the IOMMU API expects
> things to work, but it looks fairly simple to avoid needing that at all. AFAICS
> you could move gart->savedata to something like gart_domain->ptes and keep it
> up-to-date in .map/.unmap, then in .attach_dev you just need to do something like:
>
>     if (gart_domain != gart->domain) {
>         do_gart_setup(gart, gart_domain->ptes);
>         gart->domain = gart_domain;
>     }
>
> to context-switch the hardware state when moving the group from one domain to
> another (and as a bonus you would no longer need to do anything for suspend,
> since resume can just look at the current domain too). If in practice there's
> only ever one domain allocated anyway, then there's no difference in memory
> overhead, but you still have the benefit of the driver being more consistent
> with others and allowing that flexibility if anyone ever did want to play with it.

For starters, we'll have a single domain used solely by the GPU with all its
sub-devices. Context switching will be handled by the Tegra DRM driver. Later
we may consider introducing IOMMU support for the video decoder, at least to
provide memory isolation for the buffers that the decoder writes to.

Cross-driver context switching isn't that straightforward, and I think the
Tegra GART driver shouldn't take care of context switching in any form; it
should only perform mapping / unmapping operations. There are a couple of ways
to deal with the context switching:

1. A simple solution could be to logically split the GART's aperture space into
different domains, but the aperture won't be utilized efficiently with this
approach, wasting quite a lot of IOVA space.

2. In order to utilize the aperture more efficiently, we are going to make the
DRM driver cache IOMMU mappings, such that on unmapping a graphics buffer is
moved to the cache-eviction list and is actually unmapped only when that buffer
isn't in use and there is no IOVA space left for another buffer, or when the
buffer is destroyed. We'll use DRM's MM scanning helper for that [0][1]. Maybe
we could share access to that MM helper with the video decoder somehow. It seems
the IOMMU API isn't tailored for such a use case, so having a custom
platform-specific API on top of the IOMMU API would probably be fine, and with
that we could have cross-device/driver context switching handled by the custom
API.

Please let me know if you have any other variants to suggest.

[0]
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/tree/include/drm/drm_mm.h
[1]
https://github.com/grate-driver/linux/commit/16e017efaa343e23e5a7d2d498915764cc806054

>
>> ÂÂÂÂÂÂÂÂÂ return NULL;
>> Â -ÂÂÂ gart_domain = kzalloc(sizeof(*gart_domain), GFP_KERNEL);
>> -ÂÂÂ if (!gart_domain)
>> -ÂÂÂÂÂÂÂ return NULL;
>> -
>> -ÂÂÂ gart_domain->gart = gart;
>> -ÂÂÂ gart_domain->domain.geometry.aperture_start = gart->iovmm_base;
>> -ÂÂÂ gart_domain->domain.geometry.aperture_end = gart->iovmm_base +
>> +ÂÂÂ gart->domain = kzalloc(sizeof(*gart->domain), GFP_KERNEL);
>> +ÂÂÂ if (gart->domain) {
>> +ÂÂÂÂÂÂÂ gart->domain->domain.geometry.aperture_start = gart->iovmm_base;
>> +ÂÂÂÂÂÂÂ gart->domain->domain.geometry.aperture_end = gart->iovmm_base +
>> ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ gart->page_count * GART_PAGE_SIZE - 1;
>> -ÂÂÂ gart_domain->domain.geometry.force_aperture = true;
>> +ÂÂÂÂÂÂÂ gart->domain->domain.geometry.force_aperture = true;
>> +ÂÂÂÂÂÂÂ gart->domain->gart = gart;
>> +ÂÂÂ }
>> Â -ÂÂÂ return &gart_domain->domain;
>> +ÂÂÂ return &gart->domain->domain;
>> Â }
>> Â Â static void gart_iommu_domain_free(struct iommu_domain *domain)
>> @@ -251,18 +195,7 @@ static void gart_iommu_domain_free(struct iommu_domain
>> *domain)
>> ÂÂÂÂÂ struct gart_domain *gart_domain = to_gart_domain(domain);
>> ÂÂÂÂÂ struct gart_device *gart = gart_domain->gart;
>> Â -ÂÂÂ if (gart) {
>> -ÂÂÂÂÂÂÂ spin_lock(&gart->client_lock);
>> -ÂÂÂÂÂÂÂ if (!list_empty(&gart->client)) {
>> -ÂÂÂÂÂÂÂÂÂÂÂ struct gart_client *c;
>> -
>> -ÂÂÂÂÂÂÂÂÂÂÂ list_for_each_entry(c, &gart->client, list)
>> -ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ gart_iommu_detach_dev(domain, c->dev);
>> -ÂÂÂÂÂÂÂ }
>> -ÂÂÂÂÂÂÂ spin_unlock(&gart->client_lock);
>> -ÂÂÂ }
>> -
>> -ÂÂÂ kfree(gart_domain);
>> +ÂÂÂ kfree(gart->domain);
>> Â }
>> Â Â static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova,
>> @@ -377,7 +310,7 @@ struct iommu_group *gart_iommu_device_group(struct device
>> *dev)
>> ÂÂÂÂÂ if (err)
>> ÂÂÂÂÂÂÂÂÂ return ERR_PTR(err);
>> Â -ÂÂÂ return generic_device_group(dev);
>> +ÂÂÂ return gart_handle->group;
>
> You should take a reference per device, i.e.:
>
>     return iommu_group_ref_get(gart_handle->group);
>
> otherwise removing devices could unbalance things and result in the group
> getting freed prematurely.

It seems the more correct fix would be to remove the iommu_group_put() call
from gart_iommu_add_device().

>
>> Â }
>> Â Â static int gart_iommu_of_xlate(struct device *dev,
>> @@ -502,8 +435,6 @@ static int tegra_gart_probe(struct platform_device *pdev)
>> Â ÂÂÂÂÂ gart->dev = &pdev->dev;
>> ÂÂÂÂÂ spin_lock_init(&gart->pte_lock);
>> -ÂÂÂ spin_lock_init(&gart->client_lock);
>> -ÂÂÂ INIT_LIST_HEAD(&gart->client);
>> ÂÂÂÂÂ gart->regs = gart_regs;
>> ÂÂÂÂÂ gart->iovmm_base = (dma_addr_t)res_remap->start;
>> ÂÂÂÂÂ gart->page_count = (resource_size(res_remap) >> GART_PAGE_SHIFT);
>> @@ -517,6 +448,14 @@ static int tegra_gart_probe(struct platform_device *pdev)
>> ÂÂÂÂÂÂÂÂÂ goto iommu_unregister;
>> ÂÂÂÂÂ }
>> Â +ÂÂÂ gart->group = iommu_group_alloc();
>> +ÂÂÂ if (IS_ERR(gart->group)) {
>> +ÂÂÂÂÂÂÂ ret = PTR_ERR(gart->group);
>> +ÂÂÂÂÂÂÂ goto free_savedata;
>> +ÂÂÂ }
>> +
>> +ÂÂÂ iommu_group_ref_get(gart->group);
>
> You already hold the initial reference from iommu_group_alloc(), so there's no
> need to take a second one at this point.

Yes, it looks like this refcount bump isn't needed here. I'll revisit the
refcounting and correct it in v2 where necessary.

Thank you very much for the review.