[RFCv2 PATCH 6/7] iommu/amd: Add nested domain allocation support

From: Suravee Suthikulpanit
Date: Thu Jan 11 2024 - 19:09:11 EST


To support nested translation, the parent domain is allocated with the flag
IOMMU_HWPT_ALLOC_NEST_PARENT and stores information about the v1 page table
for stage 2 (i.e. GPA->SPA), whereas the child domain stores information
about the GCR3 root pointer table for stage 1 (i.e. GVA->GPA).

Modify the current driver to handle domain allocation with type
IOMMU_DOMAIN_NESTED. Also, when allocating the child domain (i.e. when a
parent domain is specified), keep track of the parent using struct
protection_domain.parent.

Note that the current implementation requires the AMD IOMMU GCR3TRPMode
feature, which programs DTE[GCR3 Table Root Pointer] with the GPA provided by
the guest via struct iommu_hwpt_amd_v2, which is passed as a parameter to
struct iommu_ops.domain_alloc_user().

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@xxxxxxx>
---
drivers/iommu/amd/Makefile | 2 +-
drivers/iommu/amd/amd_iommu.h | 10 +++
drivers/iommu/amd/amd_iommu_types.h | 6 ++
drivers/iommu/amd/iommu.c | 96 ++++++++++++++++++++++++++---
drivers/iommu/amd/nested.c | 75 ++++++++++++++++++++++
5 files changed, 181 insertions(+), 8 deletions(-)
create mode 100644 drivers/iommu/amd/nested.c

diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile
index f454fbb1569e..447cb6bb48eb 100644
--- a/drivers/iommu/amd/Makefile
+++ b/drivers/iommu/amd/Makefile
@@ -1,3 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_AMD_IOMMU) += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o
+obj-$(CONFIG_AMD_IOMMU) += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o nested.o
obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 4118129f4a24..bb25d7c3bff5 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -7,6 +7,7 @@
#ifndef AMD_IOMMU_H
#define AMD_IOMMU_H

+#include <uapi/linux/iommufd.h>
#include <linux/iommu.h>

#include "amd_iommu_types.h"
@@ -182,4 +183,13 @@ void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
struct dev_table_entry *get_dev_table(struct amd_iommu *iommu);

extern bool amd_iommu_snp_en;
+
+/* NESTED */
+struct protection_domain *to_pdomain(struct iommu_domain *dom);
+bool amd_iommu_domain_is_nested(struct protection_domain *pdom);
+struct iommu_domain *
+amd_iommu_nested_domain_alloc(struct device *dev, unsigned int type, u32 flags,
+ struct iommu_hwpt_amd_v2 *hwpt,
+ struct iommu_domain *parent);
+
#endif
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index f8baa8d88832..db77b050a496 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -110,6 +110,8 @@
#define FEATURE_PASMAX_MASK (0x1FULL << FEATURE_PASMAX_SHIFT)

/* Extended Feature 2 Bits */
+#define FEATURE_GCR3TRPMODE BIT_ULL(3)
+
#define FEATURE_SNPAVICSUP_SHIFT 5
#define FEATURE_SNPAVICSUP_MASK (0x07ULL << FEATURE_SNPAVICSUP_SHIFT)
#define FEATURE_SNPAVICSUP_GAM(x) \
@@ -535,6 +537,7 @@ struct amd_irte_ops;

struct gcr3_tbl_info {
u64 *gcr3_tbl; /* Guest CR3 table */
+ u64 trp_gpa; /* Guest CR3 TRP GPA for nested domain */
int glx; /* Number of levels for GCR3 table */
u32 pasid_cnt; /* Track attached PASIDs */
bool giov; /* Track DTE[GIOV] */
@@ -569,6 +572,9 @@ struct protection_domain {
bool dirty_tracking; /* dirty tracking is enabled in the domain */
unsigned dev_cnt; /* devices assigned to this domain */
unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
+ struct protection_domain *parent; /* Nested parent domain */
+ u16 guest_paging_mode; /* Guest paging mode */
+ u16 guest_domain_id; /* Guest domain ID */
};

/*
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 134f4af921dc..51716fa5ccb5 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -77,11 +77,16 @@ struct iommu_cmd {

struct kmem_cache *amd_iommu_irq_cache;

+static int amd_iommu_attach_device(struct iommu_domain *dom,
+ struct device *dev);
+
static void detach_device(struct device *dev);

static void set_dte_entry(struct amd_iommu *iommu,
struct iommu_dev_data *dev_data);

+static void amd_iommu_domain_free(struct iommu_domain *dom);
+
/****************************************************************************
*
* Helper functions
@@ -191,7 +196,7 @@ static struct amd_iommu *rlookup_amd_iommu(struct device *dev)
return __rlookup_amd_iommu(seg, PCI_SBDF_TO_DEVID(devid));
}

-static struct protection_domain *to_pdomain(struct iommu_domain *dom)
+struct protection_domain *to_pdomain(struct iommu_domain *dom)
{
return container_of(dom, struct protection_domain, domain);
}
@@ -2367,8 +2372,9 @@ static struct protection_domain *protection_domain_alloc(unsigned int type)
domain->nid = NUMA_NO_NODE;

switch (type) {
- /* No need to allocate io pgtable ops in passthrough mode */
+ /* No need to allocate io pgtable ops in passthrough and nested mode */
case IOMMU_DOMAIN_IDENTITY:
+ case IOMMU_DOMAIN_NESTED:
return domain;
case IOMMU_DOMAIN_DMA:
pgtable = amd_iommu_pgtable;
@@ -2423,7 +2429,12 @@ static bool amd_iommu_hd_support(struct amd_iommu *iommu)
return iommu && (iommu->features & FEATURE_HDSUP);
}

-static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
+static const struct iommu_domain_ops nested_domain_ops = { /* domain ops for IOMMU_DOMAIN_NESTED */
+ .attach_dev = amd_iommu_attach_device,
+ .free = amd_iommu_domain_free,
+};
+
+struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
struct device *dev, u32 flags)
{
bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
@@ -2454,7 +2465,10 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
if (iommu) {
domain->domain.type = type;
domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap;
- domain->domain.ops = iommu->iommu.ops->default_domain_ops;
+ if (type == IOMMU_DOMAIN_NESTED)
+ domain->domain.ops = &nested_domain_ops;
+ else
+ domain->domain.ops = iommu->iommu.ops->default_domain_ops;

if (dirty_tracking)
domain->domain.dirty_ops = &amd_dirty_ops;
@@ -2474,18 +2488,86 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
return domain;
}

+static int udata_to_iommu_hwpt_amd_v2(const struct iommu_user_data *user_data,
+				      struct iommu_hwpt_amd_v2 *hwpt)
+{
+	/* Nothing to copy from: the caller supplied no user data at all */
+	if (!user_data)
+		return -EINVAL;
+	/* Only the AMD v2 data layout can be copied into @hwpt */
+	if (user_data->type != IOMMU_HWPT_DATA_AMD_V2)
+		return -EOPNOTSUPP;
+
+	return iommu_copy_struct_from_user(hwpt, user_data,
+					   IOMMU_HWPT_DATA_AMD_V2, __reserved);
+}
+
+/* Report whether the nesting-related bits of @flags can be honoured. */
+static bool check_nested_support(u32 flags)
+{
+	/* Only a nest-parent request depends on the extra features below */
+	if (!(flags & IOMMU_HWPT_ALLOC_NEST_PARENT))
+		return true;
+
+	/* All three features must be present; checked in the original order */
+	return check_feature(FEATURE_GT) &&
+	       check_feature(FEATURE_GIOSUP) &&
+	       check_feature2(FEATURE_GCR3TRPMODE);
+}
+
+/* HWPT allocation flags accepted by amd_iommu_domain_alloc_user(); read-only. */
+static const u32 amd_iommu_hwpt_supported_flags =
+	IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_NEST_PARENT;
+
static struct iommu_domain *
amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
struct iommu_domain *parent,
const struct iommu_user_data *user_data)
-
{
+	struct iommu_domain *dom;
+	struct iommu_dev_data *dev_data;
unsigned int type = IOMMU_DOMAIN_UNMANAGED;
+	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
+
+	if (parent) {	/* child domain: built from @user_data on top of @parent */
+		int ret;
+		struct iommu_hwpt_amd_v2 hwpt;
+
+		if (parent->ops != amd_iommu_ops.default_domain_ops)
+			return ERR_PTR(-EINVAL);
+
+		ret = udata_to_iommu_hwpt_amd_v2(user_data, &hwpt);
+		if (ret)
+			return ERR_PTR(ret);

- if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data)
+		return amd_iommu_nested_domain_alloc(dev, type, flags,
+						     &hwpt, parent);
+	}
+
+	/* No parent: reject unknown flags and, as before, any user data */
+	if ((flags & ~amd_iommu_hwpt_supported_flags) || user_data ||
+	    !check_nested_support(flags))
return ERR_PTR(-EOPNOTSUPP);

- return do_iommu_domain_alloc(type, dev, flags);
+	dev_data = dev_iommu_priv_get(dev);
+
+	/*
+	 * When allocating a nested parent domain, the device may already be
+	 * attached to a domain (e.g. one allocated by VFIO that holds the
+	 * GPA->SPA mapping). In such case, return that same domain.
+	 * NOTE(review): nothing here marks it shared - confirm who frees it.
+	 */
+	if (dev_data->domain && nested_parent) {
+		pr_debug("%s: Found exist: protection domain id=%#x\n",
+			 __func__, dev_data->domain->id);
+		dom = &dev_data->domain->domain;
+	} else {
+		dom = do_iommu_domain_alloc(type, dev, flags);
+		if (IS_ERR(dom))	/* failures arrive as ERR_PTR(), not NULL */
+			return dom;
+	}
+
+	return dom;
}

static void amd_iommu_domain_free(struct iommu_domain *dom)
diff --git a/drivers/iommu/amd/nested.c b/drivers/iommu/amd/nested.c
new file mode 100644
index 000000000000..1addcb21a38c
--- /dev/null
+++ b/drivers/iommu/amd/nested.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 Advanced Micro Devices, Inc.
+ * Author: Suravee Suthikulpanit <suravee.suthikulpanit@xxxxxxx>
+ */
+
+#define pr_fmt(fmt) "AMD-Vi: " fmt
+#define dev_fmt(fmt) pr_fmt(fmt)
+
+#include <linux/iommu.h>
+#include <uapi/linux/iommufd.h>
+
+#include "amd_iommu.h"
+
+bool amd_iommu_domain_is_nested(struct protection_domain *pdom)
+{
+	return pdom && pdom->parent;	/* nested iff a parent domain is recorded */
+}
+
+/* Record @ppdom and the guest GCR3 settings from @hwpt; returns 0 or -errno. */
+static int nested_gcr3_update(struct iommu_hwpt_amd_v2 *hwpt,
+			      struct protection_domain *pdom,
+			      struct protection_domain *ppdom,
+			      struct device *dev)
+{
+	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
+
+	/* to_pci_dev() is container_of() and never NULL; test the bus instead */
+	if (!dev_is_pci(dev) || !dev_data)
+		return -EINVAL;
+
+	/* Note: Currently only support GCR3TRPMode with nested translation */
+	if (!check_feature2(FEATURE_GCR3TRPMODE))
+		return -EOPNOTSUPP;
+
+	pdom->parent = ppdom;
+	pdom->guest_domain_id = hwpt->gdom_id;
+	pdom->guest_paging_mode = hwpt->flags.guest_paging_mode;
+
+	dev_data->gcr3_info.trp_gpa = hwpt->gcr3;
+	dev_data->gcr3_info.glx = hwpt->flags.glx;
+	dev_data->gcr3_info.giov = hwpt->flags.giov;
+
+	return 0;
+}
+
+struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
+					    struct device *dev, u32 flags);
+
+/*
+ * Allocate a nested domain for @dev on top of @parent; ERR_PTR() on failure.
+ */
+struct iommu_domain *
+amd_iommu_nested_domain_alloc(struct device *dev, unsigned int type, u32 flags,
+			      struct iommu_hwpt_amd_v2 *hwpt,
+			      struct iommu_domain *parent)
+{
+	int ret;
+	struct iommu_domain *dom;
+
+	pr_debug("%s: Allocating nested domain with parent domid=%#x\n",
+		 __func__, to_pdomain(parent)->id);
+
+	dom = do_iommu_domain_alloc(IOMMU_DOMAIN_NESTED, dev, flags);
+	if (IS_ERR(dom))
+		return dom;	/* keep the allocator's errno, not a blanket -ENOMEM */
+
+	ret = nested_gcr3_update(hwpt, to_pdomain(dom), to_pdomain(parent), dev);
+	if (ret) {
+		iommu_domain_free(dom);
+		return ERR_PTR(ret);	/* e.g. -EOPNOTSUPP must not become -EINVAL */
+	}
+
+	return dom;
+}
--
2.34.1