Re: [PATCH net-next 09/11] sfc: implement iova rbtree to store dma mappings

From: Jason Wang
Date: Wed Dec 14 2022 - 01:48:06 EST


On Wed, Dec 7, 2022 at 10:57 PM Gautam Dawar <gautam.dawar@xxxxxxx> wrote:
>
> sfc uses a MCDI DMA buffer that is allocated on the host
> for communicating with the Firmware. The MCDI buffer IOVA
> could overlap with the IOVA used by the guest for the
> virtqueue buffers. To detect such overlap, the DMA mappings
> from the guest will be stored in a IOVA rbtree and every
> such mapping will be compared against the MCDI buffer IOVA
> range. If an overlap is detected, the MCDI buffer will be
> relocated to a different IOVA.

I think it can't prevent guests from guessing the MCDI buffer address
and trying to DMA to/from that buffer.

It might work with some co-operation of the NIC who can validate the
DMA initialized by the virtqueue and forbid the DMA to the MDCI
buffer.

Thanks


>
> Signed-off-by: Gautam Dawar <gautam.dawar@xxxxxxx>
> ---
> drivers/net/ethernet/sfc/Makefile | 3 +-
> drivers/net/ethernet/sfc/ef100_iova.c | 205 ++++++++++++++++++++++
> drivers/net/ethernet/sfc/ef100_iova.h | 40 +++++
> drivers/net/ethernet/sfc/ef100_nic.c | 1 -
> drivers/net/ethernet/sfc/ef100_vdpa.c | 38 ++++
> drivers/net/ethernet/sfc/ef100_vdpa.h | 15 ++
> drivers/net/ethernet/sfc/ef100_vdpa_ops.c | 5 +
> drivers/net/ethernet/sfc/mcdi.h | 3 +
> 8 files changed, 308 insertions(+), 2 deletions(-)
> create mode 100644 drivers/net/ethernet/sfc/ef100_iova.c
> create mode 100644 drivers/net/ethernet/sfc/ef100_iova.h
>
> diff --git a/drivers/net/ethernet/sfc/Makefile b/drivers/net/ethernet/sfc/Makefile
> index a10eac91ab23..85852ff50b7c 100644
> --- a/drivers/net/ethernet/sfc/Makefile
> +++ b/drivers/net/ethernet/sfc/Makefile
> @@ -11,7 +11,8 @@ sfc-$(CONFIG_SFC_MTD) += mtd.o
> sfc-$(CONFIG_SFC_SRIOV) += sriov.o ef10_sriov.o ef100_sriov.o ef100_rep.o \
> mae.o tc.o tc_bindings.o tc_counters.o
>
> -sfc-$(CONFIG_SFC_VDPA) += mcdi_vdpa.o ef100_vdpa.o ef100_vdpa_ops.o
> +sfc-$(CONFIG_SFC_VDPA) += mcdi_vdpa.o ef100_vdpa.o ef100_vdpa_ops.o \
> + ef100_iova.o
> obj-$(CONFIG_SFC) += sfc.o
>
> obj-$(CONFIG_SFC_FALCON) += falcon/
> diff --git a/drivers/net/ethernet/sfc/ef100_iova.c b/drivers/net/ethernet/sfc/ef100_iova.c
> new file mode 100644
> index 000000000000..863314c5b9b5
> --- /dev/null
> +++ b/drivers/net/ethernet/sfc/ef100_iova.c
> @@ -0,0 +1,205 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Driver for Xilinx network controllers and boards
> + * Copyright(C) 2020-2022 Xilinx, Inc.
> + * Copyright(C) 2022 Advanced Micro Devices, Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License version 2 as published
> + * by the Free Software Foundation, incorporated herein by reference.
> + */
> +
> +#include "ef100_iova.h"
> +
> +static void update_free_list_node(struct ef100_vdpa_iova_node *target_node,
> + struct ef100_vdpa_iova_node *next_node,
> + struct ef100_vdpa_nic *vdpa_nic)
> +{
> + unsigned long target_node_end;
> + unsigned long free_area;
> + bool in_list;
> +
> + target_node_end = target_node->iova + target_node->size;
> + free_area = next_node->iova - target_node_end;
> + in_list = !(list_empty(&target_node->free_node));
> +
> + if (!in_list && free_area >= MCDI_BUF_LEN) {
> + list_add(&target_node->free_node,
> + &vdpa_nic->free_list);
> + } else if (in_list && free_area < MCDI_BUF_LEN) {
> + list_del_init(&target_node->free_node);
> + }
> +}
> +
> +static void update_free_list(struct ef100_vdpa_iova_node *iova_node,
> + struct ef100_vdpa_nic *vdpa_nic,
> + bool add_node)
> +{
> + struct ef100_vdpa_iova_node *prev_in = NULL;
> + struct ef100_vdpa_iova_node *next_in = NULL;
> + struct rb_node *prev_node;
> + struct rb_node *next_node;
> +
> + prev_node = rb_prev(&iova_node->node);
> + next_node = rb_next(&iova_node->node);
> +
> + if (prev_node)
> + prev_in = rb_entry(prev_node,
> + struct ef100_vdpa_iova_node, node);
> + if (next_node)
> + next_in = rb_entry(next_node,
> + struct ef100_vdpa_iova_node, node);
> +
> + if (add_node) {
> + if (prev_in)
> + update_free_list_node(prev_in, iova_node, vdpa_nic);
> +
> + if (next_in)
> + update_free_list_node(iova_node, next_in, vdpa_nic);
> + } else {
> + if (next_in && prev_in)
> + update_free_list_node(prev_in, next_in, vdpa_nic);
> + if (!list_empty(&iova_node->free_node))
> + list_del_init(&iova_node->free_node);
> + }
> +}
> +
> +int efx_ef100_insert_iova_node(struct ef100_vdpa_nic *vdpa_nic,
> + u64 iova, u64 size)
> +{
> + struct ef100_vdpa_iova_node *iova_node;
> + struct ef100_vdpa_iova_node *new_node;
> + struct rb_node *parent;
> + struct rb_node **link;
> + struct rb_root *root;
> + int rc = 0;
> +
> + mutex_lock(&vdpa_nic->iova_lock);
> +
> + root = &vdpa_nic->iova_root;
> + link = &root->rb_node;
> + parent = *link;
> + /* Go to the bottom of the tree */
> + while (*link) {
> + parent = *link;
> + iova_node = rb_entry(parent, struct ef100_vdpa_iova_node, node);
> +
> + /* handle duplicate node */
> + if (iova_node->iova == iova) {
> + rc = -EEXIST;
> + goto out_unlock;
> + }
> +
> + if (iova_node->iova > iova)
> + link = &(*link)->rb_left;
> + else
> + link = &(*link)->rb_right;
> + }
> +
> + new_node = kzalloc(sizeof(*new_node), GFP_KERNEL);
> + if (!new_node) {
> + rc = -ENOMEM;
> + goto out_unlock;
> + }
> +
> + new_node->iova = iova;
> + new_node->size = size;
> + INIT_LIST_HEAD(&new_node->free_node);
> +
> + /* Put the new node here */
> + rb_link_node(&new_node->node, parent, link);
> + rb_insert_color(&new_node->node, root);
> +
> + update_free_list(new_node, vdpa_nic, true);
> +
> +out_unlock:
> + mutex_unlock(&vdpa_nic->iova_lock);
> + return rc;
> +}
> +
> +static struct ef100_vdpa_iova_node*
> +ef100_rbt_search_node(struct ef100_vdpa_nic *vdpa_nic,
> + unsigned long iova)
> +{
> + struct ef100_vdpa_iova_node *iova_node;
> + struct rb_node *rb_node;
> + struct rb_root *root;
> +
> + root = &vdpa_nic->iova_root;
> + if (!root)
> + return NULL;
> +
> + rb_node = root->rb_node;
> +
> + while (rb_node) {
> + iova_node = rb_entry(rb_node, struct ef100_vdpa_iova_node,
> + node);
> + if (iova_node->iova > iova)
> + rb_node = rb_node->rb_left;
> + else if (iova_node->iova < iova)
> + rb_node = rb_node->rb_right;
> + else
> + return iova_node;
> + }
> +
> + return NULL;
> +}
> +
> +void efx_ef100_remove_iova_node(struct ef100_vdpa_nic *vdpa_nic,
> + unsigned long iova)
> +{
> + struct ef100_vdpa_iova_node *iova_node;
> +
> + mutex_lock(&vdpa_nic->iova_lock);
> + iova_node = ef100_rbt_search_node(vdpa_nic, iova);
> + if (!iova_node)
> + goto out_unlock;
> +
> + update_free_list(iova_node, vdpa_nic, false);
> +
> + rb_erase(&iova_node->node, &vdpa_nic->iova_root);
> + kfree(iova_node);
> +
> +out_unlock:
> + mutex_unlock(&vdpa_nic->iova_lock);
> +}
> +
> +void efx_ef100_delete_iova(struct ef100_vdpa_nic *vdpa_nic)
> +{
> + struct ef100_vdpa_iova_node *iova_node;
> + struct rb_root *iova_root;
> + struct rb_node *node;
> +
> + mutex_lock(&vdpa_nic->iova_lock);
> +
> + iova_root = &vdpa_nic->iova_root;
> + while (!RB_EMPTY_ROOT(iova_root)) {
> + node = rb_first(iova_root);
> + iova_node = rb_entry(node, struct ef100_vdpa_iova_node, node);
> + if (!list_empty(&iova_node->free_node))
> + list_del_init(&iova_node->free_node);
> + if (vdpa_nic->domain)
> + iommu_unmap(vdpa_nic->domain, iova_node->iova,
> + iova_node->size);
> + rb_erase(node, iova_root);
> + kfree(iova_node);
> + }
> +
> + mutex_unlock(&vdpa_nic->iova_lock);
> +}
> +
> +int efx_ef100_find_new_iova(struct ef100_vdpa_nic *vdpa_nic,
> + unsigned int buf_len,
> + u64 *new_iova)
> +{
> + struct ef100_vdpa_iova_node *iova_node;
> +
> + /* pick the first node from freelist */
> + iova_node = list_first_entry_or_null(&vdpa_nic->free_list,
> + struct ef100_vdpa_iova_node,
> + free_node);
> + if (!iova_node)
> + return -ENOENT;
> +
> + *new_iova = iova_node->iova + iova_node->size;
> + return 0;
> +}
> diff --git a/drivers/net/ethernet/sfc/ef100_iova.h b/drivers/net/ethernet/sfc/ef100_iova.h
> new file mode 100644
> index 000000000000..68e39c4152c7
> --- /dev/null
> +++ b/drivers/net/ethernet/sfc/ef100_iova.h
> @@ -0,0 +1,40 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/* Driver for Xilinx network controllers and boards
> + * Copyright(C) 2020-2022 Xilinx, Inc.
> + * Copyright(C) 2022 Advanced Micro Devices, Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License version 2 as published
> + * by the Free Software Foundation, incorporated herein by reference.
> + */
> +#ifndef EFX_EF100_IOVA_H
> +#define EFX_EF100_IOVA_H
> +
> +#include "ef100_nic.h"
> +#include "ef100_vdpa.h"
> +
> +#if defined(CONFIG_SFC_VDPA)
> +/**
> + * struct ef100_vdpa_iova_node - guest buffer iova entry
> + *
> + * @node: red black tree node
> + * @iova: mapping's IO virtual address
> + * @size: length of mapped region in bytes
> + * @free_node: free list node
> + */
> +struct ef100_vdpa_iova_node {
> + struct rb_node node;
> + unsigned long iova;
> + size_t size;
> + struct list_head free_node;
> +};
> +
> +int efx_ef100_insert_iova_node(struct ef100_vdpa_nic *vdpa_nic,
> + u64 iova, u64 size);
> +void efx_ef100_remove_iova_node(struct ef100_vdpa_nic *vdpa_nic,
> + unsigned long iova);
> +void efx_ef100_delete_iova(struct ef100_vdpa_nic *vdpa_nic);
> +int efx_ef100_find_new_iova(struct ef100_vdpa_nic *vdpa_nic,
> + unsigned int buf_len, u64 *new_iova);
> +#endif /* CONFIG_SFC_VDPA */
> +#endif /* EFX_EF100_IOVA_H */
> diff --git a/drivers/net/ethernet/sfc/ef100_nic.c b/drivers/net/ethernet/sfc/ef100_nic.c
> index 41811c519275..72820d2fe19d 100644
> --- a/drivers/net/ethernet/sfc/ef100_nic.c
> +++ b/drivers/net/ethernet/sfc/ef100_nic.c
> @@ -33,7 +33,6 @@
>
> #define EF100_MAX_VIS 4096
> #define EF100_NUM_MCDI_BUFFERS 1
> -#define MCDI_BUF_LEN (8 + MCDI_CTL_SDU_LEN_MAX)
>
> #define EF100_RESET_PORT ((ETH_RESET_MAC | ETH_RESET_PHY) << ETH_RESET_SHARED_SHIFT)
>
> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa.c b/drivers/net/ethernet/sfc/ef100_vdpa.c
> index 80bca281a748..b9368eb1acd5 100644
> --- a/drivers/net/ethernet/sfc/ef100_vdpa.c
> +++ b/drivers/net/ethernet/sfc/ef100_vdpa.c
> @@ -14,6 +14,7 @@
> #include <uapi/linux/vdpa.h>
> #include "ef100_vdpa.h"
> #include "mcdi_vdpa.h"
> +#include "ef100_iova.h"
> #include "mcdi_filters.h"
> #include "mcdi_functions.h"
> #include "ef100_netdev.h"
> @@ -280,6 +281,34 @@ static int get_net_config(struct ef100_vdpa_nic *vdpa_nic)
> return 0;
> }
>
> +static int vdpa_update_domain(struct ef100_vdpa_nic *vdpa_nic)
> +{
> + struct vdpa_device *vdpa = &vdpa_nic->vdpa_dev;
> + struct iommu_domain_geometry *geo;
> + struct device *dma_dev;
> +
> + dma_dev = vdpa_get_dma_dev(vdpa);
> + if (!device_iommu_capable(dma_dev, IOMMU_CAP_CACHE_COHERENCY))
> + return -EOPNOTSUPP;
> +
> + vdpa_nic->domain = iommu_get_domain_for_dev(dma_dev);
> + if (!vdpa_nic->domain)
> + return -ENODEV;
> +
> + geo = &vdpa_nic->domain->geometry;
> + /* save the geo aperture range for validation in dma_map */
> + vdpa_nic->geo_aper_start = geo->aperture_start;
> +
> + /* Handle the boundary case */
> + if (geo->aperture_end == ~0ULL)
> + geo->aperture_end -= 1;
> + vdpa_nic->geo_aper_end = geo->aperture_end;
> +
> + /* insert a sentinel node */
> + return efx_ef100_insert_iova_node(vdpa_nic,
> + vdpa_nic->geo_aper_end + 1, 0);
> +}
> +
> static struct ef100_vdpa_nic *ef100_vdpa_create(struct efx_nic *efx,
> const char *dev_name,
> enum ef100_vdpa_class dev_type,
> @@ -316,6 +345,7 @@ static struct ef100_vdpa_nic *ef100_vdpa_create(struct efx_nic *efx,
> }
>
> mutex_init(&vdpa_nic->lock);
> + mutex_init(&vdpa_nic->iova_lock);
> dev = &vdpa_nic->vdpa_dev.dev;
> efx->vdpa_nic = vdpa_nic;
> vdpa_nic->vdpa_dev.dma_dev = &efx->pci_dev->dev;
> @@ -325,9 +355,11 @@ static struct ef100_vdpa_nic *ef100_vdpa_create(struct efx_nic *efx,
> vdpa_nic->pf_index = nic_data->pf_index;
> vdpa_nic->vf_index = nic_data->vf_index;
> vdpa_nic->vdpa_state = EF100_VDPA_STATE_INITIALIZED;
> + vdpa_nic->iova_root = RB_ROOT;
> vdpa_nic->mac_address = (u8 *)&vdpa_nic->net_config.mac;
> ether_addr_copy(vdpa_nic->mac_address, mac);
> vdpa_nic->mac_configured = true;
> + INIT_LIST_HEAD(&vdpa_nic->free_list);
>
> for (i = 0; i < EF100_VDPA_MAC_FILTER_NTYPES; i++)
> vdpa_nic->filters[i].filter_id = EFX_INVALID_FILTER_ID;
> @@ -353,6 +385,12 @@ static struct ef100_vdpa_nic *ef100_vdpa_create(struct efx_nic *efx,
> goto err_put_device;
> }
>
> + rc = vdpa_update_domain(vdpa_nic);
> + if (rc) {
> + pci_err(efx->pci_dev, "update_domain failed, err: %d\n", rc);
> + goto err_put_device;
> + }
> +
> rc = get_net_config(vdpa_nic);
> if (rc)
> goto err_put_device;
> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa.h b/drivers/net/ethernet/sfc/ef100_vdpa.h
> index 1b0bbba88154..c3c77029973d 100644
> --- a/drivers/net/ethernet/sfc/ef100_vdpa.h
> +++ b/drivers/net/ethernet/sfc/ef100_vdpa.h
> @@ -12,7 +12,9 @@
> #define __EF100_VDPA_H__
>
> #include <linux/vdpa.h>
> +#include <linux/iommu.h>
> #include <uapi/linux/virtio_net.h>
> +#include <linux/rbtree.h>
> #include "net_driver.h"
> #include "ef100_nic.h"
>
> @@ -155,6 +157,12 @@ struct ef100_vdpa_filter {
> * @mac_configured: true after MAC address is configured
> * @filters: details of all filters created on this vdpa device
> * @cfg_cb: callback for config change
> + * @domain: IOMMU domain
> + * @iova_root: iova rbtree root
> + * @iova_lock: lock to synchronize updates to rbtree and freelist
> + * @free_list: list to store free iova areas of size >= MCDI buffer length
> + * @geo_aper_start: start of valid IOVA range
> + * @geo_aper_end: end of valid IOVA range
> */
> struct ef100_vdpa_nic {
> struct vdpa_device vdpa_dev;
> @@ -174,6 +182,13 @@ struct ef100_vdpa_nic {
> bool mac_configured;
> struct ef100_vdpa_filter filters[EF100_VDPA_MAC_FILTER_NTYPES];
> struct vdpa_callback cfg_cb;
> + struct iommu_domain *domain;
> + struct rb_root iova_root;
> + /* mutex to synchronize rbtree operations */
> + struct mutex iova_lock;
> + struct list_head free_list;
> + u64 geo_aper_start;
> + u64 geo_aper_end;
> };
>
> int ef100_vdpa_init(struct efx_probe_data *probe_data);
> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa_ops.c b/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
> index 718b67f6da90..8c198d949fdb 100644
> --- a/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
> +++ b/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
> @@ -10,6 +10,7 @@
>
> #include <linux/vdpa.h>
> #include "ef100_vdpa.h"
> +#include "ef100_iova.h"
> #include "io.h"
> #include "mcdi_vdpa.h"
>
> @@ -260,6 +261,7 @@ static void ef100_reset_vdpa_device(struct ef100_vdpa_nic *vdpa_nic)
> if (!vdpa_nic->status)
> return;
>
> + efx_ef100_delete_iova(vdpa_nic);
> vdpa_nic->vdpa_state = EF100_VDPA_STATE_INITIALIZED;
> vdpa_nic->status = 0;
> vdpa_nic->features = 0;
> @@ -743,9 +745,12 @@ static void ef100_vdpa_free(struct vdpa_device *vdev)
> int i;
>
> if (vdpa_nic) {
> + /* clean-up the mappings and iova tree */
> + efx_ef100_delete_iova(vdpa_nic);
> for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++)
> reset_vring(vdpa_nic, i);
> ef100_vdpa_irq_vectors_free(vdpa_nic->efx->pci_dev);
> + mutex_destroy(&vdpa_nic->iova_lock);
> mutex_destroy(&vdpa_nic->lock);
> vdpa_nic->efx->vdpa_nic = NULL;
> }
> diff --git a/drivers/net/ethernet/sfc/mcdi.h b/drivers/net/ethernet/sfc/mcdi.h
> index db4ca4975ada..7d977a58a0df 100644
> --- a/drivers/net/ethernet/sfc/mcdi.h
> +++ b/drivers/net/ethernet/sfc/mcdi.h
> @@ -7,6 +7,7 @@
> #ifndef EFX_MCDI_H
> #define EFX_MCDI_H
>
> +#include "mcdi_pcol.h"
> /**
> * enum efx_mcdi_state - MCDI request handling state
> * @MCDI_STATE_QUIESCENT: No pending MCDI requests. If the caller holds the
> @@ -40,6 +41,8 @@ enum efx_mcdi_mode {
> MCDI_MODE_FAIL,
> };
>
> +#define MCDI_BUF_LEN (8 + MCDI_CTL_SDU_LEN_MAX)
> +
> /**
> * struct efx_mcdi_iface - MCDI protocol context
> * @efx: The associated NIC.
> --
> 2.30.1
>