Re: [PATCH -next] iommu/dma: avoid expensive indirect calls for sync operations

From: Robin Murphy
Date: Mon Nov 14 2022 - 08:52:24 EST


On 2022-11-14 13:30, Robin Murphy wrote:
On 2022-11-12 04:04, Eric Dumazet wrote:
Quite often, NIC devices do not need dma_sync operations,
at least on x86_64.

Indeed, when dev_is_dma_coherent(dev) is true and
dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
and friends do nothing.

However, indirectly calling them when CONFIG_RETPOLINE=y
consumes about 10% of cycles on a cpu receiving packets
from softirq at ~100Gbit rate, as shown in [1].

Even if/when CONFIG_RETPOLINE is not set, there
is a cost of about 3%.

This patch adds a copy of iommu_dma_ops structure,
where sync_single_for_cpu, sync_single_for_device,
sync_sg_for_cpu and sync_sg_for_device are unset.

TBH I reckon it might be worthwhile to add another top-level bitfield to struct device to indicate when syncs can be optimised out completely, so we can handle it at the DMA API dispatch level and short-circuit a bit more of the dma-direct path too.

perf profile before the patch:

     18.53%  [kernel]       [k] gq_rx_skb
     14.77%  [kernel]       [k] napi_reuse_skb
      8.95%  [kernel]       [k] skb_release_data
      5.42%  [kernel]       [k] dev_gro_receive
      5.37%  [kernel]       [k] memcpy
<*>  5.26%  [kernel]       [k] iommu_dma_sync_sg_for_cpu
      4.78%  [kernel]       [k] tcp_gro_receive
<*>  4.42%  [kernel]       [k] iommu_dma_sync_sg_for_device
      4.12%  [kernel]       [k] ipv6_gro_receive
      3.65%  [kernel]       [k] gq_pool_get
      3.25%  [kernel]       [k] skb_gro_receive
      2.07%  [kernel]       [k] napi_gro_frags
      1.98%  [kernel]       [k] tcp6_gro_receive
      1.27%  [kernel]       [k] gq_rx_prep_buffers
      1.18%  [kernel]       [k] gq_rx_napi_handler
      0.99%  [kernel]       [k] csum_partial
      0.74%  [kernel]       [k] csum_ipv6_magic
      0.72%  [kernel]       [k] free_pcp_prepare
      0.60%  [kernel]       [k] __napi_poll
      0.58%  [kernel]       [k] net_rx_action
      0.56%  [kernel]       [k] read_tsc
<*>  0.50%  [kernel]       [k] __x86_indirect_thunk_r11
      0.45%  [kernel]       [k] memset

After the patch, lines marked with <*> no longer show up, and overall
cpu usage looks much better (~60% instead of ~72%):

     25.56%  [kernel]       [k] gq_rx_skb
      9.90%  [kernel]       [k] napi_reuse_skb
      7.39%  [kernel]       [k] dev_gro_receive
      6.78%  [kernel]       [k] memcpy
      6.53%  [kernel]       [k] skb_release_data
      6.39%  [kernel]       [k] tcp_gro_receive
      5.71%  [kernel]       [k] ipv6_gro_receive
      4.35%  [kernel]       [k] napi_gro_frags
      4.34%  [kernel]       [k] skb_gro_receive
      3.50%  [kernel]       [k] gq_pool_get
      3.08%  [kernel]       [k] gq_rx_napi_handler
      2.35%  [kernel]       [k] tcp6_gro_receive
      2.06%  [kernel]       [k] gq_rx_prep_buffers
      1.32%  [kernel]       [k] csum_partial
      0.93%  [kernel]       [k] csum_ipv6_magic
      0.65%  [kernel]       [k] net_rx_action

Signed-off-by: Eric Dumazet <edumazet@xxxxxxxxxx>
Cc: Robin Murphy <robin.murphy@xxxxxxx>
Cc: Joerg Roedel <joro@xxxxxxxxxx>
Cc: Will Deacon <will@xxxxxxxxxx>
Cc: iommu@xxxxxxxxxxxxxxx
---
  drivers/iommu/dma-iommu.c | 67 +++++++++++++++++++++++++++------------
  1 file changed, 47 insertions(+), 20 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 9297b741f5e80e2408e864fc3f779410d6b04d49..976ba20a55eab5fd94e9bec2d38a2a60e0690444 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -522,6 +522,11 @@ static bool dev_use_swiotlb(struct device *dev)
      return IS_ENABLED(CONFIG_SWIOTLB) && dev_is_untrusted(dev);
  }
+static bool dev_is_dma_sync_needed(struct device *dev)
+{
+    return !dev_is_dma_coherent(dev) || dev_use_swiotlb(dev);
+}
+
  /**
   * iommu_dma_init_domain - Initialise a DMA mapping domain
   * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
@@ -914,7 +919,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev,
  {
      phys_addr_t phys;
-    if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev))
+    if (!dev_is_dma_sync_needed(dev))
          return;
      phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
@@ -930,7 +935,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev,
  {
      phys_addr_t phys;
-    if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev))
+    if (!dev_is_dma_sync_needed(dev))
          return;
      phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
@@ -1544,30 +1549,51 @@ static size_t iommu_dma_opt_mapping_size(void)
      return iova_rcache_range();
  }
+#define iommu_dma_ops_common_fields \
+    .flags            = DMA_F_PCI_P2PDMA_SUPPORTED,        \
+    .alloc            = iommu_dma_alloc,            \
+    .free            = iommu_dma_free,            \
+    .alloc_pages        = dma_common_alloc_pages,        \
+    .free_pages        = dma_common_free_pages,        \
+    .alloc_noncontiguous    = iommu_dma_alloc_noncontiguous,    \
+    .free_noncontiguous    = iommu_dma_free_noncontiguous,        \
+    .mmap            = iommu_dma_mmap,            \
+    .get_sgtable        = iommu_dma_get_sgtable,        \
+    .map_page        = iommu_dma_map_page,            \
+    .unmap_page        = iommu_dma_unmap_page,            \
+    .map_sg            = iommu_dma_map_sg,            \
+    .unmap_sg        = iommu_dma_unmap_sg,            \
+    .map_resource        = iommu_dma_map_resource,        \
+    .unmap_resource        = iommu_dma_unmap_resource,        \
+    .get_merge_boundary    = iommu_dma_get_merge_boundary,        \
+    .opt_mapping_size    = iommu_dma_opt_mapping_size,
+
  static const struct dma_map_ops iommu_dma_ops = {
-    .flags            = DMA_F_PCI_P2PDMA_SUPPORTED,
-    .alloc            = iommu_dma_alloc,
-    .free            = iommu_dma_free,
-    .alloc_pages        = dma_common_alloc_pages,
-    .free_pages        = dma_common_free_pages,
-    .alloc_noncontiguous    = iommu_dma_alloc_noncontiguous,
-    .free_noncontiguous    = iommu_dma_free_noncontiguous,
-    .mmap            = iommu_dma_mmap,
-    .get_sgtable        = iommu_dma_get_sgtable,
-    .map_page        = iommu_dma_map_page,
-    .unmap_page        = iommu_dma_unmap_page,
-    .map_sg            = iommu_dma_map_sg,
-    .unmap_sg        = iommu_dma_unmap_sg,
+    iommu_dma_ops_common_fields
+
      .sync_single_for_cpu    = iommu_dma_sync_single_for_cpu,
      .sync_single_for_device    = iommu_dma_sync_single_for_device,
      .sync_sg_for_cpu    = iommu_dma_sync_sg_for_cpu,
      .sync_sg_for_device    = iommu_dma_sync_sg_for_device,
-    .map_resource        = iommu_dma_map_resource,
-    .unmap_resource        = iommu_dma_unmap_resource,
-    .get_merge_boundary    = iommu_dma_get_merge_boundary,
-    .opt_mapping_size    = iommu_dma_opt_mapping_size,
  };
+/* Special instance of iommu_dma_ops for devices satisfying this condition:
+ *   !dev_is_dma_sync_needed(dev)
+ *
+ * iommu_dma_sync_single_for_cpu(), iommu_dma_sync_single_for_device(),
+ * iommu_dma_sync_sg_for_cpu(), iommu_dma_sync_sg_for_device()
+ * do nothing special and can be avoided, saving indirect calls.
+ */
+static const struct dma_map_ops iommu_nosync_dma_ops = {
+    iommu_dma_ops_common_fields
+
+    .sync_single_for_cpu    = NULL,
+    .sync_single_for_device    = NULL,
+    .sync_sg_for_cpu    = NULL,
+    .sync_sg_for_device    = NULL,
+};
+#undef iommu_dma_ops_common_fields
+
  /*
   * The IOMMU core code allocates the default DMA domain, which the underlying
   * IOMMU driver needs to support via the dma-iommu layer.
@@ -1586,7 +1612,8 @@ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit)
      if (iommu_is_dma_domain(domain)) {
          if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev))
              goto out_err;
-        dev->dma_ops = &iommu_dma_ops;
+        dev->dma_ops = dev_is_dma_sync_needed(dev) ?
+                &iommu_dma_ops : &iommu_nosync_dma_ops;

This doesn't work, because at this point we don't know whether a coherent device is still going to need SWIOTLB for DMA mask reasons or not.

Wait, no, now I've completely confused myself... :(

This probably *is* OK, since it's specifically iommu_dma_ops, not DMA ops in general, and we don't support IOMMUs with addressing limitations of their own. Plus, the other reasons for hooking into SWIOTLB here that have also muddled my brain have been for non-coherent stuff, so it still probably shouldn't make a difference.

Either way I do think it would be neatest to handle this higher up in the API (not to mention apparently easier to reason about...)

Thanks,
Robin.