[PATCH net-next 01/17] net: Copy slab data for sendmsg(MSG_SPLICE_PAGES)

From: David Howells
Date: Fri Jun 16 2023 - 12:14:14 EST


If sendmsg() is passed MSG_SPLICE_PAGES and is given a buffer that contains
some data that's resident in the slab, copy it rather than returning EIO.
This can be made use of by a number of drivers in the kernel, including:
iwarp, ceph/rds, dlm, nvme, ocfs2, drbd. It could also be used by iscsi,
rxrpc, sunrpc, cifs and probably others.

skb_splice_from_iter() is given its own fragment allocator as
page_frag_alloc_align() can't be used because it does no locking to prevent
parallel callers from racing. alloc_skb_frag() uses a separate folio for
each cpu and locks to the cpu whilst allocating, re-enabling cpu migration
around the folio allocation itself.
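
As a rough illustration only (not part of this patch; hdr, hdr_len, skb and
ret are placeholders inside some sending routine), a driver wanting to copy
a slab-resident header into an skb fragment would use the new helpers much
as skb_splice_from_iter() below does:

        void *p = copy_skb_frag(hdr, hdr_len, GFP_KERNEL);

        if (!p)
                return -ENOMEM;
        ret = skb_append_pagefrags(skb, virt_to_page(p), offset_in_page(p),
                                   hdr_len, MAX_SKB_FRAGS);
        put_page(virt_to_page(p)); /* the skb's frag holds its own ref */
        if (ret < 0)
                return ret;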

This could instead allocate a whole page for each fragment to be copied, as
alloc_skb_with_frags() would do, but that would waste a lot of space (most
of the fragments look like they're going to be small).

This allows an entire message that consists of, say, a protocol header or
two, a number of pages of data and a protocol footer to be sent using a
single call to sock_sendmsg().
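
For illustration only (not part of this patch; sock, hdr, trailer,
data_page, the lengths and ret are placeholders), an in-kernel caller might
assemble such a message along these lines, with the slab-resident header
and trailer now being copied rather than provoking EIO:

        struct bio_vec bv[3];
        struct msghdr msg = {
                .msg_flags = MSG_SPLICE_PAGES,
        };

        bvec_set_virt(&bv[0], hdr, hdr_len);           /* kmalloc'd header */
        bvec_set_page(&bv[1], data_page, data_len, 0); /* page-backed payload */
        bvec_set_virt(&bv[2], trailer, trailer_len);   /* kmalloc'd footer */

        iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bv, 3,
                      hdr_len + data_len + trailer_len);
        ret = sock_sendmsg(sock, &msg);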

The callers could be made to copy the data into fragments before calling
sendmsg(), but that then penalises them if MSG_SPLICE_PAGES gets ignored.

Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
cc: Alexander Duyck <alexander.duyck@xxxxxxxxx>
cc: Eric Dumazet <edumazet@xxxxxxxxxx>
cc: "David S. Miller" <davem@xxxxxxxxxxxxx>
cc: David Ahern <dsahern@xxxxxxxxxx>
cc: Jakub Kicinski <kuba@xxxxxxxxxx>
cc: Paolo Abeni <pabeni@xxxxxxxxxx>
cc: Jens Axboe <axboe@xxxxxxxxx>
cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
cc: Menglong Dong <imagedong@xxxxxxxxxxx>
cc: netdev@xxxxxxxxxxxxxxx
---
include/linux/skbuff.h | 5 ++
net/core/skbuff.c | 172 ++++++++++++++++++++++++++++++++++++++++-
2 files changed, 174 insertions(+), 3 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 91ed66952580..0ba776cd9be8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -5037,6 +5037,11 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb)
#endif
}

+void *alloc_skb_frag(size_t fragsz, gfp_t gfp);
+void *copy_skb_frag(const void *s, size_t len, gfp_t gfp);
+ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
+ ssize_t maxsize, gfp_t gfp);
+
ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
ssize_t maxsize, gfp_t gfp);

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fee2b1c105fe..9bd8d6bf6c21 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -6755,6 +6755,146 @@ nodefer: __kfree_skb(skb);
smp_call_function_single_async(cpu, &sd->defer_csd);
}

+struct skb_splice_frag_cache {
+ struct folio *folio;
+ void *virt;
+ unsigned int offset;
+ /* we maintain a pagecount bias, so that we don't dirty the cache line
+ * containing page->_refcount every time we allocate a fragment.
+ */
+ unsigned int pagecnt_bias;
+ bool pfmemalloc;
+};
+
+static DEFINE_PER_CPU(struct skb_splice_frag_cache, skb_splice_frag_cache);
+
+/**
+ * alloc_skb_frag - Allocate a page fragment for use in a socket
+ * @fragsz: The size of fragment required
+ * @gfp: Allocation flags
+ */
+void *alloc_skb_frag(size_t fragsz, gfp_t gfp)
+{
+ struct skb_splice_frag_cache *cache;
+ struct folio *folio, *spare = NULL;
+ size_t offset, fsize;
+ void *p;
+
+ if (WARN_ON_ONCE(fragsz == 0))
+ fragsz = 1;
+
+ cache = get_cpu_ptr(&skb_splice_frag_cache);
+reload:
+ folio = cache->folio;
+ offset = cache->offset;
+try_again:
+ if (fragsz > offset)
+ goto insufficient_space;
+
+ /* Make the allocation. */
+ cache->pagecnt_bias--;
+ offset = ALIGN_DOWN(offset - fragsz, SMP_CACHE_BYTES);
+ cache->offset = offset;
+ p = cache->virt + offset;
+ put_cpu_ptr(&skb_splice_frag_cache);
+ if (spare)
+ folio_put(spare);
+ return p;
+
+insufficient_space:
+ /* See if we can refurbish the current folio. */
+ if (!folio || !folio_ref_sub_and_test(folio, cache->pagecnt_bias))
+ goto get_new_folio;
+ if (unlikely(cache->pfmemalloc)) {
+ __folio_put(folio);
+ goto get_new_folio;
+ }
+
+ fsize = folio_size(folio);
+ if (unlikely(fragsz > fsize))
+ goto frag_too_big;
+
+ /* OK, page count is 0, we can safely set it */
+ folio_set_count(folio, PAGE_FRAG_CACHE_MAX_SIZE + 1);
+
+ /* Reset page count bias and offset to start of new frag */
+ cache->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+ offset = fsize;
+ goto try_again;
+
+get_new_folio:
+ if (!spare) {
+ cache->folio = NULL;
+ put_cpu_ptr(&skb_splice_frag_cache);
+
+#if PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE
+ spare = folio_alloc(gfp | __GFP_NOWARN | __GFP_NORETRY |
+ __GFP_NOMEMALLOC,
+ PAGE_FRAG_CACHE_MAX_ORDER);
+ if (!spare)
+#endif
+ spare = folio_alloc(gfp, 0);
+ if (!spare)
+ return NULL;
+
+ cache = get_cpu_ptr(&skb_splice_frag_cache);
+ /* We may now be on a different cpu and/or someone else may
+ * have refilled it
+ */
+ cache->pfmemalloc = folio_is_pfmemalloc(spare);
+ if (cache->folio)
+ goto reload;
+ }
+
+ cache->folio = spare;
+ cache->virt = folio_address(spare);
+ folio = spare;
+ spare = NULL;
+
+ /* Even if we own the page, we do not use atomic_set(). This would
+ * break get_page_unless_zero() users.
+ */
+ folio_ref_add(folio, PAGE_FRAG_CACHE_MAX_SIZE);
+
+ /* Reset page count bias and offset to start of new frag */
+ cache->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+ offset = folio_size(folio);
+ goto try_again;
+
+frag_too_big:
+ /*
+ * The caller is trying to allocate a fragment bigger than the folio in
+ * the cache, so the cache can't satisfy the request; this may happen in
+ * low memory conditions. We don't release the cached folio because that
+ * could make memory pressure worse, so we simply return NULL here.
+ */
+ cache->offset = offset;
+ put_cpu_ptr(&skb_splice_frag_cache);
+ if (spare)
+ folio_put(spare);
+ return NULL;
+}
+EXPORT_SYMBOL(alloc_skb_frag);
+
+/**
+ * copy_skb_frag - Copy data into a page fragment.
+ * @s: The data to copy
+ * @len: The size of the data
+ * @gfp: Allocation flags
+ */
+void *copy_skb_frag(const void *s, size_t len, gfp_t gfp)
+{
+ void *p;
+
+ p = alloc_skb_frag(len, gfp);
+ if (!p)
+ return NULL;
+
+ return memcpy(p, s, len);
+}
+EXPORT_SYMBOL(copy_skb_frag);
+
static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
size_t offset, size_t len)
{
@@ -6808,17 +6948,43 @@ ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
break;
}

+ if (space == 0 &&
+ !skb_can_coalesce(skb, skb_shinfo(skb)->nr_frags,
+ pages[0], off)) {
+ iov_iter_revert(iter, len);
+ break;
+ }
+
i = 0;
do {
struct page *page = pages[i++];
size_t part = min_t(size_t, PAGE_SIZE - off, len);
-
- ret = -EIO;
- if (WARN_ON_ONCE(!sendpage_ok(page)))
+ bool put = false;
+
+ if (PageSlab(page)) {
+ const void *p;
+ void *q;
+
+ p = kmap_local_page(page);
+ q = copy_skb_frag(p + off, part, gfp);
+ kunmap_local(p);
+ if (!q) {
+ iov_iter_revert(iter, len);
+ ret = -ENOMEM;
+ goto out;
+ }
+ page = virt_to_page(q);
+ off = offset_in_page(q);
+ put = true;
+ } else if (WARN_ON_ONCE(!sendpage_ok(page))) {
+ ret = -EIO;
goto out;
+ }

ret = skb_append_pagefrags(skb, page, off, part,
frag_limit);
+ if (put)
+ put_page(page);
if (ret < 0) {
iov_iter_revert(iter, len);
goto out;