Re: [linus:master] [iov_iter] c9eec08bac: vm-scalability.throughput -16.9% regression

From: David Howells
Date: Thu Nov 16 2023 - 16:14:01 EST


Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:

> You could try building the kernel without mitigations (or booting with them
> off, which isn't quite as good) to verify.

Okay, I disabled RETPOLINE, which seems like it should be the important one.
With inlined memcpy:

iov_kunit_benchmark_bvec: avg 3160 uS, stddev 17 uS
iov_kunit_benchmark_bvec_split: avg 3380 uS, stddev 29 uS
iov_kunit_benchmark_kvec: avg 2940 uS, stddev 978 uS
iov_kunit_benchmark_xarray: avg 3599 uS, stddev 8 uS
iov_kunit_benchmark_xarray_to_bvec: avg 3964 uS, stddev 16 uS

Directly calling __memcpy():

iov_kunit_benchmark_bvec: avg 9947 uS, stddev 61 uS
iov_kunit_benchmark_bvec_split: avg 9790 uS, stddev 13 uS
iov_kunit_benchmark_kvec: avg 9565 uS, stddev 758 uS
iov_kunit_benchmark_xarray: avg 10498 uS, stddev 24 uS
iov_kunit_benchmark_xarray_to_bvec: avg 10459 uS, stddev 188 uS

I created a duplicate of __memcpy() (called __movsb_memcpy()) without the
"alternative" statement and made memcpy_to_iter() and memcpy_from_iter() call
that:

iov_kunit_benchmark_bvec: avg 3177 uS, stddev 7 uS
iov_kunit_benchmark_bvec_split: avg 3393 uS, stddev 10 uS
iov_kunit_benchmark_kvec: avg 2813 uS, stddev 385 uS
iov_kunit_benchmark_xarray: avg 3651 uS, stddev 7 uS
iov_kunit_benchmark_xarray_to_bvec: avg 3946 uS, stddev 8 uS

And then I made memcpy_to_iter() and memcpy_from_iter() call memcpy_orig()
directly:

iov_kunit_benchmark_bvec: avg 9942 uS, stddev 17 uS
iov_kunit_benchmark_bvec_split: avg 9802 uS, stddev 29 uS
iov_kunit_benchmark_kvec: avg 9547 uS, stddev 598 uS
iov_kunit_benchmark_xarray: avg 10486 uS, stddev 13 uS
iov_kunit_benchmark_xarray_to_bvec: avg 10438 uS, stddev 12 uS

(See attached patch)

David
---
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 0ae2e1712e2e..df1ebbe345e2 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -43,7 +43,7 @@ EXPORT_SYMBOL(__memcpy)
SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

-SYM_FUNC_START_LOCAL(memcpy_orig)
+SYM_TYPED_FUNC_START(memcpy_orig)
movq %rdi, %rax

cmpq $0x20, %rdx
@@ -169,4 +169,12 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
.Lend:
RET
SYM_FUNC_END(memcpy_orig)
+EXPORT_SYMBOL(memcpy_orig)

+SYM_TYPED_FUNC_START(__movsb_memcpy)
+ movq %rdi, %rax
+ movq %rdx, %rcx
+ rep movsb
+ RET
+SYM_FUNC_END(__movsb_memcpy)
+EXPORT_SYMBOL(__movsb_memcpy)
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index de7d11cf4c63..620cd6356a5b 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -58,11 +58,18 @@ size_t copy_from_user_iter(void __user *iter_from, size_t progress,
return res;
}

+extern void *__movsb_memcpy(void *, const void *, size_t);
+extern void *memcpy_orig(void *, const void *, size_t);
+
static __always_inline
size_t memcpy_to_iter(void *iter_to, size_t progress,
size_t len, void *from, void *priv2)
{
- memcpy(iter_to, from + progress, len);
+#if 0
+ __movsb_memcpy(iter_to, from + progress, len);
+#else
+ memcpy_orig(iter_to, from + progress, len);
+#endif
return 0;
}

@@ -70,7 +77,11 @@ static __always_inline
size_t memcpy_from_iter(void *iter_from, size_t progress,
size_t len, void *to, void *priv2)
{
- memcpy(to + progress, iter_from, len);
+#if 0
+ __movsb_memcpy(to + progress, iter_from, len);
+#else
+ memcpy_orig(to + progress, iter_from, len);
+#endif
return 0;
}