Re: [linus:master] [iov_iter] c9eec08bac: vm-scalability.throughput -16.9% regression

From: David Howells
Date: Mon Nov 20 2023 - 08:33:10 EST


Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:

> So I don't think we should use either of these benchmarks as a "we
> need to optimize for *this*", but it is another example of how much
> memcpy() does matter. Even if the end result is then "but different
> microarchitectures react so differently that we can't please
> everybody".

So what, if anything, should I change? Should I make it call __memcpy
directly, or should we just leave it to the compiler? Either way, I would
prefer to keep memcpy_from_iter() and memcpy_to_iter() as __always_inline,
both to eliminate the function pointer call we otherwise end up with and to
let the return value (which is always 0 in this case) be optimised away.
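
Roughly, this is the shape of the thing (a simplified, self-contained
sketch, not the actual lib/iov_iter.c code; copy_step() and iterate() are
made-up stand-ins for the real step and iterate_*() helpers):

	#include <stddef.h>
	#include <string.h>

	#define __always_inline inline __attribute__((__always_inline__))

	/* Stand-in for memcpy_to_iter(): trivial step, constant return. */
	static __always_inline
	size_t copy_step(void *dst, size_t progress, size_t len, void *src)
	{
		memcpy(dst, (char *)src + progress, len);
		return 0;	/* always 0; folds away once inlined */
	}

	/* Stand-in for the iterate_*() helpers, taking the step as a pointer. */
	static __always_inline
	size_t iterate(void *dst, void *src, size_t len,
		       size_t (*step)(void *, size_t, size_t, void *))
	{
		/*
		 * Because both functions are __always_inline and 'step' is a
		 * compile-time constant at every call site, gcc/clang can
		 * inline the step here rather than emit an indirect call.
		 */
		return step(dst, 0, len, src);
	}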

How about something along the attached lines? (With the inline asm
appropriately pushed out to an arch header file.)
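
For example, something like this in an x86 header (a hypothetical helper;
the name memcpy_fsrm and its location are made up for illustration, with the
asm lifted from the patch below; needs asm/alternative.h, asm/cpufeatures.h
and asm/asm.h):

	static __always_inline void memcpy_fsrm(void *to, const void *from,
						size_t len)
	{
		size_t len2 = len;

		/*
		 * The count is carried in both %rdx and %rcx: 'rep movsb'
		 * takes it in %rcx, whilst memcpy_orig expects it in %rdx.
		 */
		asm volatile(
			ALTERNATIVE("rep movsb",
				    "call memcpy_orig",
				    ALT_NOT(X86_FEATURE_FSRM))
			: "+D" (to), "+S" (from), "+d" (len), "+c" (len2),
			  ASM_CALL_CONSTRAINT
			:: "memory", "rax", "r8", "r9", "r10", "r11");
	}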

On the other hand, it might be better to have memcpy_to/from_iter() just
call __memcpy(): it doesn't seem to make much difference to the time taken,
and the inline function can still return a constant 0 that the compiler can
optimise away.
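
That is, something like this (untested; just the current helper with
memcpy() swapped for __memcpy()):

	static __always_inline
	size_t memcpy_to_iter(void *iter_to, size_t progress,
			      size_t len, void *from, void *priv2)
	{
		__memcpy(iter_to, from + progress, len);
		return 0;	/* constant, so it folds away after inlining */
	}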

David
---

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 0ae2e1712e2e..7354982dc433 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -43,7 +43,7 @@ EXPORT_SYMBOL(__memcpy)
SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

-SYM_FUNC_START_LOCAL(memcpy_orig)
+SYM_TYPED_FUNC_START(memcpy_orig)
movq %rdi, %rax

cmpq $0x20, %rdx
@@ -169,4 +169,4 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
.Lend:
RET
SYM_FUNC_END(memcpy_orig)
-
+EXPORT_SYMBOL(memcpy_orig)
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index de7d11cf4c63..de73edb9ffcc 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -62,7 +62,17 @@ static __always_inline
size_t memcpy_to_iter(void *iter_to, size_t progress,
size_t len, void *from, void *priv2)
{
- memcpy(iter_to, from + progress, len);
+ size_t len2 = len;
+ from += progress;
+ /*
+ * If the CPU has the FSRM feature, use 'rep movsb'.
+ * Otherwise, call memcpy_orig.
+ */
+ asm volatile(
+ ALTERNATIVE("rep movsb",
+ "call memcpy_orig", ALT_NOT(X86_FEATURE_FSRM))
+ :"+D" (iter_to), "+S" (from), "+d" (len), "+c"(len2), ASM_CALL_CONSTRAINT
+ :: "memory", "rax", "r8", "r9", "r10", "r11");
return 0;
}

@@ -70,7 +80,18 @@ static __always_inline
size_t memcpy_from_iter(void *iter_from, size_t progress,
size_t len, void *to, void *priv2)
{
- memcpy(to + progress, iter_from, len);
+ size_t len2 = len;
+ to += progress;
+ /*
+ * If the CPU has the FSRM feature, use 'rep movsb'.
+ * Otherwise, call memcpy_orig.
+ */
+ asm volatile(
+ ALTERNATIVE("rep movsb",
+ "call memcpy_orig",
+ ALT_NOT(X86_FEATURE_FSRM))
+ :"+D" (to), "+S" (iter_from), "+d" (len), "+c" (len2), ASM_CALL_CONSTRAINT
+ :: "memory", "rax", "r8", "r9", "r10", "r11");
return 0;
}