x86: should clear_user() have alternatives?

From: Hugh Dickins
Date: Tue Feb 08 2022 - 00:45:56 EST


Hi Boris,

I've noticed that clear_user() is slower than it need be:

dd if=/dev/zero of=/dev/null bs=1M count=1M
1099511627776 bytes (1.1 TB) copied, 45.9641 s, 23.9 GB/s
whereas with the hacked patch below
1099511627776 bytes (1.1 TB) copied, 33.4 s, 32.9 GB/s

That was on some Intel machine: IIRC an AMD went faster.

It's because clear_user() lacks alternatives, and still uses an
implementation that is suboptimal on today's CPUs; whereas clear_page()
and copy_user() do support alternatives.
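
For comparison, clear_page() in 5.16 selects among its variants at
patching time with alternative_call_2() - this is roughly what
arch/x86/include/asm/page_64.h does:

static inline void clear_page(void *page)
{
	alternative_call_2(clear_page_orig,
			   clear_page_rep, X86_FEATURE_REP_GOOD,
			   clear_page_erms, X86_FEATURE_ERMS,
			   "=D" (page),
			   "0" (page)
			   : "cc", "memory", "rax", "rcx");
}

clear_user() has no such patch point: every length goes through the
same movq/movb loop in usercopy_64.c, whatever the CPU offers.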

I came across this because reading a sparse file on tmpfs uses
copy_page_to_iter() from the ZERO_PAGE(), and I wanted to change that
to iov_iter_zero(), which sounds obviously faster - but is in fact slower.
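
The sort of change I mean, sketched against shmem_file_read_iter()
(an illustration of the idea, not the patch I actually tried):

	if (page) {
		ret = copy_page_to_iter(page, offset, nr, to);
	} else {
		/*
		 * Hole: zero the user buffer directly instead of
		 * copying from the ZERO_PAGE - for a user-backed
		 * iter this ends up in clear_user().
		 */
		ret = iov_iter_zero(nr, to);
	}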

I realize that dd'ing from /dev/zero to /dev/null, and sparse files on
tmpfs, are not prime candidates for optimization; and I've no idea how
much clear_user() normally gets used for long clears.

If I were capable of writing the asm with alternatives, and knew at
what length ERMS becomes advantageous when clearing, I would be sending
you a proper patch. As it is, I'm hoping to tempt someone else to do
the work! Or to reject it as too niche to bother with.
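
For the shape of it, I imagine the dispatch looking something like
this - entirely hypothetical: clear_user_erms() and clear_user_orig()
don't exist, and 64 is just a guess at the crossover length:

unsigned long __clear_user(void __user *addr, unsigned long size)
{
	/*
	 * FSRM (fast short REP MOVSB) is the nearest feature bit we
	 * have for guessing that short REP STOSB is also fast; without
	 * it, short clears are better off in a plain store loop.
	 */
	if (static_cpu_has(X86_FEATURE_FSRM) || size >= 64)
		return clear_user_erms(addr, size);	/* hypothetical */
	return clear_user_orig(addr, size);		/* hypothetical */
}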

Thanks,
Hugh

Hacked patch against 5.16 (5.17-rc is a little different there):

arch/x86/lib/copy_user_64.S | 26 ++++++++++++++++++++++++++
arch/x86/lib/usercopy_64.c | 36 +-----------------------------------
2 files changed, 27 insertions(+), 35 deletions(-)

--- 5.16/arch/x86/lib/copy_user_64.S 2022-01-09 14:55:34.000000000 -0800
+++ erms/arch/x86/lib/copy_user_64.S 2022-01-22 16:57:21.156968363 -0800
@@ -392,3 +392,29 @@ SYM_FUNC_START(__copy_user_nocache)
_ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy)
SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)
+
+/*
+ * Recent CPUs have added enhanced REP MOVSB/STOSB instructions.
+ * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
+ * Assume that's best for __clear_user(), until alternatives are provided
+ * (though it would be better to avoid REP STOSB for short clears, if no FSRM).
+ *
+ * Input:
+ * rdi destination
+ * rsi count
+ *
+ * Output:
+ * rax uncleared bytes, or 0 if successful.
+ */
+SYM_FUNC_START(__clear_user)
+ ASM_STAC
+ movq %rsi,%rcx /* count is unsigned long: don't truncate to 32 bits */
+ xorq %rax,%rax
+1: rep stosb
+2: movq %rcx,%rax /* on fault, %rcx holds the uncleared remainder */
+ ASM_CLAC
+ ret
+
+ _ASM_EXTABLE_UA(1b, 2b)
+SYM_FUNC_END(__clear_user)
+EXPORT_SYMBOL(__clear_user)
--- 5.16/arch/x86/lib/usercopy_64.c 2020-12-13 14:41:30.000000000 -0800
+++ erms/arch/x86/lib/usercopy_64.c 2022-01-20 18:40:04.125752474 -0800
@@ -13,43 +13,9 @@
/*
* Zero Userspace
*/
-
-unsigned long __clear_user(void __user *addr, unsigned long size)
-{
- long __d0;
- might_fault();
- /* no memory constraint because it doesn't change any memory gcc knows
- about */
- stac();
- asm volatile(
- " testq %[size8],%[size8]\n"
- " jz 4f\n"
- " .align 16\n"
- "0: movq $0,(%[dst])\n"
- " addq $8,%[dst]\n"
- " decl %%ecx ; jnz 0b\n"
- "4: movq %[size1],%%rcx\n"
- " testl %%ecx,%%ecx\n"
- " jz 2f\n"
- "1: movb $0,(%[dst])\n"
- " incq %[dst]\n"
- " decl %%ecx ; jnz 1b\n"
- "2:\n"
- ".section .fixup,\"ax\"\n"
- "3: lea 0(%[size1],%[size8],8),%[size8]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE_UA(0b, 3b)
- _ASM_EXTABLE_UA(1b, 2b)
- : [size8] "=&c"(size), [dst] "=&D" (__d0)
- : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
- clac();
- return size;
-}
-EXPORT_SYMBOL(__clear_user);
-
unsigned long clear_user(void __user *to, unsigned long n)
{
+ might_fault();
if (access_ok(to, n))
return __clear_user(to, n);
return n;