Re: [GIT PULL] x86/misc for 6.5

From: Linus Torvalds
Date: Tue Jun 27 2023 - 16:11:39 EST


On Tue, 27 Jun 2023 at 04:00, Borislav Petkov <bp@xxxxxxxxx> wrote:
>
> - Improve csum_partial()'s performance

Honestly, looking at that patch, my reaction is "why did it get
unrolled in 64-byte chunks, if 40 bytes is the magic value"?

Particularly when there is then that "do a carry op each 32 bytes to
make 32-byte chunks independent and increase ILP". So even the 64-byte
case isn't *actually* doing a 64-byte unrolling; it's really doing two
32-byte unrollings in parallel.
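
To illustrate what I mean by "two 32-byte unrollings in parallel":
roughly, in plain C, that loop structure looks like the sketch below,
with unsigned __int128 standing in for the adc chains and the carry
accumulator (names made up, purely illustrative, not the kernel code):

#include <stdint.h>
#include <stddef.h>

/*
 * Fold a 128-bit running sum back down to 64 bits with an end-around
 * carry, the usual ones' complement trick.
 */
static uint64_t fold128(unsigned __int128 s)
{
        uint64_t lo = (uint64_t)s, hi = (uint64_t)(s >> 64);

        lo += hi;
        if (lo < hi)            /* the add wrapped, carry back in */
                lo++;
        return lo;
}

/*
 * Two independent 32-byte chains per 64-byte iteration, each with its
 * own accumulator so the carry chains don't serialize.  Handling of
 * the leftover (< 64 bytes) is omitted.
 */
static uint64_t sum64_two_chains(const uint64_t *p, size_t qwords)
{
        unsigned __int128 a = 0, b = 0;

        for (; qwords >= 8; p += 8, qwords -= 8) {
                a += p[0]; a += p[1]; a += p[2]; a += p[3];
                b += p[4]; b += p[5]; b += p[6]; b += p[7];
        }
        return fold128(a + b);
}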

So you have three "magic" values, and the only one that really matters
is likely the 40-byte one (that's the IPv6 header size, ie the hot case
that the comment above this code already points out).

Yes, yes, 64 bytes is the usual cacheline size, and is "traditional"
for unrolling. But there's nothing really magical about it here.

End result: wouldn't it have been nice to just do 40-byte chunks, and
make the 64-byte case of "two parallel 32-byte chunks" be two of those
40-byte chunks instead?
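
In plain C, the shape I'm thinking of is roughly this (again with
unsigned __int128 standing in for the adc chain, reusing fold128()
from the sketch above; the helper name is made up, and the real thing
is the inline-asm update_csum_40b() in the attached patch):

/* One 40-byte (five quadword) step. */
static unsigned __int128 add_40b(unsigned __int128 sum, const uint64_t *p)
{
        return sum + p[0] + p[1] + p[2] + p[3] + p[4];
}

/*
 * Main loop: 80 bytes per iteration as two independent 40-byte
 * accumulations, merged once at the end.  Tail (< 80 bytes) omitted.
 */
static uint64_t sum_by_40(const uint64_t *p, size_t len)
{
        unsigned __int128 a = 0, b = 0;

        while (len >= 80) {
                a = add_40b(a, p);
                b = add_40b(b, p + 5);
                p += 10;
                len -= 80;
        }
        return fold128(a + b);
}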

Something like the (ENTIRELY UNTESTED!) attached patch?

Again: this is *not* tested. I took a quick look at the generated
assembly, and it looked roughly like what I expected it to look like,
but it may be complete garbage.

I added a couple of "likely()" things just because it made the
generated asm look more natural (ie it followed the order of the
source code there); they are otherwise questionable annotations.

Finally: did I already mention that this is completely untested?

Linus
arch/x86/lib/csum-partial_64.c | 66 ++++++++++++++++++------------------------
1 file changed, 28 insertions(+), 38 deletions(-)

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index cea25ca8b8cf..9f1fa47ccbe0 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -33,6 +33,20 @@ static inline __wsum csum_tail(u64 temp64, int odd)
return (__force __wsum)result;
}

+static inline unsigned long update_csum_40b(unsigned long sum, const unsigned long m[5])
+{
+ asm volatile("addq %1,%0\n\t"
+ "adcq %2,%0\n\t"
+ "adcq %3,%0\n\t"
+ "adcq %4,%0\n\t"
+ "adcq %5,%0\n\t"
+ "adcq $0,%0"
+ :"=r" (sum)
+ :"m" (m[0]), "m" (m[1]), "m" (m[2]),
+ "m" (m[3]), "m" (m[4]), "0" (sum));
+ return sum;
+}
+
/*
* Do a checksum on an arbitrary memory area.
* Returns a 32bit checksum.
@@ -64,47 +78,23 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
* has noticeable negative affect on codegen for all other cases with
* minimal performance benefit here.
*/
- if (len == 40) {
- asm("addq 0*8(%[src]),%[res]\n\t"
- "adcq 1*8(%[src]),%[res]\n\t"
- "adcq 2*8(%[src]),%[res]\n\t"
- "adcq 3*8(%[src]),%[res]\n\t"
- "adcq 4*8(%[src]),%[res]\n\t"
- "adcq $0,%[res]"
- : [res] "+r"(temp64)
- : [src] "r"(buff), "m"(*(const char(*)[40])buff));
+ if (likely(len == 40)) {
+ temp64 = update_csum_40b(temp64, buff);
return csum_tail(temp64, odd);
}
- if (unlikely(len >= 64)) {
- /*
- * Extra accumulators for better ILP in the loop.
- */
- u64 tmp_accum, tmp_carries;

- asm("xorl %k[tmp_accum],%k[tmp_accum]\n\t"
- "xorl %k[tmp_carries],%k[tmp_carries]\n\t"
- "subl $64, %[len]\n\t"
- "1:\n\t"
- "addq 0*8(%[src]),%[res]\n\t"
- "adcq 1*8(%[src]),%[res]\n\t"
- "adcq 2*8(%[src]),%[res]\n\t"
- "adcq 3*8(%[src]),%[res]\n\t"
- "adcl $0,%k[tmp_carries]\n\t"
- "addq 4*8(%[src]),%[tmp_accum]\n\t"
- "adcq 5*8(%[src]),%[tmp_accum]\n\t"
- "adcq 6*8(%[src]),%[tmp_accum]\n\t"
- "adcq 7*8(%[src]),%[tmp_accum]\n\t"
- "adcl $0,%k[tmp_carries]\n\t"
- "addq $64, %[src]\n\t"
- "subl $64, %[len]\n\t"
- "jge 1b\n\t"
- "addq %[tmp_accum],%[res]\n\t"
- "adcq %[tmp_carries],%[res]\n\t"
- "adcq $0,%[res]"
- : [tmp_accum] "=&r"(tmp_accum),
- [tmp_carries] "=&r"(tmp_carries), [res] "+r"(temp64),
- [len] "+r"(len), [src] "+r"(buff)
- : "m"(*(const char *)buff));
+ /* Do two 40-byte chunks in parallel to get better ILP */
+ if (likely(len >= 80)) {
+ u64 temp64_2 = 0;
+ do {
+ temp64 = update_csum_40b(temp64, buff);
+ temp64_2 = update_csum_40b(temp64_2, buff + 40);
+ buff += 80;
+ len -= 80;
+ } while (len >= 80);
+ asm("addq %1,%0\n\t"
+ "adcq $0,%0"
+ :"=r" (temp64): "r" (temp64_2));
}

if (len & 32) {