Re: [GIT PULL] x86/misc for 6.5

From: Linus Torvalds
Date: Tue Jun 27 2023 - 17:45:22 EST


On Tue, 27 Jun 2023 at 13:49, Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
>
> Anyway, before I throw my patch away, I'll just post it with the
> trivial fixes to use "+r", and with the "volatile" removed (I add
> "volatile" to asms by habit, but this one really isn't volatile).

Oh, never mind. I was about to throw it away, and then I realized that
the code *after* the loop relied on the remaining length having been
reduced to below 64 bytes, and only checked for 32/16/8/4-byte ranges.

And my change to make it loop over 80 bytes at a time had made that
no longer true.
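
To make the breakage concrete, here is roughly the shape of that tail
as a plain-C sketch (the helper names "addc" and "csum_tail_sketch"
are made up for illustration; the real thing is unrolled adcq chains,
it also folds in the final 1-3 odd bytes, and I'm ignoring
alignment/aliasing pedantry):

/*
 * Plain-C model of the 32/16/8/4 tail handling after the main loop.
 * Note that only those low bits of len are ever tested.
 */
static unsigned long addc(unsigned long sum, unsigned long val)
{
	sum += val;
	return sum + (sum < val);	/* end-around carry */
}

static unsigned long csum_tail_sketch(unsigned long sum,
				      const unsigned char *buff, int len)
{
	if (len & 32) {
		for (int i = 0; i < 4; i++)
			sum = addc(sum, ((const unsigned long *)buff)[i]);
		buff += 32;
	}
	if (len & 16) {
		sum = addc(sum, ((const unsigned long *)buff)[0]);
		sum = addc(sum, ((const unsigned long *)buff)[1]);
		buff += 16;
	}
	if (len & 8) {
		sum = addc(sum, *(const unsigned long *)buff);
		buff += 8;
	}
	if (len & 4) {
		sum = addc(sum, *(const unsigned int *)buff);
		buff += 4;
	}
	/* anything the bit tests above don't cover is never added in */
	return sum;
}

Since only the 32/16/8/4 bits of len are ever looked at, a loop that
eats 80 bytes at a time can leave up to 79 bytes behind, and a
64..79-byte leftover would have 64 bytes silently left out of the sum.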

But now I'm committed, so I decided to fix that too and just
re-organize the code to get all the cases right.
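
Concretely, with the re-organized code below: the main loop only runs
while at least 80 bytes remain, so it can leave 0..79 bytes behind. A
single 40-byte step then covers the IPv6 header case (exactly 40 bytes
returns early) and guarantees that the old 32/16/8/4 tail only ever
sees less than 40 bytes. For example, len = 196 becomes two 80-byte
rounds plus a 32+4 tail, and len = 125 becomes one 80-byte round, one
40-byte step, 4 bytes, and one trailing byte.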

And now I'm going to actually boot-test the end result too. Because
life is too short to spend all my time _just_ with merging.

Linus
From 7ac1b9952815d04d1906cc6dd01bcf742d6a6893 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Date: Tue, 27 Jun 2023 13:55:32 -0700
Subject: [PATCH] Silly csum improvement. Maybe.

---
arch/x86/lib/csum-partial_64.c | 83 ++++++++++++++++------------------
1 file changed, 38 insertions(+), 45 deletions(-)

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index cea25ca8b8cf..d96e1da6604a 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -33,6 +33,20 @@ static inline __wsum csum_tail(u64 temp64, int odd)
return (__force __wsum)result;
}

+static inline unsigned long update_csum_40b(unsigned long sum, const unsigned long m[5])
+{
+	asm("addq %1,%0\n\t"
+	    "adcq %2,%0\n\t"
+	    "adcq %3,%0\n\t"
+	    "adcq %4,%0\n\t"
+	    "adcq %5,%0\n\t"
+	    "adcq $0,%0"
+		:"+r" (sum)
+		:"m" (m[0]), "m" (m[1]), "m" (m[2]),
+		 "m" (m[3]), "m" (m[4]));
+	return sum;
+}
+
/*
* Do a checksum on an arbitrary memory area.
* Returns a 32bit checksum.
@@ -59,52 +73,31 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
buff++;
}

-	/*
-	 * len == 40 is the hot case due to IPv6 headers, but annotating it likely()
-	 * has noticeable negative affect on codegen for all other cases with
-	 * minimal performance benefit here.
-	 */
-	if (len == 40) {
-		asm("addq 0*8(%[src]),%[res]\n\t"
-		    "adcq 1*8(%[src]),%[res]\n\t"
-		    "adcq 2*8(%[src]),%[res]\n\t"
-		    "adcq 3*8(%[src]),%[res]\n\t"
-		    "adcq 4*8(%[src]),%[res]\n\t"
-		    "adcq $0,%[res]"
-		    : [res] "+r"(temp64)
-		    : [src] "r"(buff), "m"(*(const char(*)[40])buff));
-		return csum_tail(temp64, odd);
-	}
-	if (unlikely(len >= 64)) {
-		/*
-		 * Extra accumulators for better ILP in the loop.
-		 */
-		u64 tmp_accum, tmp_carries;
+	/* Do two 40-byte chunks in parallel to get better ILP */
+	if (likely(len >= 80)) {
+		u64 temp64_2 = 0;
+		do {
+			temp64 = update_csum_40b(temp64, buff);
+			temp64_2 = update_csum_40b(temp64_2, buff + 40);
+			buff += 80;
+			len -= 80;
+		} while (len >= 80);

- asm("xorl %k[tmp_accum],%k[tmp_accum]\n\t"
- "xorl %k[tmp_carries],%k[tmp_carries]\n\t"
- "subl $64, %[len]\n\t"
- "1:\n\t"
- "addq 0*8(%[src]),%[res]\n\t"
- "adcq 1*8(%[src]),%[res]\n\t"
- "adcq 2*8(%[src]),%[res]\n\t"
- "adcq 3*8(%[src]),%[res]\n\t"
- "adcl $0,%k[tmp_carries]\n\t"
- "addq 4*8(%[src]),%[tmp_accum]\n\t"
- "adcq 5*8(%[src]),%[tmp_accum]\n\t"
- "adcq 6*8(%[src]),%[tmp_accum]\n\t"
- "adcq 7*8(%[src]),%[tmp_accum]\n\t"
- "adcl $0,%k[tmp_carries]\n\t"
- "addq $64, %[src]\n\t"
- "subl $64, %[len]\n\t"
- "jge 1b\n\t"
- "addq %[tmp_accum],%[res]\n\t"
- "adcq %[tmp_carries],%[res]\n\t"
- "adcq $0,%[res]"
- : [tmp_accum] "=&r"(tmp_accum),
- [tmp_carries] "=&r"(tmp_carries), [res] "+r"(temp64),
- [len] "+r"(len), [src] "+r"(buff)
- : "m"(*(const char *)buff));
+ asm("addq %1,%0\n\t"
+ "adcq $0,%0"
+ :"+r" (temp64): "r" (temp64_2));
+ }
+
+ /*
+ * len == 40 is the hot case due to IPv6 headers, so return
+ * early for that exact case without checking the tail bytes.
+ */
+ if (len >= 40) {
+ temp64 = update_csum_40b(temp64, buff);
+ len -= 40;
+ if (!len)
+ return csum_tail(temp64, odd);
+ buff += 40;
}

if (len & 32) {
--
2.41.0.203.ga4f2cd32bb.dirty