Re: x86/csum: Remove unnecessary odd handling

From: Linus Torvalds
Date: Wed Jun 28 2023 - 13:45:24 EST


On Wed, 28 Jun 2023 at 08:32, Noah Goldstein <goldstein.w.n@xxxxxxxxx> wrote:
>
> Linus, if you're planning a patch and want to just integrate the code
> here, I'm happy to drop this patch

No, that patch looks good to me.

In fact, I wasn't planning on integrating my patch at all. I literally
did it as an "I would have done it this way instead" exercise.

And while I am currently running with my patch in the kernel, I don't
even really know if it works and does the right thing. Maybe my use
doesn't even trigger csum_partial() at all. I did not do any testing
to confirm that "yes, I get the same checksum as a result".
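
(If somebody does want to sanity-check it, one cheap way would be to
compare against a dumb byte-at-a-time reference in user space -
something like the sketch below, where ref_csum16() is just a name
made up for the example. For a buffer at an even address and a zero
initial sum, folding csum_partial()'s 32-bit result down to 16 bits
with end-around carry should come out the same as this.)

static unsigned short ref_csum16(const unsigned char *p, int len)
{
	unsigned long sum = 0;

	while (len > 1) {
		sum += p[0] | (p[1] << 8);	/* 16-bit LE words, as on x86 */
		p += 2;
		len -= 2;
	}
	if (len)				/* trailing odd byte */
		sum += p[0];
	while (sum >> 16)			/* fold with end-around carry */
		sum = (sum & 0xffff) + (sum >> 16);
	return sum;
}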

So

(a) removing the pointless one-byte alignment looks good to me.

(b) I'd actually hope that somebody who _cares_ about this path and
has put some real work into it (as opposed to my "superficial
dabbling") would look at my patch and either go "yeah, not worth it",
or "looks good, I'll take it".

and I'm including that final patch of mine here again in case there
was any confusion with the earlier versions (there were at least two
known-broken versions I posted).

*If* somebody likes it, and verifies that the checksum result is
correct, feel free to do anything with that patch, including adding my
signed-off-by for it (or taking the credit all for yourself -
Mwahahahahaahaa!)

Linus
From 24a1d533d96074220927d844a619a54419b69b81 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Date: Tue, 27 Jun 2023 13:55:32 -0700
Subject: [PATCH] Silly csum improvement. Maybe.

---
arch/x86/lib/csum-partial_64.c | 83 ++++++++++++++++------------------
1 file changed, 38 insertions(+), 45 deletions(-)

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index cea25ca8b8cf..d96e1da6604a 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -33,6 +33,20 @@ static inline __wsum csum_tail(u64 temp64, int odd)
 	return (__force __wsum)result;
 }
 
+static inline unsigned long update_csum_40b(unsigned long sum, const unsigned long m[5])
+{
+	asm("addq %1,%0\n\t"
+	    "adcq %2,%0\n\t"
+	    "adcq %3,%0\n\t"
+	    "adcq %4,%0\n\t"
+	    "adcq %5,%0\n\t"
+	    "adcq $0,%0"
+	    :"+r" (sum)
+	    :"m" (m[0]), "m" (m[1]), "m" (m[2]),
+	     "m" (m[3]), "m" (m[4]));
+	return sum;
+}
+
 /*
  * Do a checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
@@ -59,52 +73,31 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 		buff++;
 	}
 
-	/*
-	 * len == 40 is the hot case due to IPv6 headers, but annotating it likely()
-	 * has noticeable negative affect on codegen for all other cases with
-	 * minimal performance benefit here.
-	 */
-	if (len == 40) {
-		asm("addq 0*8(%[src]),%[res]\n\t"
-		    "adcq 1*8(%[src]),%[res]\n\t"
-		    "adcq 2*8(%[src]),%[res]\n\t"
-		    "adcq 3*8(%[src]),%[res]\n\t"
-		    "adcq 4*8(%[src]),%[res]\n\t"
-		    "adcq $0,%[res]"
-		    : [res] "+r"(temp64)
-		    : [src] "r"(buff), "m"(*(const char(*)[40])buff));
-		return csum_tail(temp64, odd);
-	}
-	if (unlikely(len >= 64)) {
-		/*
-		 * Extra accumulators for better ILP in the loop.
-		 */
-		u64 tmp_accum, tmp_carries;
+	/* Do two 40-byte chunks in parallel to get better ILP */
+	if (likely(len >= 80)) {
+		u64 temp64_2 = 0;
+		do {
+			temp64 = update_csum_40b(temp64, buff);
+			temp64_2 = update_csum_40b(temp64_2, buff + 40);
+			buff += 80;
+			len -= 80;
+		} while (len >= 80);
 
-		asm("xorl %k[tmp_accum],%k[tmp_accum]\n\t"
-		    "xorl %k[tmp_carries],%k[tmp_carries]\n\t"
-		    "subl $64, %[len]\n\t"
-		    "1:\n\t"
-		    "addq 0*8(%[src]),%[res]\n\t"
-		    "adcq 1*8(%[src]),%[res]\n\t"
-		    "adcq 2*8(%[src]),%[res]\n\t"
-		    "adcq 3*8(%[src]),%[res]\n\t"
-		    "adcl $0,%k[tmp_carries]\n\t"
-		    "addq 4*8(%[src]),%[tmp_accum]\n\t"
-		    "adcq 5*8(%[src]),%[tmp_accum]\n\t"
-		    "adcq 6*8(%[src]),%[tmp_accum]\n\t"
-		    "adcq 7*8(%[src]),%[tmp_accum]\n\t"
-		    "adcl $0,%k[tmp_carries]\n\t"
-		    "addq $64, %[src]\n\t"
-		    "subl $64, %[len]\n\t"
-		    "jge 1b\n\t"
-		    "addq %[tmp_accum],%[res]\n\t"
-		    "adcq %[tmp_carries],%[res]\n\t"
-		    "adcq $0,%[res]"
-		    : [tmp_accum] "=&r"(tmp_accum),
-		      [tmp_carries] "=&r"(tmp_carries), [res] "+r"(temp64),
-		      [len] "+r"(len), [src] "+r"(buff)
-		    : "m"(*(const char *)buff));
+		asm("addq %1,%0\n\t"
+		    "adcq $0,%0"
+		    :"+r" (temp64): "r" (temp64_2));
+	}
+
+	/*
+	 * len == 40 is the hot case due to IPv6 headers, so return
+	 * early for that exact case without checking the tail bytes.
+	 */
+	if (len >= 40) {
+		temp64 = update_csum_40b(temp64, buff);
+		len -= 40;
+		if (!len)
+			return csum_tail(temp64, odd);
+		buff += 40;
 	}
 
 	if (len & 32) {
--
2.41.0.203.ga4f2cd32bb.dirty
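
(For anyone reading along: update_csum_40b() above is just five 64-bit
adds with the carries folded back in. Purely as an illustration - this
is not part of the patch, and update_csum_40b_ref() is a made-up name -
the same arithmetic in plain C with explicit carry tracking would look
roughly like this:)

static unsigned long update_csum_40b_ref(unsigned long sum, const unsigned long m[5])
{
	unsigned long carry = 0;
	int i;

	for (i = 0; i < 5; i++) {
		/* sum += m[i] + carry, keeping the carry-out for the next word */
		unsigned long c1 = __builtin_add_overflow(sum, m[i], &sum);
		unsigned long c2 = __builtin_add_overflow(sum, carry, &sum);

		carry = c1 | c2;	/* they can never both be set */
	}
	return sum + carry;		/* the trailing "adcq $0" */
}

The add/adc chain in the asm version does exactly this, but with one
flags-carried add per word instead of the explicit carry bookkeeping.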