Re: [GIT PULL] x86/misc for 6.5

From: Linus Torvalds
Date: Tue Jun 27 2023 - 16:49:40 EST


On Tue, 27 Jun 2023 at 13:38, Borislav Petkov <bp@xxxxxxxxx> wrote:
>
> And there's a third kind who relax by the pool with a nice drink,
> *while* playing around with inline asm. ;-P

That explains a lot.

> Btw, I'll send you a new version of this pull request with this patch
> dropped to let folks experiment with it more.

Oh, I already merged it. I don't hate the change, I just looked at it
and went "I would have done that differently" and started playing
around with it.

There's nothing hugely *wrong* with the code I merged, but I do think
that it did too much inside the inline asm (i.e. not only looping inside
the asm, but also initializing values that could have - and should have -
just been given as inputs to the asm).
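
A hypothetical before/after of what I mean, with made-up function
names - this is only an illustration, not the merged code and not the
patch below:

	/* Variant A: the accumulator is zeroed inside the asm */
	static inline unsigned long csum_one_inside(unsigned long val)
	{
		unsigned long acc;

		asm("xorl %k[acc],%k[acc]\n\t"	/* the zero is invisible to the compiler */
		    "addq %[v],%[acc]\n\t"
		    "adcq $0,%[acc]"
		    : [acc] "=&r" (acc)
		    : [v] "r" (val));
		return acc;
	}

	/* Variant B: the zero is a plain C value the compiler can see and schedule */
	static inline unsigned long csum_one_input(unsigned long val)
	{
		unsigned long acc = 0;	/* initialized in C, passed in via "+r" */

		asm("addq %[v],%[acc]\n\t"
		    "adcq $0,%[acc]"
		    : [acc] "+r" (acc)
		    : [v] "r" (val));
		return acc;
	}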

And then there's the whole question of "why have two different versions
for 40-byte and 64-byte areas, when you _could_ just do it with a single
40-byte one that you then also just unroll".
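
The reason a single helper can just be chained like that is that the
one's-complement sum is associative and commutative (RFC 1071 spells
this out), and the trailing "adcq $0" folds the carry-out back into
the sum, so partial sums over separate chunks can be accumulated
independently and merged at the end. A hypothetical plain-C version of
that merge step, just to show the idea - the patch below does the same
thing with an addq/adcq pair:

	/* Hypothetical illustration, not part of the patch */
	static inline unsigned long csum_merge_partials(unsigned long a, unsigned long b)
	{
		unsigned long sum = a + b;

		/* end-around carry: fold the carry-out back into the sum */
		return sum + (sum < a);
	}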

So I _think_ my version is nicer and shorter - assuming it works and
there are no other bugs than the one I already noticed - but I don't
think it's a huge deal.

Anyway, before I throw my patch away, I'll just post it with the
trivial fixes to use "+r", and with the "volatile" removed (I add
"volatile" to asms by habit, but this one really isn't volatile).

I just checked that both gcc and clang seem to be happy with it, but
that's the only testing this patch has gotten: it compiles for me.

Do with it what you will.

Linus
arch/x86/lib/csum-partial_64.c | 66 ++++++++++++++++++------------------------
1 file changed, 28 insertions(+), 38 deletions(-)

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index cea25ca8b8cf..61e8c3d97c04 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -33,6 +33,20 @@ static inline __wsum csum_tail(u64 temp64, int odd)
 	return (__force __wsum)result;
 }
 
+static inline unsigned long update_csum_40b(unsigned long sum, const unsigned long m[5])
+{
+	asm("addq %1,%0\n\t"
+	    "adcq %2,%0\n\t"
+	    "adcq %3,%0\n\t"
+	    "adcq %4,%0\n\t"
+	    "adcq %5,%0\n\t"
+	    "adcq $0,%0"
+	    :"+r" (sum)
+	    :"m" (m[0]), "m" (m[1]), "m" (m[2]),
+	     "m" (m[3]), "m" (m[4]));
+	return sum;
+}
+
 /*
  * Do a checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
@@ -64,47 +78,23 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 	 * has noticeable negative affect on codegen for all other cases with
 	 * minimal performance benefit here.
 	 */
-	if (len == 40) {
-		asm("addq 0*8(%[src]),%[res]\n\t"
-		    "adcq 1*8(%[src]),%[res]\n\t"
-		    "adcq 2*8(%[src]),%[res]\n\t"
-		    "adcq 3*8(%[src]),%[res]\n\t"
-		    "adcq 4*8(%[src]),%[res]\n\t"
-		    "adcq $0,%[res]"
-		    : [res] "+r"(temp64)
-		    : [src] "r"(buff), "m"(*(const char(*)[40])buff));
+	if (likely(len == 40)) {
+		temp64 = update_csum_40b(temp64, buff);
 		return csum_tail(temp64, odd);
 	}
-	if (unlikely(len >= 64)) {
-		/*
-		 * Extra accumulators for better ILP in the loop.
-		 */
-		u64 tmp_accum, tmp_carries;
 
-		asm("xorl %k[tmp_accum],%k[tmp_accum]\n\t"
-		    "xorl %k[tmp_carries],%k[tmp_carries]\n\t"
-		    "subl $64, %[len]\n\t"
-		    "1:\n\t"
-		    "addq 0*8(%[src]),%[res]\n\t"
-		    "adcq 1*8(%[src]),%[res]\n\t"
-		    "adcq 2*8(%[src]),%[res]\n\t"
-		    "adcq 3*8(%[src]),%[res]\n\t"
-		    "adcl $0,%k[tmp_carries]\n\t"
-		    "addq 4*8(%[src]),%[tmp_accum]\n\t"
-		    "adcq 5*8(%[src]),%[tmp_accum]\n\t"
-		    "adcq 6*8(%[src]),%[tmp_accum]\n\t"
-		    "adcq 7*8(%[src]),%[tmp_accum]\n\t"
-		    "adcl $0,%k[tmp_carries]\n\t"
-		    "addq $64, %[src]\n\t"
-		    "subl $64, %[len]\n\t"
-		    "jge 1b\n\t"
-		    "addq %[tmp_accum],%[res]\n\t"
-		    "adcq %[tmp_carries],%[res]\n\t"
-		    "adcq $0,%[res]"
-		    : [tmp_accum] "=&r"(tmp_accum),
-		      [tmp_carries] "=&r"(tmp_carries), [res] "+r"(temp64),
-		      [len] "+r"(len), [src] "+r"(buff)
-		    : "m"(*(const char *)buff));
+	/* Do two 40-byte chunks in parallel to get better ILP */
+	if (likely(len >= 80)) {
+		u64 temp64_2 = 0;
+		do {
+			temp64 = update_csum_40b(temp64, buff);
+			temp64_2 = update_csum_40b(temp64_2, buff + 40);
+			buff += 80;
+			len -= 80;
+		} while (len >= 80);
+		asm("addq %1,%0\n\t"
+		    "adcq $0,%0"
+		    :"+r" (temp64): "r" (temp64_2));
 	}
 
 	if (len & 32) {