MMX-based IP checksumming patch, 2.1.105, RFC

Ingo Molnar (mingo@hal.cobaltmicro.com)
Sat, 13 Jun 1998 04:52:58 -0700 (PDT)


the attached patch implements MMX-based checksumming for csum_partial().
It's functional but not complete yet. I'm posting it here because someone
out there might have ideas on how to avoid the quite expensive FPU_SAVE /
RESTORE operations ... the routine itself clobbers only two MMX
registers.
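
an untested sketch of the obvious direction -- spilling just the two
clobbered MMX registers instead of the whole 108-byte FPU area -- shows
why that alone is not enough: MMX aliases the x87 register stack, so an
FPU-using task's tag/status words still get clobbered, and the CR0.TS /
lazy-FPU bookkeeping is untouched. Which is exactly the open problem:

static inline void save_mmx_pair(unsigned long long s[2])
{
	/* spill only the two registers the checksum loop uses */
	__asm__ __volatile__(
		"movq %%mm1, %0\n\t"
		"movq %%mm3, %1"
		: "=m" (s[0]), "=m" (s[1]));
}

static inline void restore_mmx_pair(const unsigned long long s[2])
{
	__asm__ __volatile__(
		"movq %0, %%mm1\n\t"
		"movq %1, %%mm3"
		: : "m" (s[0]), "m" (s[1]));
}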

the core unrolled loop of the csum_partial() function does cold-cache
checksumming at 270 MB/sec on my PII box (the stock function does
210 MB/sec); in-cache performance is 540 MB/sec vs. 350 MB/sec. But for
most real-life MTUs the FPU save/restore operation eats up a considerable
share of the saved cycles ...
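
for the curious, a minimal userspace harness of roughly this shape gives
cycles/byte via rdtsc (hypothetical test code, not the exact harness
behind the numbers above; it assumes the routine is linked in as
csum_partial(), and the buffer size and iteration count are arbitrary):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

extern unsigned int csum_partial(const unsigned char *buff, int len,
				 unsigned int sum);

static inline unsigned long long rdtsc(void)
{
	unsigned long long t;
	__asm__ __volatile__("rdtsc" : "=A" (t));	/* edx:eax */
	return t;
}

int main(void)
{
	int len = 1496, i;			/* typical ethernet payload */
	unsigned char *buf = malloc(len);
	unsigned long long t0, t1;
	unsigned int sum = 0;

	memset(buf, 0xaa, len);
	t0 = rdtsc();
	for (i = 0; i < 100000; i++)		/* in-cache: reuse one buffer */
		sum = csum_partial(buf, len, sum);
	t1 = rdtsc();
	printf("%.2f cycles/byte (sum %08x)\n",
	       (double)(t1 - t0) / (100000.0 * len), sum);
	return 0;
}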

[btw, the unrolled part is quite nontrivial; it's not easy to do
checksumming with the carry-less MMX engine. I claim that this routine is
the fastest TCP checksumming routine physically possible on the PII ;)]
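
the trick, per 32-bit lane, in plain C (a scalar sketch; the real loop
additionally has to cope with pcmpgtd being a signed compare, which is
what the 0x80000000 fixup at the end of the loop is for):

static unsigned int lane_add(unsigned int sum, unsigned int word)
{
	unsigned int old = sum;

	/* paddd has no carry flag: detect wraparound by hand */
	sum += word;
	if (sum < old)		/* wrapped past 2^32 */
		sum++;		/* ones'-complement end-around carry */
	return sum;
}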

-- mingo

--- linux/arch/i386/lib/checksum.c.vanilla Fri Jun 12 03:45:43 1998
+++ linux/arch/i386/lib/checksum.c Fri Jun 12 05:35:34 1998
@@ -26,6 +26,132 @@
* computes a partial checksum, e.g. for TCP/UDP fragments
*/

+#define SAVE_FPU \
+ char fpu_save[108]; \
+ __asm__ __volatile__ ( "fsave %0\n" : "=m"(fpu_save[0]) )
+
+#define RESTORE_FPU \
+ __asm__ __volatile__ ( "frstor %0\n" : : "m"(fpu_save[0]) )
+
+unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
+{
+ SAVE_FPU;
+
+ __asm__("
+ testl $2, %%esi # Check alignment.
+ jz 2f # Jump if alignment is ok.
+ subl $2, %%ecx # Alignment uses up two bytes.
+ jae 1f # Jump if we had at least two bytes.
+ addl $2, %%ecx # ecx was < 2. Deal with it.
+ jmp 4f
+1: movw (%%esi), %%bx
+ addl $2, %%esi
+ addw %%bx, %%ax
+ adcl $0, %%eax
+2:
+ movl %%ecx, %%edx # save len, the tail is handled below
+ shrl $5, %%ecx # number of 32-byte blocks
+ jz 2f
+ testl %%esi, %%esi # clear the carry flag
+.align 16,0x90
+"
+#define STOCK 0 /* set to 1 to use the plain adcl loop instead of MMX */
+#if STOCK
+"
+1: movl (%%esi), %%ebx # stock adcl carry-chain loop, for comparison
+ adcl %%ebx, %%eax
+ movl 4(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl 8(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl 12(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl 16(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl 20(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl 24(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl 28(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ lea 32(%%esi), %%esi
+ dec %%ecx
+ jne 1b
+ adcl $0, %%eax
+"
+#else
+"
+ pxor %%mm1, %%mm1 # mm1: two independent 32-bit lane sums
+1:
+ movq %%mm1, %%mm3 # remember the old lane sums
+ paddd (%%esi), %%mm1 # add two 32-bit words, carry-less
+ pcmpgtd %%mm1, %%mm3 # lane = -1 where old > new (signed compare!)
+ psubd %%mm3, %%mm1 # subtract -1 == feed the carry back in
+
+ movq %%mm1, %%mm3 # same for the next 8 bytes
+ paddd 8(%%esi), %%mm1
+ pcmpgtd %%mm1, %%mm3
+ psubd %%mm3, %%mm1
+
+ movq %%mm1, %%mm3
+ paddd 16(%%esi), %%mm1
+ pcmpgtd %%mm1, %%mm3
+ psubd %%mm3, %%mm1
+
+ movq %%mm1, %%mm3
+ paddd 24(%%esi), %%mm1
+ pcmpgtd %%mm1, %%mm3
+ psubd %%mm3, %%mm1
+
+ leal 32(%%esi), %%esi
+ decl %%ecx
+ jne 1b
+
+ movd %%mm1, %%ecx # low 32-bit lane
+ addl $0x80000000, %%ecx # bias: pcmpgtd above compares signed
+ psrlq $32, %%mm1
+ movd %%mm1, %%ebx # high 32-bit lane
+ addl $0x80000000, %%ebx
+
+ clc # fold both lanes into eax; the two
+ adcl %%ecx, %%eax # biases plus the -1 below add up to
+ adcl %%ebx, %%eax # 2^32-1 == 0 in ones'-complement
+ adcl $-1, %%eax
+"
+#endif
+"
+2: movl %%edx, %%ecx
+ andl $0x1c, %%edx # up to 7 remaining dwords
+ je 4f
+ shrl $2, %%edx
+ testl %%esi, %%esi # clear the carry flag
+3: adcl (%%esi), %%eax
+ lea 4(%%esi), %%esi
+ dec %%edx
+ jne 3b
+ adcl $0, %%eax
+4: andl $3, %%ecx # 0..3 trailing bytes
+ jz 7f
+ cmpl $2, %%ecx
+ jb 5f # just one byte left
+ movw (%%esi),%%cx
+ leal 2(%%esi),%%esi
+ je 6f
+ shll $16,%%ecx # three bytes: word + trailing byte
+5: movb (%%esi),%%cl
+6: addl %%ecx,%%eax
+ adcl $0, %%eax
+7: "
+ : "=a"(sum)
+ : "0"(sum), "c"(len), "S"(buff)
+ : "bx", "cx", "dx", "si");
+
+ RESTORE_FPU;
+ return(sum);
+}
+
+#if 0
unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) {
/*
* Experiments with ethernet and slip connections show that buff
@@ -96,6 +222,7 @@
: "bx", "cx", "dx", "si");
return(sum);
}
+#endif

/*
* Copy from ds while checksumming, otherwise like csum_partial

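for anyone wanting to verify the patch: a plain-C reference of what
csum_partial() computes (an RFC 1071 ones'-complement sum; hypothetical
test code, not part of the patch). The raw 32-bit accumulators need not
match the asm bit for bit, only the values after csum_fold() have to
agree:

static unsigned int ref_csum_partial(const unsigned char *buff, int len,
				     unsigned int sum)
{
	unsigned long long acc = sum;

	while (len > 1) {
		acc += buff[0] | (buff[1] << 8);	/* little endian */
		buff += 2;
		len -= 2;
	}
	if (len > 0)
		acc += buff[0];				/* trailing byte */

	/* fold the 64-bit accumulator back into 32 bits */
	acc = (acc & 0xffffffff) + (acc >> 32);
	acc = (acc & 0xffffffff) + (acc >> 32);
	return acc;
}
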
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu