[PATCH] ip checksum fixes and optimizations

Artur Skawina (skawina@geocities.com)
Fri, 10 Sep 1999 05:41:03 +0200


Some time ago I looked at the IP checksum routines and found a lot of
places that could be improved. This patch contains the non-intrusive,
self-contained parts. Among other things, the checksum routines were
sometimes called with ridiculously small args, already-known information
was discarded, the asm code was not fully optimized for modern
processors, etc.
I just ported this to 2.3.17, and as I may not have time to work
on this in the next few days I'm posting it as is, only to get some
feedback. Thoughts?
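
To make the "ridiculously small args" point concrete: a struct udphdr is
8 bytes, so a csum_partial() call spends more time on call and loop
overhead than on the actual summing. Here is a rough user-space sketch
(not part of the patch; the helper names are made up) showing that the
generic word loop and a fixed two-add sequence give the same folded
checksum, which is what makes the constant-size inlining safe:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Generic 16-bit word loop -- roughly what a csum_partial() call does. */
static uint32_t csum_loop(const unsigned char *buf, int len, uint32_t sum)
{
        while (len > 1) {
                uint16_t w;
                memcpy(&w, buf, 2);
                sum += w;
                buf += 2;
                len -= 2;
        }
        if (len)
                sum += *buf;
        return sum;
}

/* What the patch effectively emits for a constant len of 8
   (sizeof(struct udphdr)): the asm there is addl; adcl; adcl $0,
   i.e. two 32-bit adds plus carry folding, no loop and no call. */
static uint32_t csum_const8(const unsigned char *buf, uint32_t sum)
{
        uint32_t w0, w1;
        uint64_t s;

        memcpy(&w0, buf, 4);
        memcpy(&w1, buf + 4, 4);
        s = (uint64_t)sum + w0 + w1;
        s = (s & 0xffffffff) + (s >> 32);       /* fold the carry back in */
        s = (s & 0xffffffff) + (s >> 32);
        return (uint32_t)s;
}

/* plays the role of csum_fold(): 32-bit partial sum -> final 16 bits */
static uint16_t fold16(uint32_t sum)
{
        sum = (sum & 0xffff) + (sum >> 16);
        sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

int main(void)
{
        unsigned char udph[8] = { 0x00,0x43, 0x00,0x44, 0x00,0x1c, 0x00,0x00 };

        printf("loop:     %04x\n", (unsigned)fold16(csum_loop(udph, sizeof udph, 0)));
        printf("constant: %04x\n", (unsigned)fold16(csum_const8(udph, 0)));
        return 0;
}

Both lines print the same value, since the one's-complement sum can be
accumulated in any word size as long as the carries are folded back in.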

Parts of this are ~100% correct, but there are also things I noticed
while working on something else and still have to recheck, some
strange compiler issues not fully investigated, etc. The checksum.S
parts are OK, but they (apparently ;) need a lot more comments. The
[345]86 routines are unchanged.

Highlights:
o csum_partial_copy() backward compatibility stub removed
o csum_partial() inlined for very small constant sizes (like struct udphdr)
o ip_fast_csum() - various optimizations
o csum_tcpudp_nofold() optimized for sum==0 case
o 686 csum_partial() optimized (13%..1% faster:
  13% for 32 bytes, but only 1% for 1480 bytes)
o 686 csum_partial_copy_generic() optimized (14%..20% faster)
o incremental checksum calculation fix
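
To make the last item concrete, here is a small user-space sketch of the
fixed incremental update (not taken from the patch; decrease_ttl() and
ip_checksum() are invented names), checked against a full recompute of
the header checksum:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>          /* htons/ntohs */

/* Full RFC 1071-style recomputation over 'words' 16-bit words; over a
   complete header it returns 0 iff the stored checksum is consistent. */
static uint16_t ip_checksum(const uint16_t *hdr, int words)
{
        uint32_t sum = 0;

        while (words--)
                sum += *hdr++;
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

/* Incremental update with the same logic as the patched ip_decrease_ttl():
   decrementing the TTL lowers the header sum by 0x0100, so the stored
   complement goes up by 0x0100, with an end-around carry, and with a
   would-be result of 0xffff [~(+0)] turned into 0x0000 [~(-0)]. */
static void decrease_ttl(uint8_t *iph)
{
        uint16_t check;
        uint32_t c;

        memcpy(&check, iph + 10, 2);    /* checksum is bytes 10-11 */
        c = (uint32_t)ntohs(check) + 0x0100;
        if (c >= 0xffff)                /* carry out, or the invalid +0 */
                c++;
        check = htons((uint16_t)c);
        memcpy(iph + 10, &check, 2);
        iph[8]--;                       /* TTL is byte 8 */
}

int main(void)
{
        /* a made-up 20-byte IPv4 header (ihl=5, ttl=64); values are arbitrary */
        uint8_t h[20] = {
                0x45, 0x00, 0x00, 0x54, 0x12, 0x34, 0x40, 0x00,
                0x40, 0x01, 0x00, 0x00, 0xc0, 0xa8, 0x00, 0x01,
                0xc0, 0xa8, 0x00, 0x02,
        };
        uint16_t w[10], c;

        memcpy(w, h, 20);
        c = ip_checksum(w, 10);         /* fill in the initial checksum */
        memcpy(h + 10, &c, 2);

        decrease_ttl(h);                /* forward one hop */

        memcpy(w, h, 20);
        printf("checksum after decrease_ttl(): %s\n",
               ip_checksum(w, 10) == 0 ? "still valid" : "BROKEN");
        return 0;
}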

diff -urNp /img/linux-2.3.17/arch/i386/kernel/i386_ksyms.c linux-2.3.17as/arch/i386/kernel/i386_ksyms.c
--- /img/linux-2.3.17/arch/i386/kernel/i386_ksyms.c Wed Sep 1 20:33:56 1999
+++ linux-2.3.17as/arch/i386/kernel/i386_ksyms.c Wed Sep 8 22:48:18 1999
@@ -47,7 +47,6 @@ EXPORT_SYMBOL_NOVERS(__down_failed_inter
EXPORT_SYMBOL_NOVERS(__down_failed_trylock);
EXPORT_SYMBOL_NOVERS(__up_wakeup);
/* Networking helper routines. */
-EXPORT_SYMBOL(csum_partial_copy);
EXPORT_SYMBOL(csum_partial_copy_generic);
/* Delay loops */
EXPORT_SYMBOL(__udelay);
diff -urNp /img/linux-2.3.17/arch/i386/lib/Makefile linux-2.3.17as/arch/i386/lib/Makefile
--- /img/linux-2.3.17/arch/i386/lib/Makefile Fri Aug 27 13:13:11 1999
+++ linux-2.3.17as/arch/i386/lib/Makefile Wed Sep 8 17:45:06 1999
@@ -6,7 +6,7 @@
$(CC) -D__ASSEMBLY__ $(AFLAGS) -traditional -c $< -o $*.o

L_TARGET = lib.a
-L_OBJS = checksum.o old-checksum.o delay.o \
+L_OBJS = checksum.o delay.o \
usercopy.o getuser.o putuser.o

include $(TOPDIR)/Rules.make
diff -urNp /img/linux-2.3.17/arch/i386/lib/checksum.S linux-2.3.17as/arch/i386/lib/checksum.S
--- /img/linux-2.3.17/arch/i386/lib/checksum.S Thu May 20 01:59:04 1999
+++ linux-2.3.17as/arch/i386/lib/checksum.S Fri Sep 10 02:50:45 1999
@@ -36,8 +36,8 @@ unsigned int csum_partial(const unsigned
*/

.text
-.align 4
-.globl csum_partial
+.align 32
+.globl _csum_partial

#if CPU!=686

@@ -48,7 +48,7 @@ unsigned int csum_partial(const unsigned
* Fortunately, it is easy to convert 2-byte alignment to 4-byte
* alignment for the unrolled loop.
*/
-csum_partial:
+_csum_partial:
pushl %esi
pushl %ebx
movl 20(%esp),%eax # Function arg: unsigned int sum
@@ -116,47 +116,29 @@ csum_partial:

#else /* CPU==686 */

-csum_partial:
+_csum_partial:
+ movl 16-4*2(%esp),%edx # Function arg2: int len
pushl %esi
+ movl 12-4*1(%esp),%esi # Function arg1: const unsigned char *buf
pushl %ebx
- movl 20(%esp),%eax # Function arg: unsigned int sum
- movl 16(%esp),%ecx # Function arg: int len
- movl 12(%esp),%esi # Function arg: const unsigned char *buf
-
- testl $2, %esi
+ movl 20-4*0(%esp),%eax # Function arg3: unsigned int sum
+ testl $2, %esi
jnz 30f
10:
- movl %ecx, %edx
- movl %ecx, %ebx
+ movl %edx, %ebx
+ movl %edx, %ecx
andl $0x7c, %ebx
- shrl $7, %ecx
+ shrl $7, %edx
addl %ebx,%esi
shrl $2, %ebx
negl %ebx
+ testl %edx,%edx
lea 45f(%ebx,%ebx,2), %ebx
- testl %esi, %esi
jmp *%ebx

- # Handle 2-byte-aligned regions
-20: addw (%esi), %ax
- lea 2(%esi), %esi
- adcl $0, %eax
- jmp 10b
-
-30: subl $2, %ecx
- ja 20b
- je 32f
- movzbl (%esi),%ebx # csumming 1 byte, 2-aligned
- addl %ebx, %eax
- adcl $0, %eax
- jmp 80f
-32:
- addw (%esi), %ax # csumming 2 bytes, 2-aligned
- adcl $0, %eax
- jmp 80f
-
40:
- addl -128(%esi), %eax
+ leal 128(%esi), %esi # %esi += 128
+ addl -128(%esi), %eax # CF==0 and a plain 'add' is faster
adcl -124(%esi), %eax
adcl -120(%esi), %eax
adcl -116(%esi), %eax
@@ -189,27 +171,44 @@ csum_partial:
adcl -8(%esi), %eax
adcl -4(%esi), %eax
45:
- lea 128(%esi), %esi
- adcl $0, %eax
- dec %ecx
+ adcl $0, %eax # %eax += CF
+ dec %edx
jge 40b
- movl %edx, %ecx
-50: andl $3, %ecx
- jz 80f

- # Handle the last 1-3 bytes without jumping
- notl %ecx # 1->2, 2->1, 3->0, higher bits are masked
- movl $0xffffff,%ebx # by the shll and shrl instructions
- shll $3,%ecx
- shrl %cl,%ebx
- andl -128(%esi),%ebx # esi is 4-aligned so should be ok
- addl %ebx,%eax
- adcl $0,%eax
+ andl $3, %ecx
+ jz 80f
+ # Handle the last 1-3 bytes without jumping
+ notb %cl # 1->2, 2->1, 3->0, higher bits are masked
+ movl $0xffffff,%edx # by the shll and shrl instructions
+ shlb $3,%cl
+ shrl %cl,%edx # note: arg1 must be CL
+ andl (%esi),%edx # esi is 4-aligned so should be ok
+ addl %edx,%eax
+75:
+ adcl $0,%eax # %eax += CF
80:
- popl %ebx
- popl %esi
+ popl %ebx
+ popl %esi
ret
-
+
+ # Handle 2-byte-aligned regions
+20:
+ addw (%esi), %ax
+ lea 2(%esi), %esi
+ adcl $0, %eax
+ jmp 10b
+
+30:
+ subl $2, %edx
+ ja 20b
+ js 35f
+ addw (%esi), %ax # csumming 2 bytes, 2-aligned
+ jmp 75b
+
+35:
+ movzbl (%esi),%ecx # csumming 1 byte, 2-aligned
+ addl %ecx, %eax
+ jmp 75b
#endif /* CPU==686 */

/*
@@ -240,13 +239,13 @@ unsigned int csum_partial_copy_generic (
.long 9999b, 6002f ; \
.previous

-.align 4
+.align 32
.globl csum_partial_copy_generic

#if CPU!=686

#define ARGBASE 16
-#define FP 12
+#define FP 12

csum_partial_copy_generic:
subl $4,%esp
@@ -371,81 +370,119 @@ DST( movb %cl, (%edi) )

/* Version for PentiumII/PPro */

+/*
+ This is
+ o 70% slower when the source is not 32 bit aligned [ie (long)src&3]
+ o 190% slower when the destination is not 32 bit aligned
+ o 260% slower when both source and destination are not 32 bit aligned
+ o 175% slower when destination is not 64 bit aligned and source _is_ [ie (long)dst&4]
+ o whether source is 64 bit aligned or not does not seem to make much difference
+ */
+
#define ROUND1(x) \
- SRC(movl x(%esi), %ebx ) ; \
- addl %ebx, %eax ; \
- DST(movl %ebx, x(%edi) ) ;
+ SRC(movl x(%esi), %edx ) ;\
+ addl %edx, %eax ;\
+ SRC(movl x+4(%esi), %ebx ) ;\
+ DST(movl %edx, x(%edi) ) ;\
+ adcl %ebx, %eax ;\
+ DST(movl %ebx, x+4(%edi) ) ;\

#define ROUND(x) \
- SRC(movl x(%esi), %ebx ) ; \
- adcl %ebx, %eax ; \
- DST(movl %ebx, x(%edi) ) ;
+ SRC(movl x(%esi), %edx ) ;\
+ adcl %edx, %eax ;\
+ SRC(movl x+4(%esi), %ebx ) ;\
+ DST(movl %edx, x(%edi) ) ;\
+ adcl %ebx, %eax ;\
+ DST(movl %ebx, x+4(%edi) ) ;\
+
+#define ROUNDL(x) \
+ SRC(movl x(%esi), %edx ) ;\
+ adcl %edx, %eax ;\
+ SRC(movl x+4(%esi), %ebx ) ;\
+ adcl %ebx, %eax ;\
+ DST(movl %edx, x(%edi) ) ;\
+ DST(movl %ebx, x+4(%edi) ) ;\

#define ARGBASE 12

csum_partial_copy_generic:
pushl %ebx
- pushl %edi
+ movl ARGBASE+12-4*2(%esp),%ebx #len
pushl %esi
- movl ARGBASE+4(%esp),%esi #src
- movl ARGBASE+8(%esp),%edi #dst
- movl ARGBASE+12(%esp),%ecx #len
- movl ARGBASE+16(%esp),%eax #sum
- movl %ecx, %edx
- movl %ecx, %ebx
- shrl $6, %ecx
- andl $0x3c, %ebx
+ movl ARGBASE+4-4*1(%esp),%esi #src
+ movl %ebx, %ecx
+ pushl %edi
+ movl ARGBASE+8-4*0(%esp),%edi #dst
+ andl $0x38, %ebx
+ addl %ebx, %esi
+ shrl $6, %ecx # len /= 64 (amount of longwords per iteration)
+ addl %ebx, %edi
negl %ebx
- subl %ebx, %esi
- subl %ebx, %edi
+ movl ARGBASE+16-4*0(%esp),%eax #sum
lea 3f(%ebx,%ebx), %ebx
- testl %esi, %esi
+ testl %eax,%eax # CF=0
jmp *%ebx
-1: addl $64,%esi
+1:
+ addl $64,%esi
addl $64,%edi
- ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
- ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
- ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
- ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4)
-3: adcl $0,%eax
+ ROUND1(-64) ROUND (-56)
+ ROUND (-48) ROUND (-40)
+ ROUND (-32) ROUND (-24)
+ ROUND (-16) ROUNDL(-8)
+3:
+ adcl $0,%eax
dec %ecx
jge 1b
-4: andl $3, %edx
+
+ movl ARGBASE+12(%esp),%edx #len
+
+ testl $4,%edx
+ jz 4f
+ SRC(movl (%esi), %ebx )
+ addl %ebx, %eax
+ DST(movl %ebx, (%edi) )
+ leal 4(%esi), %esi
+ leal 4(%edi), %edi
+ adcl $0, %eax
+4:
+ andl $3, %edx
jz 7f
cmpl $2, %edx
jb 5f
SRC( movw (%esi), %dx )
- leal 2(%esi), %esi
DST( movw %dx, (%edi) )
- leal 2(%edi), %edi
je 6f
+ leal 2(%esi), %esi
shll $16,%edx
+ leal 2(%edi), %edi
5:
SRC( movb (%esi), %dl )
DST( movb %dl, (%edi) )
-6: addl %edx, %eax
+6:
+ addl %edx, %eax
adcl $0, %eax
7:
.section .fixup, "ax"
-6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
- movl $-EFAULT, (%ebx)
+6001:
# zero the complete destination (computing the rest is too much work)
movl ARGBASE+8(%esp),%edi # dst
movl ARGBASE+12(%esp),%ecx # len
+ movl ARGBASE+20(%esp), %ebx # src_err_ptr
xorl %eax,%eax
+ movl $-EFAULT, (%ebx)
rep; stosb
- jmp 7b
+ jmp 7b
6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
movl $-EFAULT, (%ebx)
jmp 7b
.previous

- popl %esi
popl %edi
+ popl %esi
popl %ebx
ret

#undef ROUND
#undef ROUND1
-
+
#endif /* CPU==i686 */
diff -urNp /img/linux-2.3.17/arch/i386/lib/old-checksum.c linux-2.3.17as/arch/i386/lib/old-checksum.c
--- /img/linux-2.3.17/arch/i386/lib/old-checksum.c Sun Dec 27 18:32:09 1998
+++ linux-2.3.17as/arch/i386/lib/old-checksum.c Thu Jan 1 00:00:00 1970
@@ -1,19 +0,0 @@
-/*
- * FIXME: old compatibility stuff, will be removed soon.
- */
-
-#include <net/checksum.h>
-
-unsigned int csum_partial_copy( const char *src, char *dst, int len, int sum)
-{
- int src_err=0, dst_err=0;
-
- sum = csum_partial_copy_generic ( src, dst, len, sum, &src_err, &dst_err);
-
- if (src_err || dst_err)
- printk("old csum_partial_copy_fromuser(), tell mingo to convert me.\n");
-
- return sum;
-}
-
-
diff -urNp /img/linux-2.3.17/drivers/net/ibmtr.c linux-2.3.17as/drivers/net/ibmtr.c
--- /img/linux-2.3.17/drivers/net/ibmtr.c Fri Aug 27 13:13:00 1999
+++ linux-2.3.17as/drivers/net/ibmtr.c Wed Sep 8 17:34:37 1999
@@ -1598,7 +1598,7 @@ static void tr_rx(struct net_device *dev
/* Copy the payload... */
for (;;) {
if (IPv4_p)
- chksum = csum_partial_copy(bus_to_virt(rbufdata), data,
+ chksum = csum_partial_copy_nocheck(bus_to_virt(rbufdata), data,
length < rbuffer_len ? length : rbuffer_len,
chksum);
else
diff -urNp /img/linux-2.3.17/include/asm-i386/checksum.h linux-2.3.17as/include/asm-i386/checksum.h
--- /img/linux-2.3.17/include/asm-i386/checksum.h Sun Dec 27 18:39:50 1998
+++ linux-2.3.17as/include/asm-i386/checksum.h Wed Sep 8 17:34:37 1999
@@ -14,7 +14,40 @@
*
* it's best to have buff aligned on a 32-bit boundary
*/
-asmlinkage unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum);
+
+#define csum_partial(buff,len,sum) \
+( \
+ __builtin_constant_p(len) ? \
+ _csum_partial_clen((buff),(len),(sum)) : \
+ _csum_partial((buff),(len),(sum)) \
+)
+asmlinkage unsigned int _csum_partial(const unsigned char * buff, int len, unsigned int sum);
+
+extern inline unsigned int _csum_partial_clen(const unsigned char *buff, int len, unsigned int sum)
+{
+ switch ( len )
+ {
+ case 4:
+ asm (
+ "addl (%1), %0\n\t"
+ "adcl $0, %0" /* += CF */
+ : "=&r" (sum)
+ : "r" (buff), "0" (sum)
+ );
+ return sum;
+ case 8: /* eg sizeof(struct udphdr) */
+ asm (
+ "addl (%1), %0\n\t"
+ "adcl 4(%1), %0\n\t"
+ "adcl $0, %0" /* += CF */
+ : "=&r" (sum)
+ : "r" (buff), "0" (sum)
+ );
+ return sum;
+ default:
+ return _csum_partial(buff,len,sum);
+ }
+}

/*
* the same as csum_partial, but copies from src while it
@@ -23,10 +56,12 @@ asmlinkage unsigned int csum_partial(con
* here even more important to align src and dst on a 32-bit (or even
* better 64-bit) boundary
*/
-
+/* automatically inlining this routine isn't as obviously a win as with the
+   checksum alone (exceptions must be handled, and currently (2.3.6) only the
+   icmp code would benefit from this anyway)
+ */
asmlinkage unsigned int csum_partial_copy_generic( const char *src, char *dst, int len, int sum,
int *src_err_ptr, int *dst_err_ptr);
-
/*
* Note: when you get a NULL pointer exception here this means someone
* passed in an incorrect kernel address to one of these functions.
@@ -34,22 +69,13 @@ asmlinkage unsigned int csum_partial_cop
* If you use these functions directly please don't forget the
* verify_area().
*/
-extern __inline__
-unsigned int csum_partial_copy_nocheck ( const char *src, char *dst,
- int len, int sum)
-{
- return csum_partial_copy_generic ( src, dst, len, sum, NULL, NULL);
-}
-
-extern __inline__
-unsigned int csum_partial_copy_from_user ( const char *src, char *dst,
- int len, int sum, int *err_ptr)
-{
- return csum_partial_copy_generic ( src, dst, len, sum, err_ptr, NULL);
-}
+#define csum_partial_copy_nocheck(src,dst,len,sum) \
+ csum_partial_copy_generic((src), (dst), (len), (sum), NULL, NULL)
+
+#define csum_partial_copy_from_user(src,dst,len,sum,err_ptr) \
+ csum_partial_copy_generic((src), (dst), (len), (sum), (err_ptr), NULL)

#if 0
-
/* Not used at the moment. It is difficult to imagine for what purpose
it can be used :-) Please, do not forget to verify_area before it --ANK
*/
@@ -57,32 +83,18 @@ unsigned int csum_partial_copy_from_user
/*
* This combination is currently not used, but possible:
*/
-
-extern __inline__
-unsigned int csum_partial_copy_to_user ( const char *src, char *dst,
- int len, int sum, int *err_ptr)
-{
- return csum_partial_copy_generic ( src, dst, len, sum, NULL, err_ptr);
-}
+#define csum_partial_copy_to_user(src,dst,len,sum,err_ptr) \
+ csum_partial_copy_generic((src), (dst), (len), (sum), NULL, (err_ptr))
#endif

/*
- * These are the old (and unsafe) way of doing checksums, a warning message will be
- * printed if they are used and an exeption occurs.
- *
- * these functions should go away after some time.
- */
-
-#define csum_partial_copy_fromuser csum_partial_copy
-unsigned int csum_partial_copy( const char *src, char *dst, int len, int sum);
-
-/*
* This is a version of ip_compute_csum() optimized for IP headers,
* which always checksum on 4 octet boundaries.
*
* By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
* Arnt Gulbrandsen.
*/
+#if 0
static inline unsigned short ip_fast_csum(unsigned char * iph,
unsigned int ihl) {
unsigned int sum;
@@ -109,16 +121,74 @@ static inline unsigned short ip_fast_csu
/* Since the input registers which are loaded with iph and ipl
are modified, we must also specify them as outputs, or gcc
will assume they contain their original values. */
- : "=r" (sum), "=r" (iph), "=r" (ihl)
+ : "=&r" (sum), "=r" (iph), "=r" (ihl)
: "1" (iph), "2" (ihl));
return(sum);
}
+#else
+ /* ihl is the number of 32-bit words and is always >= 5. */
+#define ip_fast_csum(iph,ihl) csum_fold(ip_fast_csum_nofold((iph),(ihl)))
+
+#define ip_fast_csum_nofold(iph,ihl) \
+( \
+ __builtin_constant_p(ihl) ? \
+ _ip_fast_csum_nofold_cihl((iph),(ihl)) : \
+ _ip_fast_csum_nofold((iph),(ihl)) \
+)
+extern inline unsigned int _ip_fast_csum_nofold(unsigned char *iph, unsigned int ihl)
+{
+ __u32 sum;
+
+ __asm__ __volatile__(
+ "movl (%1), %0\n\t"
+ "addl 4(%1), %0\n\t"
+ "adcl 8(%1), %0\n\t"
+ "adcl 12(%1), %0\n"
+ "1:\n\t"
+ "adcl 16(%1), %0\n\t"
+ "decl %2\n\t"
+ "lea 4(%1), %1\n\t"
+ "jne 1b\n\t"
+ "adcl $0, %0" /* += CF */
+
+ /* Since the input registers which are loaded with iph and ipl
+ are modified, we must also specify them as outputs, or gcc
+ will assume they contain their original values. */
+ : "=&r" (sum), "=r" (iph), "=r" (ihl)
+ : "1" (iph), "2" (ihl-4));
+
+ return sum;
+}
+extern inline unsigned int _ip_fast_csum_nofold_cihl(unsigned char *iph, unsigned int ihl)
+{
+ __u32 sum;
+
+ switch ( ihl ) /* "ihl" is a constant - pick the best asm sequence */
+ {
+ case 5:
+ __asm__ __volatile__ (
+ "movl (%1), %0\n\t"
+ "addl 4(%1), %0\n\t"
+ "adcl 8(%1), %0\n\t"
+ "adcl 12(%1), %0\n\t"
+ "adcl 16(%1), %0\n\t"
+ "adcl $0, %0" /* += CF */
+
+ : "=&r" (sum)
+ : "r" (iph) );
+
+ return sum;
+ default:
+ return _ip_fast_csum_nofold(iph,ihl); /* never happens */
+ }
+}
+#endif

/*
* Fold a partial checksum
*/

-static inline unsigned int csum_fold(unsigned int sum)
+static inline __u16 csum_fold(unsigned int sum)
{
__asm__("
addl %1, %0
@@ -130,7 +200,14 @@ static inline unsigned int csum_fold(uns
return (~sum) >> 16;
}

-static inline unsigned long csum_tcpudp_nofold(unsigned long saddr,
+ /* This is often called with a const "sum"==0 */
+#define csum_tcpudp_nofold(saddr,daddr,len,proto,sum) \
+( \
+ __builtin_constant_p(sum) ? \
+ _csum_tcpudp_nofold_csum((saddr),(daddr),(len),(proto),(sum)) : \
+ _csum_tcpudp_nofold((saddr),(daddr),(len),(proto),(sum)) \
+)
+static inline unsigned long _csum_tcpudp_nofold(unsigned long saddr,
unsigned long daddr,
unsigned short len,
unsigned short proto,
@@ -142,32 +219,47 @@ static inline unsigned long csum_tcpudp_
adcl %3, %0
adcl $0, %0
"
- : "=r" (sum)
- : "g" (daddr), "g"(saddr), "g"((ntohs(len)<<16)+proto*256), "0"(sum));
+ : "=&r" (sum)
+ : "g" (daddr), "g"(saddr), "g"((htons(len)<<16)+proto*256), "0"(sum));
return sum;
}
+static inline unsigned long _csum_tcpudp_nofold_csum( unsigned long saddr,
+ unsigned long daddr,
+ unsigned short len,
+ unsigned short proto,
+ unsigned int sum )
+{
+ switch ( sum ) /* "sum" is a constant - pick the best asm sequence */
+ {
+ case 0: /* sum==0 is special:
+ * - no need to use "sum" at all
+ * - no need to clear the register - just start with another arg
+ */
+ __asm__(
+ "addl %1, %0\n\t"
+ "adcl %2, %0\n\t"
+ "adcl $0, %0"
+ : "=&r" (sum)
+ : "g" (daddr), "g"(saddr), "0"((htons(len)<<16)+proto*256) );
+ return sum;
+
+ default: /* not optimized - never happens */
+ return _csum_tcpudp_nofold(saddr,daddr,len,proto,sum);
+ }
+}

/*
* computes the checksum of the TCP/UDP pseudo-header
* returns a 16-bit checksum, already complemented
*/
-static inline unsigned short int csum_tcpudp_magic(unsigned long saddr,
- unsigned long daddr,
- unsigned short len,
- unsigned short proto,
- unsigned int sum)
-{
- return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
-}
-
+#define csum_tcpudp_magic(saddr,daddr,len,proto,sum) \
+ csum_fold(csum_tcpudp_nofold((saddr),(daddr),(len),(proto),(sum)))
+
/*
* this routine is used for miscellaneous IP-like checksums, mainly
* in icmp.c
*/
-
-static inline unsigned short ip_compute_csum(unsigned char * buff, int len) {
- return csum_fold (csum_partial(buff, len, 0));
-}
+#define ip_compute_csum(buff,len) csum_fold(csum_partial((buff), (len), 0))

#define _HAVE_ARCH_IPV6_CSUM
static __inline__ unsigned short int csum_ipv6_magic(struct in6_addr *saddr,
@@ -191,13 +283,14 @@ static __inline__ unsigned short int csu
"
: "=&r" (sum)
: "r" (saddr), "r" (daddr),
- "r"(htonl((__u32) (len))), "r"(htonl(proto)), "0"(sum));
+ "g"(htonl((__u32) (len))), "g"(htonl(proto)), "0"(sum));

return csum_fold(sum);
}

/*
* Copy and checksum to user
+ * (this is currently never called with a constant len - AS)
*/
#define HAVE_CSUM_COPY_USER
static __inline__ unsigned int csum_and_copy_to_user (const char *src, char *dst,
diff -urNp /img/linux-2.3.17/include/net/ip.h linux-2.3.17as/include/net/ip.h
--- /img/linux-2.3.17/include/net/ip.h Wed Sep 1 20:44:32 1999
+++ linux-2.3.17as/include/net/ip.h Fri Sep 10 02:55:57 1999
@@ -13,6 +13,8 @@
*
* Changes:
* Mike McLagan : Routing by source
+ * 19990714 Artur Skawina <skawina@geocities.com> :
+ * Fixed incremental IP checksum calculation (ip_decrease_ttl)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -148,11 +150,18 @@ extern __inline__ int ip_send(struct sk_
extern __inline__
int ip_decrease_ttl(struct iphdr *iph)
{
- u16 check = iph->check;
- check = ntohs(check) + 0x0100;
- if ((check & 0xFF00) == 0)
- check++; /* carry overflow */
- iph->check = htons(check);
+/* iph->check contains the complement of the sum, in one's complement
+ arithmetic. We decrease one of the values (ttl), so we also have to
+ decrement the sum (not the complement).
+ Note that there are two different representations of zero, of which only
+ one ["-0"] is valid here. This means that eg. 0xfeff [~(0x0100)] must
+ become 0x0000 [~(-0)], not 0xffff [~(+0)]. /AS
+ */
+ u32 check = (u32)ntohs(iph->check) + 0x0100;
+ if (check >= 0xffff) /* check for overflow (incl. +0) */
+ check++;
+ iph->check = htons((u16)check);
+
return --iph->ttl;
}

diff -urNp /img/linux-2.3.17/net/ethernet/eth.c linux-2.3.17as/net/ethernet/eth.c
--- /img/linux-2.3.17/net/ethernet/eth.c Fri Aug 27 13:13:03 1999
+++ linux-2.3.17as/net/ethernet/eth.c Wed Sep 8 17:34:37 1999
@@ -298,7 +298,7 @@ void eth_copy_and_sum(struct sk_buff *de
if ((ip_length <= length) && (ip_length > 7))
length=ip_length;

- dest->csum=csum_partial_copy(src+sizeof(struct iphdr)+ETH_HLEN,dest->data+sizeof(struct iphdr)+ETH_HLEN,length,base);
+ dest->csum=csum_partial_copy_nocheck(src+sizeof(struct iphdr)+ETH_HLEN,dest->data+sizeof(struct iphdr)+ETH_HLEN,length,base);
dest->ip_summed=1;
}

diff -urNp /img/linux-2.3.17/net/ipv4/icmp.c linux-2.3.17as/net/ipv4/icmp.c
--- /img/linux-2.3.17/net/ipv4/icmp.c Wed Sep 1 20:34:30 1999
+++ linux-2.3.17as/net/ipv4/icmp.c Wed Sep 8 17:34:37 1999
@@ -483,7 +483,7 @@ static int icmp_glue_bits(const void *p,
unsigned long csum;

if (offset) {
- icmp_param->csum=csum_partial_copy(icmp_param->data_ptr+offset-sizeof(struct icmphdr),
+ icmp_param->csum=csum_partial_copy_nocheck(icmp_param->data_ptr+offset-sizeof(struct icmphdr),
to, fraglen,icmp_param->csum);
return 0;
}
@@ -493,10 +493,10 @@ static int icmp_glue_bits(const void *p,
* the other fragments first, so that we get the checksum
* for the whole packet here.
*/
- csum = csum_partial_copy((void *)&icmp_param->icmph,
+ csum = csum_partial_copy_nocheck((void *)&icmp_param->icmph,
to, sizeof(struct icmphdr),
icmp_param->csum);
- csum = csum_partial_copy(icmp_param->data_ptr,
+ csum = csum_partial_copy_nocheck(icmp_param->data_ptr,
to+sizeof(struct icmphdr),
fraglen-sizeof(struct icmphdr), csum);
icmph=(struct icmphdr *)to;
diff -urNp /img/linux-2.3.17/net/ipv4/ip_output.c linux-2.3.17as/net/ipv4/ip_output.c
--- /img/linux-2.3.17/net/ipv4/ip_output.c Wed Sep 1 20:34:30 1999
+++ linux-2.3.17as/net/ipv4/ip_output.c Wed Sep 8 17:34:37 1999
@@ -733,8 +733,15 @@ int ip_build_xmit(struct sock *sk,
iph->saddr=rt->rt_src;
iph->daddr=rt->rt_dst;
iph->check=0;
+#if 1
+ /* ugh - egcs 1.0 and 1.1 miscompiles this.
+ egcs 2.95 19990615 prerelease seems to do ok */
+ iph->check = ip_fast_csum((unsigned char *)iph, /*iph->ihl*/5);
+ err = getfrag(frag, ((char *)iph)+/*iph->ihl*/5*4,0, length-/*iph->ihl*/5*4);
+#else
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4);
+#endif
}
else
err = getfrag(frag, (void *)iph, 0, length);
diff -urNp /img/linux-2.3.17/net/ipv4/ipconfig.c linux-2.3.17as/net/ipv4/ipconfig.c
--- /img/linux-2.3.17/net/ipv4/ipconfig.c Fri Aug 27 13:13:15 1999
+++ linux-2.3.17as/net/ipv4/ipconfig.c Wed Sep 8 17:34:37 1999
@@ -524,7 +524,7 @@ static void __init ic_bootp_send_if(stru
h->ttl = 64;
h->protocol = IPPROTO_UDP;
h->daddr = INADDR_BROADCAST;
- h->check = ip_fast_csum((unsigned char *) h, h->ihl);
+ h->check = ip_fast_csum((unsigned char *) h, /*h->ihl*/5);

/* Construct UDP header */
b->udph.source = htons(68);
@@ -635,7 +635,7 @@ static int __init ic_bootp_recv(struct s
skb->len < sizeof(struct udphdr) + sizeof(struct iphdr) ||
h->ihl != 5 ||
h->version != 4 ||
- ip_fast_csum((char *) h, h->ihl) != 0 ||
+ ip_fast_csum((char *) h, /*h->ihl*/5) != 0 ||
skb->len < ntohs(h->tot_len) ||
h->protocol != IPPROTO_UDP ||
b->udph.source != htons(67) ||
diff -urNp /img/linux-2.3.17/net/ipv4/tcp_output.c linux-2.3.17as/net/ipv4/tcp_output.c
--- /img/linux-2.3.17/net/ipv4/tcp_output.c Fri Aug 27 13:13:15 1999
+++ linux-2.3.17as/net/ipv4/tcp_output.c Wed Sep 8 17:34:37 1999
@@ -276,7 +276,7 @@ static int tcp_fragment(struct sock *sk,
TCP_SKB_CB(buff)->sacked = 0;

/* Copy and checksum data tail into the new buffer. */
- buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize),
+ buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
nsize, 0);

/* This takes care of the FIN sequence number too. */
@@ -542,7 +542,7 @@ static void tcp_retrans_try_collapse(str
/* Optimize, actually we could also combine next_skb->csum
* to skb->csum using a single add w/carry operation too.
*/
- skb->csum = csum_partial_copy(next_skb->data,
+ skb->csum = csum_partial_copy_nocheck(next_skb->data,
skb_put(skb, next_skb_size),
next_skb_size, skb->csum);
}
diff -urNp /img/linux-2.3.17/net/ipv6/icmp.c linux-2.3.17as/net/ipv6/icmp.c
--- /img/linux-2.3.17/net/ipv6/icmp.c Fri Aug 27 13:13:15 1999
+++ linux-2.3.17as/net/ipv6/icmp.c Wed Sep 8 17:34:37 1999
@@ -146,17 +146,17 @@ static int icmpv6_getfrag(const void *da
*/

if (offset) {
- csum = csum_partial_copy((void *) msg->data +
+ csum = csum_partial_copy_nocheck((void *) msg->data +
offset - sizeof(struct icmp6hdr),
buff, len, msg->csum);
msg->csum = csum;
return 0;
}

- csum = csum_partial_copy((void *) &msg->icmph, buff,
+ csum = csum_partial_copy_nocheck((void *) &msg->icmph, buff,
sizeof(struct icmp6hdr), msg->csum);

- csum = csum_partial_copy((void *) msg->data,
+ csum = csum_partial_copy_nocheck((void *) msg->data,
buff + sizeof(struct icmp6hdr),
len - sizeof(struct icmp6hdr), csum);

diff -urNp /img/linux-2.3.17/net/netsyms.c linux-2.3.17as/net/netsyms.c
--- /img/linux-2.3.17/net/netsyms.c Wed Sep 1 20:34:33 1999
+++ linux-2.3.17as/net/netsyms.c Wed Sep 8 17:34:37 1999
@@ -200,7 +200,11 @@ EXPORT_SYMBOL(scm_fp_dup);
EXPORT_SYMBOL(max_files);
EXPORT_SYMBOL(do_mknod);
EXPORT_SYMBOL(memcpy_toiovec);
+#ifndef csum_partial
EXPORT_SYMBOL(csum_partial);
+#else
+EXPORT_SYMBOL(_csum_partial);
+#endif

#ifdef CONFIG_IPX_MODULE
EXPORT_SYMBOL(make_8023_client);

