kumon@flab.fujitsu.co.jp wrote:
>
> In the heavy duty case, csum_partial_copy_generic() becomes the new
> winner of the worst time consuming function with the poll()
> optimization. We are arranging the global figure now.
>
> Though csum_partial_copy_generic() is highly optimized with
> hand-crafted code, it eats lots of time. It may be inevitable, but may
> be reducible. We are now investigating why it does.
csum_partial_copy_generic() could certainly be optimized further;
attached is a snapshot of a version that does up to 20% better in
dumb benchmarks. Whether, and how much, difference it makes for real
loads I haven't yet measured.
[patch vs 2.3.99pre6pre5, offsets are wrong]
diff -urNp /img/linux-2.3.99pre6pre5/arch/i386/lib/checksum.S linux-2.3.99pre6pre5as/arch/i386/lib/checksum.S
--- /img/linux-2.3.99pre6pre5/arch/i386/lib/checksum.S Wed Mar 29 20:53:25 2000
+++ linux-2.3.99pre6pre5as/arch/i386/lib/checksum.S Sat Apr 22 10:43:28 2000
@@ -374,81 +373,119 @@ DST( movb %cl, (%edi) )
/* Version for PentiumII/PPro */
+/*
+ This is
+ o 70% slower when the source is not 32 bit aligned [ie (long)src&3]
+ o 190% slower when the destination is not 32 bit aligned
+ o 260% slower when both source and destination are not 32 bit aligned
+ o 175% slower when destination is not 64 bit aligned and source _is_ [ie (long)dst&4]
+ o whether source is 64 bit aligned or not does not seem to make much difference
+ */
+
#define ROUND1(x) \
- SRC(movl x(%esi), %ebx ) ; \
- addl %ebx, %eax ; \
- DST(movl %ebx, x(%edi) ) ;
+ SRC(movl x(%esi), %edx ) ;\
+ addl %edx, %eax ;\
+ SRC(movl x+4(%esi), %ebx ) ;\
+ DST(movl %edx, x(%edi) ) ;\
+ adcl %ebx, %eax ;\
+ DST(movl %ebx, x+4(%edi) ) ;\
#define ROUND(x) \
- SRC(movl x(%esi), %ebx ) ; \
- adcl %ebx, %eax ; \
- DST(movl %ebx, x(%edi) ) ;
+ SRC(movl x(%esi), %edx ) ;\
+ adcl %edx, %eax ;\
+ SRC(movl x+4(%esi), %ebx ) ;\
+ DST(movl %edx, x(%edi) ) ;\
+ adcl %ebx, %eax ;\
+ DST(movl %ebx, x+4(%edi) ) ;\
+
+#define ROUNDL(x) \
+ SRC(movl x(%esi), %edx ) ;\
+ adcl %edx, %eax ;\
+ SRC(movl x+4(%esi), %ebx ) ;\
+ adcl %ebx, %eax ;\
+ DST(movl %edx, x(%edi) ) ;\
+ DST(movl %ebx, x+4(%edi) ) ;\
#define ARGBASE 12
csum_partial_copy_generic:
pushl %ebx
- pushl %edi
+ movl ARGBASE+12-4*2(%esp),%ebx #len
pushl %esi
- movl ARGBASE+4(%esp),%esi #src
- movl ARGBASE+8(%esp),%edi #dst
- movl ARGBASE+12(%esp),%ecx #len
- movl ARGBASE+16(%esp),%eax #sum
- movl %ecx, %edx
- movl %ecx, %ebx
- shrl $6, %ecx
- andl $0x3c, %ebx
+ movl ARGBASE+4-4*1(%esp),%esi #src
+ movl %ebx, %ecx
+ pushl %edi
+ movl ARGBASE+8-4*0(%esp),%edi #dst
+ andl $0x38, %ebx
+ addl %ebx, %esi
+ shrl $6, %ecx # len /= 64 (number of longwords per iteration)
+ addl %ebx, %edi
negl %ebx
- subl %ebx, %esi
- subl %ebx, %edi
+ movl ARGBASE+16-4*0(%esp),%eax #sum
lea 3f(%ebx,%ebx), %ebx
- testl %esi, %esi
+ testl %eax,%eax # CF=0
jmp *%ebx
-1: addl $64,%esi
+1:
+ addl $64,%esi
addl $64,%edi
- ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
- ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
- ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
- ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4)
-3: adcl $0,%eax
+ ROUND1(-64) ROUND (-56)
+ ROUND (-48) ROUND (-40)
+ ROUND (-32) ROUND (-24)
+ ROUND (-16) ROUNDL(-8)
+3:
+ adcl $0,%eax
dec %ecx
jge 1b
-4: andl $3, %edx
+
+ movl ARGBASE+12(%esp),%edx #len
+
+ testl $4,%edx
+ jz 4f
+ SRC(movl (%esi), %ebx )
+ addl %ebx, %eax
+ DST(movl %ebx, (%edi) )
+ leal 4(%esi), %esi
+ leal 4(%edi), %edi
+ adcl $0, %eax
+4:
+ andl $3, %edx
jz 7f
cmpl $2, %edx
jb 5f
SRC( movw (%esi), %dx )
- leal 2(%esi), %esi
DST( movw %dx, (%edi) )
- leal 2(%edi), %edi
je 6f
+ leal 2(%esi), %esi
shll $16,%edx
+ leal 2(%edi), %edi
5:
SRC( movb (%esi), %dl )
DST( movb %dl, (%edi) )
-6: addl %edx, %eax
+6:
+ addl %edx, %eax
adcl $0, %eax
7:
.section .fixup, "ax"
-6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
- movl $-EFAULT, (%ebx)
+6001:
# zero the complete destination (computing the rest is too much work)
movl ARGBASE+8(%esp),%edi # dst
movl ARGBASE+12(%esp),%ecx # len
+ movl ARGBASE+20(%esp), %ebx # src_err_ptr
xorl %eax,%eax
+ movl $-EFAULT, (%ebx)
rep; stosb
- jmp 7b
+ jmp 7b
6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
movl $-EFAULT, (%ebx)
jmp 7b
.previous
- popl %esi
popl %edi
+ popl %esi
popl %ebx
ret
#undef ROUND
#undef ROUND1
-
+
#endif
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/
This archive was generated by hypermail 2b29 : Sun Apr 30 2000 - 21:00:07 EST