Re: lockless poll() (was Re: namei() query)

From: Artur Skawina (skawina@geocities.com)
Date: Mon Apr 24 2000 - 08:50:31 EST


kumon@flab.fujitsu.co.jp wrote:
>
> In the heavy-duty case, with the poll() optimization in place,
> csum_partial_copy_generic() becomes the new worst time-consuming
> function. We are putting together the overall figures now.
>
> Though csum_partial_copy_generic() is already highly optimized,
> hand-crafted code, it eats a lot of time. That may be inevitable, but
> it may also be reducible; we are investigating why.

csum_partial_copy_generic() could certainly be optimized further;
attached is a snapshot of a version that does up to 20% better in
dumb benchmarks. Whether, and by how much, it helps real loads
I haven't measured yet.
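
For reference, a rough C sketch (mine, not the kernel source) of what
csum_partial_copy_generic() computes: copy the buffer while accumulating
a 32-bit ones'-complement partial sum, which is what the adcl chains in
the asm below implement. The function name is made up for illustration,
and the loop ignores the unaligned tail and the user-space fault
handling the real routine has to do.

#include <stddef.h>
#include <stdint.h>

/* Copy len bytes (assumed to be a multiple of 4, both pointers 32-bit
 * aligned) while accumulating the checksum; the caller later folds the
 * 32-bit result down to 16 bits. */
static uint32_t csum_copy_sketch(const uint32_t *src, uint32_t *dst,
                                 size_t len, uint32_t sum)
{
        size_t n = len / 4;

        while (n--) {
                uint32_t w = *src++;

                *dst++ = w;
                sum += w;
                if (sum < w)    /* carry out -> end-around carry, like adcl */
                        sum++;
        }
        return sum;
}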

[patch vs 2.3.99pre6pre5, offsets are wrong]

diff -urNp /img/linux-2.3.99pre6pre5/arch/i386/lib/checksum.S linux-2.3.99pre6pre5as/arch/i386/lib/checksum.S
--- /img/linux-2.3.99pre6pre5/arch/i386/lib/checksum.S Wed Mar 29 20:53:25 2000
+++ linux-2.3.99pre6pre5as/arch/i386/lib/checksum.S Sat Apr 22 10:43:28 2000
@@ -374,81 +373,119 @@ DST( movb %cl, (%edi) )
 
 /* Version for PentiumII/PPro */
 
+/*
+ This is
+ o 70% slower when the source is not 32 bit aligned [ie (long)src&3]
+ o 190% slower when the destination is not 32 bit aligned
+ o 260% slower when both source and destination are not 32 bit aligned
+ o 175% slower when destination is not 64 bit aligned and source _is_ [ie (long)dst&4]
+ o whether source is 64 bit aligned or not does not seem to make much difference
+ */
+
 #define ROUND1(x) \
- SRC(movl x(%esi), %ebx ) ; \
- addl %ebx, %eax ; \
- DST(movl %ebx, x(%edi) ) ;
+ SRC(movl x(%esi), %edx ) ;\
+ addl %edx, %eax ;\
+ SRC(movl x+4(%esi), %ebx ) ;\
+ DST(movl %edx, x(%edi) ) ;\
+ adcl %ebx, %eax ;\
+ DST(movl %ebx, x+4(%edi) ) ;\
 
 #define ROUND(x) \
- SRC(movl x(%esi), %ebx ) ; \
- adcl %ebx, %eax ; \
- DST(movl %ebx, x(%edi) ) ;
+ SRC(movl x(%esi), %edx ) ;\
+ adcl %edx, %eax ;\
+ SRC(movl x+4(%esi), %ebx ) ;\
+ DST(movl %edx, x(%edi) ) ;\
+ adcl %ebx, %eax ;\
+ DST(movl %ebx, x+4(%edi) ) ;\
+
+#define ROUNDL(x) \
+ SRC(movl x(%esi), %edx ) ;\
+ adcl %edx, %eax ;\
+ SRC(movl x+4(%esi), %ebx ) ;\
+ adcl %ebx, %eax ;\
+ DST(movl %edx, x(%edi) ) ;\
+ DST(movl %ebx, x+4(%edi) ) ;\
 
 #define ARGBASE 12
                 
 csum_partial_copy_generic:
         pushl %ebx
- pushl %edi
+ movl ARGBASE+12-4*2(%esp),%ebx #len
         pushl %esi
- movl ARGBASE+4(%esp),%esi #src
- movl ARGBASE+8(%esp),%edi #dst
- movl ARGBASE+12(%esp),%ecx #len
- movl ARGBASE+16(%esp),%eax #sum
- movl %ecx, %edx
- movl %ecx, %ebx
- shrl $6, %ecx
- andl $0x3c, %ebx
+ movl ARGBASE+4-4*1(%esp),%esi #src
+ movl %ebx, %ecx
+ pushl %edi
+ movl ARGBASE+8-4*0(%esp),%edi #dst
+ andl $0x38, %ebx
+ addl %ebx, %esi
+ shrl $6, %ecx # len /= 64 (number of longwords per iteration)
+ addl %ebx, %edi
         negl %ebx
- subl %ebx, %esi
- subl %ebx, %edi
+ movl ARGBASE+16-4*0(%esp),%eax #sum
         lea 3f(%ebx,%ebx), %ebx
- testl %esi, %esi
+ testl %eax,%eax # CF=0
         jmp *%ebx
-1: addl $64,%esi
+1:
+ addl $64,%esi
         addl $64,%edi
- ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
- ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
- ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
- ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4)
-3: adcl $0,%eax
+ ROUND1(-64) ROUND (-56)
+ ROUND (-48) ROUND (-40)
+ ROUND (-32) ROUND (-24)
+ ROUND (-16) ROUNDL(-8)
+3:
+ adcl $0,%eax
         dec %ecx
         jge 1b
-4: andl $3, %edx
+
+ movl ARGBASE+12(%esp),%edx #len
+
+ testl $4,%edx
+ jz 4f
+ SRC(movl (%esi), %ebx )
+ addl %ebx, %eax
+ DST(movl %ebx, (%edi) )
+ leal 4(%esi), %esi
+ leal 4(%edi), %edi
+ adcl $0, %eax
+4:
+ andl $3, %edx
         jz 7f
         cmpl $2, %edx
         jb 5f
 SRC( movw (%esi), %dx )
- leal 2(%esi), %esi
 DST( movw %dx, (%edi) )
- leal 2(%edi), %edi
         je 6f
+ leal 2(%esi), %esi
         shll $16,%edx
+ leal 2(%edi), %edi
 5:
 SRC( movb (%esi), %dl )
 DST( movb %dl, (%edi) )
-6: addl %edx, %eax
+6:
+ addl %edx, %eax
         adcl $0, %eax
 7:
 .section .fixup, "ax"
-6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
- movl $-EFAULT, (%ebx)
+6001:
         # zero the complete destination (computing the rest is too much work)
         movl ARGBASE+8(%esp),%edi # dst
         movl ARGBASE+12(%esp),%ecx # len
+ movl ARGBASE+20(%esp), %ebx # src_err_ptr
         xorl %eax,%eax
+ movl $-EFAULT, (%ebx)
         rep; stosb
- jmp 7b
+ jmp 7b
 6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
         movl $-EFAULT, (%ebx)
         jmp 7b
 .previous
 
- popl %esi
         popl %edi
+ popl %esi
         popl %ebx
         ret
                                 
 #undef ROUND
 #undef ROUND1
-
+
 #endif

-