Re: [PATCH RFC] x86: Improve memset with general 64bit instruction

From: Ling Ma
Date: Mon Apr 07 2014 - 10:53:03 EST


Appended is the test suite.
After untarring it, please run the ./test command.

thanks

2014-04-07 22:50 GMT+08:00, ling.ma.program@xxxxxxxxx
<ling.ma.program@xxxxxxxxx>:
> From: Ling Ma <ling.ml@xxxxxxxxxxxxxxx>
>
> In this patch we reduce branch mispredictions by avoiding
> branch instructions, and we force the destination to be aligned
> using general 64-bit instructions.
> Below compared results shows we improve performance up to 1.8x
> (We modified the test suite from Ondra; it will be sent after this patch.)
>
> Bytes: ORG_TIME: NEW_TIME: ORG vs NEW:
> 7 0.51 0.48 1.06
> 16 0.55 0.38 1.44
> 18 0.61 0.44 1.38
> 21 0.62 0.47 1.31
> 25 0.64 0.45 1.42
> 30 0.65 0.45 1.44
> 36 0.66 0.44 1.50
> 38 0.67 0.46 1.45
> 62 0.70 0.44 1.59
> 75 0.71 0.44 1.61
> 85 0.73 0.46 1.58
> 120 0.78 0.44 1.77
> 193 0.81 0.46 1.76
> 245 0.84 0.52 1.61
> 256 0.83 0.45 1.84
> 356 0.86 0.55 1.56
> 601 0.98 0.65 1.50
> 958 1.14 0.81 1.40
> 1024 1.19 0.86 1.38
> 2048 1.69 1.34 1.26
> Signed-off-by: Ling Ma <ling.ml@xxxxxxxxxxxxxxx>
> ---
> arch/x86/include/asm/alternative-asm.h | 4 +-
> arch/x86/lib/memset_64.S | 172
> +++++++++++++++++++++------------
> 2 files changed, 110 insertions(+), 66 deletions(-)
>
> diff --git a/arch/x86/include/asm/alternative-asm.h
> b/arch/x86/include/asm/alternative-asm.h
> index 372231c..aaac545 100644
> --- a/arch/x86/include/asm/alternative-asm.h
> +++ b/arch/x86/include/asm/alternative-asm.h
> @@ -22,8 +22,8 @@
> .long \orig - .
> .long \alt - .
> .word \feature
> - .byte \orig_len
> - .byte \alt_len
> + .word \orig_len
> + .word \alt_len
> .endm
>
> #endif /* __ASSEMBLY__ */
> diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
> index 2dcb380..3eca27c 100644
> --- a/arch/x86/lib/memset_64.S
> +++ b/arch/x86/lib/memset_64.S
> @@ -59,77 +59,121 @@
> ENTRY(memset)
> ENTRY(__memset)
> CFI_STARTPROC
> - movq %rdi,%r10
> -
> - /* expand byte value */
> movzbl %sil,%ecx
> - movabs $0x0101010101010101,%rax
> - imulq %rcx,%rax
> -
> - /* align dst */
> - movl %edi,%r9d
> - andl $7,%r9d
> - jnz .Lbad_alignment
> - CFI_REMEMBER_STATE
> -.Lafter_bad_alignment:
> -
> - movq %rdx,%rcx
> - shrq $6,%rcx
> - jz .Lhandle_tail
> -
> + mov $0x0101010101010101,%rsi
> + imulq %rsi,%rcx
> + movq %rdi,%rax
> + lea (%rdi, %rdx), %r8
> + cmp $128, %rdx
> + ja .Lmore128bytes
> + cmp $64, %edx
> + jb .Lless_64bytes
> + /*
> + * Move data from 65 bytes to 128 bytes.
> + */
> + mov %rcx, 0x00(%rdi)
> + mov %rcx, 0x08(%rdi)
> + mov %rcx, 0x10(%rdi)
> + mov %rcx, 0x18(%rdi)
> + mov %rcx, 0x20(%rdi)
> + mov %rcx, 0x28(%rdi)
> + mov %rcx, 0x30(%rdi)
> + mov %rcx, 0x38(%rdi)
> + mov %rcx, -0x40(%r8)
> + mov %rcx, -0x38(%r8)
> + mov %rcx, -0x30(%r8)
> + mov %rcx, -0x28(%r8)
> + mov %rcx, -0x20(%r8)
> + mov %rcx, -0x18(%r8)
> + mov %rcx, -0x10(%r8)
> + mov %rcx, -0x08(%r8)
> + ret
> .p2align 4
> -.Lloop_64:
> - decq %rcx
> - movq %rax,(%rdi)
> - movq %rax,8(%rdi)
> - movq %rax,16(%rdi)
> - movq %rax,24(%rdi)
> - movq %rax,32(%rdi)
> - movq %rax,40(%rdi)
> - movq %rax,48(%rdi)
> - movq %rax,56(%rdi)
> - leaq 64(%rdi),%rdi
> - jnz .Lloop_64
> -
> - /* Handle tail in loops. The loops should be faster than hard
> - to predict jump tables. */
> +.Lless_64bytes:
> + cmp $32, %edx
> + jb .Lless_32bytes
> + /*
> + * Move data from 33 bytes to 64 bytes.
> + */
> + mov %rcx, 0x00(%rdi)
> + mov %rcx, 0x08(%rdi)
> + mov %rcx, 0x10(%rdi)
> + mov %rcx, 0x18(%rdi)
> + mov %rcx, -0x20(%r8)
> + mov %rcx, -0x18(%r8)
> + mov %rcx, -0x10(%r8)
> + mov %rcx, -0x08(%r8)
> + ret
> .p2align 4
> -.Lhandle_tail:
> - movl %edx,%ecx
> - andl $63&(~7),%ecx
> - jz .Lhandle_7
> - shrl $3,%ecx
> +.Lless_32bytes:
> + cmp $16, %edx
> + jb .Lless_16bytes
> + mov %rcx, 0x00(%rdi)
> + mov %rcx, 0x08(%rdi)
> + mov %rcx, -0x10(%r8)
> + mov %rcx, -0x08(%r8)
> + ret
> .p2align 4
> -.Lloop_8:
> - decl %ecx
> - movq %rax,(%rdi)
> - leaq 8(%rdi),%rdi
> - jnz .Lloop_8
> -
> -.Lhandle_7:
> - andl $7,%edx
> - jz .Lende
> +.Lless_16bytes:
> + cmp $8, %edx
> + jb .Lless_8bytes
> + mov %rcx, (%rdi)
> + mov %rcx, -0x08(%r8)
> + ret
> .p2align 4
> -.Lloop_1:
> - decl %edx
> - movb %al,(%rdi)
> - leaq 1(%rdi),%rdi
> - jnz .Lloop_1
> -
> -.Lende:
> - movq %r10,%rax
> +.Lless_8bytes:
> + cmp $4, %edx
> + jb .Lless_4bytes
> + mov %ecx, (%rdi)
> + mov %ecx, -0x04(%r8)
> + .p2align 4
> +.Lless_4bytes:
> + cmp $2, %edx
> + jb .Lless_2bytes
> + mov %cx, (%rdi)
> + mov %cx, -0x02(%r8)
> + ret
> + .p2align 4
> +.Lless_2bytes:
> + cmp $1, %edx
> + jb .Lless_1bytes
> + mov %cl, (%rdi)
> +.Lless_1bytes:
> ret
>
> - CFI_RESTORE_STATE
> -.Lbad_alignment:
> - cmpq $7,%rdx
> - jbe .Lhandle_7
> - movq %rax,(%rdi) /* unaligned store */
> - movq $8,%r8
> - subq %r9,%r8
> - addq %r8,%rdi
> - subq %r8,%rdx
> - jmp .Lafter_bad_alignment
> + .p2align 4
> +.Lmore128bytes:
> + mov %rcx, (%rdi)
> + mov %rdi, %r9
> + and $-0x08, %rdi
> + add $0x08, %rdi
> + sub %rdi, %r9
> + add %r9, %rdx
> + sub $0x40, %rdx
> +.Lgobble_64_loop:
> + mov %rcx, 0x00(%rdi)
> + mov %rcx, 0x08(%rdi)
> + mov %rcx, 0x10(%rdi)
> + mov %rcx, 0x18(%rdi)
> + mov %rcx, 0x20(%rdi)
> + mov %rcx, 0x28(%rdi)
> + mov %rcx, 0x30(%rdi)
> + mov %rcx, 0x38(%rdi)
> + lea 0x40(%rdi), %rdi
> + sub $0x40, %rdx
> + jae .Lgobble_64_loop
> + /*
> + * Move data from 0 bytes to 63 bytes.
> + */
> + mov %rcx, -0x40(%r8)
> + mov %rcx, -0x38(%r8)
> + mov %rcx, -0x30(%r8)
> + mov %rcx, -0x28(%r8)
> + mov %rcx, -0x20(%r8)
> + mov %rcx, -0x18(%r8)
> + mov %rcx, -0x10(%r8)
> + mov %rcx, -0x08(%r8)
> + ret
> .Lfinal:
> CFI_ENDPROC
> ENDPROC(memset)
> --
> 1.8.1.4
>
>

Attachment: memset_kernel.tar
Description: Unix tar archive