author    | Andi Kleen <ak@suse.de> | 2006-02-03 21:51:02 +0100
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-02-04 16:43:13 -0800
commit    | 7bcd3f34e262bbebffa954d80eab3a84f053da31
tree      | f0765da9eaa8024a2b1d67d3e43730cb32f99fa7 /arch/x86_64/lib/memset.S
parent    | 6bca52b544489b626c7d0db801df6b4aa3d5adb5
[PATCH] x86_64: Undo the earlier changes to remove unrolled copy/memset functions
They cause quite bad performance regressions on Netburst.
This is temporary until we can get new optimized functions
for these CPUs.
This undoes changes that were done in 2.6.15 and in 2.6.16-rc1,
essentially bringing the code back to the 2.6.14 level. The only change
is that I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD,
fixed the check for the flag, and fixed some comments.
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/x86_64/lib/memset.S')
-rw-r--r-- | arch/x86_64/lib/memset.S | 94
1 file changed, 94 insertions, 0 deletions
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 2aa48f24ed1e..ad397f2c7de8 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -13,6 +13,98 @@
 	.p2align 4
 memset:
 __memset:
+	movq %rdi,%r10
+	movq %rdx,%r11
+
+	/* expand byte value */
+	movzbl %sil,%ecx
+	movabs $0x0101010101010101,%rax
+	mul %rcx		/* with rax, clobbers rdx */
+
+	/* align dst */
+	movl %edi,%r9d
+	andl $7,%r9d
+	jnz .Lbad_alignment
+.Lafter_bad_alignment:
+
+	movl %r11d,%ecx
+	shrl $6,%ecx
+	jz .Lhandle_tail
+
+	.p2align 4
+.Lloop_64:
+	decl %ecx
+	movq %rax,(%rdi)
+	movq %rax,8(%rdi)
+	movq %rax,16(%rdi)
+	movq %rax,24(%rdi)
+	movq %rax,32(%rdi)
+	movq %rax,40(%rdi)
+	movq %rax,48(%rdi)
+	movq %rax,56(%rdi)
+	leaq 64(%rdi),%rdi
+	jnz .Lloop_64
+
+	/* Handle tail in loops. The loops should be faster than hard
+	   to predict jump tables. */
+	.p2align 4
+.Lhandle_tail:
+	movl %r11d,%ecx
+	andl $63&(~7),%ecx
+	jz .Lhandle_7
+	shrl $3,%ecx
+	.p2align 4
+.Lloop_8:
+	decl %ecx
+	movq %rax,(%rdi)
+	leaq 8(%rdi),%rdi
+	jnz .Lloop_8
+
+.Lhandle_7:
+	movl %r11d,%ecx
+	andl $7,%ecx
+	jz .Lende
+	.p2align 4
+.Lloop_1:
+	decl %ecx
+	movb %al,(%rdi)
+	leaq 1(%rdi),%rdi
+	jnz .Lloop_1
+
+.Lende:
+	movq %r10,%rax
+	ret
+
+.Lbad_alignment:
+	cmpq $7,%r11
+	jbe .Lhandle_7
+	movq %rax,(%rdi)	/* unaligned store */
+	movq $8,%r8
+	subq %r9,%r8
+	addq %r8,%rdi
+	subq %r8,%r11
+	jmp .Lafter_bad_alignment
+
+	/* Some CPUs run faster using the string instructions.
+	   It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+	.section .altinstructions,"a"
+	.align 8
+	.quad memset
+	.quad memset_c
+	.byte X86_FEATURE_REP_GOOD
+	.byte memset_c_end-memset_c
+	.byte memset_c_end-memset_c
+	.previous
+
+	.section .altinstr_replacement,"ax"
+ /* rdi destination
+  * rsi value
+  * rdx count
+  */
+memset_c:
 	movq %rdi,%r9
 	movl %edx,%r8d
 	andl $7,%r8d
@@ -29,3 +121,5 @@ __memset:
 	stosb
 	movq %r9,%rax
 	ret
+memset_c_end:
+	.previous
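
For reference, the unrolled path that this change restores can be sketched in C roughly as follows. This is an illustration only, not kernel code: the function name memset_unrolled is hypothetical, the sketch assumes an 8-byte-aligned destination (the real assembly fixes up an unaligned head at .Lbad_alignment), and on CPUs with X86_FEATURE_REP_GOOD the .altinstructions entry above patches in the rep-string variant memset_c instead.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical C rendering of the restored unrolled memset.
 * Assumes dst is 8-byte aligned; the real assembly handles an
 * unaligned head before entering .Lloop_64. */
static void *memset_unrolled(void *dst, int c, size_t len)
{
	/* Expand the byte into all eight lanes of a 64-bit word,
	 * the same effect as movabs $0x0101010101010101 + mul. */
	uint64_t pattern = (uint64_t)(unsigned char)c * 0x0101010101010101ULL;
	unsigned char *p = dst;

	/* .Lloop_64: eight 8-byte stores per iteration. */
	while (len >= 64) {
		uint64_t *q = (uint64_t *)p;
		q[0] = pattern; q[1] = pattern; q[2] = pattern; q[3] = pattern;
		q[4] = pattern; q[5] = pattern; q[6] = pattern; q[7] = pattern;
		p += 64;
		len -= 64;
	}

	/* .Lloop_8 / .Lloop_1: tail handled in plain loops,
	 * not a jump table. */
	while (len >= 8) {
		*(uint64_t *)p = pattern;
		p += 8;
		len -= 8;
	}
	while (len--)
		*p++ = (unsigned char)c;

	return dst;	/* like the asm, return the original destination */
}

As the comment in the assembly notes, the tail is deliberately handled with small loops rather than a hard-to-predict jump table.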