diff options
author | Anton Blanchard <anton@samba.org> | 2015-01-21 12:27:38 +1100 |
---|---|---|
committer | Michael Ellerman <mpe@ellerman.id.au> | 2015-01-23 14:02:55 +1100 |
commit | 15c2d45d17418cc4a712608c78ff3b5f0583d83b (patch) | |
tree | 53e4ee00f5e0b604ee7451ee6e229751043ae0f6 /arch/powerpc/lib/memcmp_64.S | |
parent | a113de373bcb7651196e29a49483c8e24e1e6aa9 (diff) | |
download | talos-obmc-linux-15c2d45d17418cc4a712608c78ff3b5f0583d83b.tar.gz talos-obmc-linux-15c2d45d17418cc4a712608c78ff3b5f0583d83b.zip |
powerpc: Add 64bit optimised memcmp
I noticed ksm spending quite a lot of time in memcmp on a large
KVM box. The current memcmp loop is very unoptimised - byte at a
time compares with no loop unrolling. We can do much much better.
Optimise the loop in a few ways:
- Unroll the byte at a time loop
- For large (at least 32 byte) comparisons that are also 8 byte
aligned, use an unrolled modulo scheduled loop using 8 byte
loads. This is similar to our glibc memcmp.
A simple microbenchmark testing 10000000 iterations of an 8192 byte
memcmp was used to measure the performance:
baseline: 29.93 s
modified: 1.70 s
Just over 17x faster.
v2: Incorporated some suggestions from Segher:
- Use andi. instead of rdlicl.
- Convert bdnzt eq, to bdnz. It's just duplicating the earlier compare
and was a relic from a previous version.
- Don't use cr5, we have plans to use that CR field for fast local
atomics.
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Diffstat (limited to 'arch/powerpc/lib/memcmp_64.S')
-rw-r--r-- | arch/powerpc/lib/memcmp_64.S | 233 |
1 files changed, 233 insertions, 0 deletions
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S new file mode 100644 index 000000000000..8953d2382a65 --- /dev/null +++ b/arch/powerpc/lib/memcmp_64.S @@ -0,0 +1,233 @@ +/* + * Author: Anton Blanchard <anton@au.ibm.com> + * Copyright 2015 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/ppc_asm.h> + +#define off8 r6 +#define off16 r7 +#define off24 r8 + +#define rA r9 +#define rB r10 +#define rC r11 +#define rD r27 +#define rE r28 +#define rF r29 +#define rG r30 +#define rH r31 + +#ifdef __LITTLE_ENDIAN__ +#define LD ldbrx +#else +#define LD ldx +#endif + +_GLOBAL(memcmp) + cmpdi cr1,r5,0 + + /* Use the short loop if both strings are not 8B aligned */ + or r6,r3,r4 + andi. r6,r6,7 + + /* Use the short loop if length is less than 32B */ + cmpdi cr6,r5,31 + + beq cr1,.Lzero + bne .Lshort + bgt cr6,.Llong + +.Lshort: + mtctr r5 + +1: lbz rA,0(r3) + lbz rB,0(r4) + subf. rC,rB,rA + bne .Lnon_zero + bdz .Lzero + + lbz rA,1(r3) + lbz rB,1(r4) + subf. rC,rB,rA + bne .Lnon_zero + bdz .Lzero + + lbz rA,2(r3) + lbz rB,2(r4) + subf. rC,rB,rA + bne .Lnon_zero + bdz .Lzero + + lbz rA,3(r3) + lbz rB,3(r4) + subf. rC,rB,rA + bne .Lnon_zero + + addi r3,r3,4 + addi r4,r4,4 + + bdnz 1b + +.Lzero: + li r3,0 + blr + +.Lnon_zero: + mr r3,rC + blr + +.Llong: + li off8,8 + li off16,16 + li off24,24 + + std r31,-8(r1) + std r30,-16(r1) + std r29,-24(r1) + std r28,-32(r1) + std r27,-40(r1) + + srdi r0,r5,5 + mtctr r0 + andi. r5,r5,31 + + LD rA,0,r3 + LD rB,0,r4 + + LD rC,off8,r3 + LD rD,off8,r4 + + LD rE,off16,r3 + LD rF,off16,r4 + + LD rG,off24,r3 + LD rH,off24,r4 + cmpld cr0,rA,rB + + addi r3,r3,32 + addi r4,r4,32 + + bdz .Lfirst32 + + LD rA,0,r3 + LD rB,0,r4 + cmpld cr1,rC,rD + + LD rC,off8,r3 + LD rD,off8,r4 + cmpld cr6,rE,rF + + LD rE,off16,r3 + LD rF,off16,r4 + cmpld cr7,rG,rH + bne cr0,.LcmpAB + + LD rG,off24,r3 + LD rH,off24,r4 + cmpld cr0,rA,rB + bne cr1,.LcmpCD + + addi r3,r3,32 + addi r4,r4,32 + + bdz .Lsecond32 + + .balign 16 + +1: LD rA,0,r3 + LD rB,0,r4 + cmpld cr1,rC,rD + bne cr6,.LcmpEF + + LD rC,off8,r3 + LD rD,off8,r4 + cmpld cr6,rE,rF + bne cr7,.LcmpGH + + LD rE,off16,r3 + LD rF,off16,r4 + cmpld cr7,rG,rH + bne cr0,.LcmpAB + + LD rG,off24,r3 + LD rH,off24,r4 + cmpld cr0,rA,rB + bne cr1,.LcmpCD + + addi r3,r3,32 + addi r4,r4,32 + + bdnz 1b + +.Lsecond32: + cmpld cr1,rC,rD + bne cr6,.LcmpEF + + cmpld cr6,rE,rF + bne cr7,.LcmpGH + + cmpld cr7,rG,rH + bne cr0,.LcmpAB + + bne cr1,.LcmpCD + bne cr6,.LcmpEF + bne cr7,.LcmpGH + +.Ltail: + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + + cmpdi r5,0 + beq .Lzero + b .Lshort + +.Lfirst32: + cmpld cr1,rC,rD + cmpld cr6,rE,rF + cmpld cr7,rG,rH + + bne cr0,.LcmpAB + bne cr1,.LcmpCD + bne cr6,.LcmpEF + bne cr7,.LcmpGH + + b .Ltail + +.LcmpAB: + li r3,1 + bgt cr0,.Lout + li r3,-1 + b .Lout + +.LcmpCD: + li r3,1 + bgt cr1,.Lout + li r3,-1 + b .Lout + +.LcmpEF: + li r3,1 + bgt cr6,.Lout + li r3,-1 + b .Lout + +.LcmpGH: + li r3,1 + bgt cr7,.Lout + li r3,-1 + +.Lout: + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + blr |