diff options
author | Michael Ellerman <michael@ellerman.id.au> | 2012-09-13 23:00:49 +0000 |
---|---|---|
committer | Benjamin Herrenschmidt <benh@kernel.crashing.org> | 2013-01-10 14:43:45 +1100 |
commit | 323a6bf1d6f4ec7907d9d8aacb4ae9590f755dda (patch) | |
tree | 90b75e02d5d9eb74b424155ae726eef5e7f16ff3 /arch/powerpc/crypto/sha1-powerpc-asm.S | |
parent | 5c49985c21bba4d2f899e3a97121868a5c58a876 (diff) | |
download | talos-op-linux-323a6bf1d6f4ec7907d9d8aacb4ae9590f755dda.tar.gz talos-op-linux-323a6bf1d6f4ec7907d9d8aacb4ae9590f755dda.zip |
powerpc: Add a powerpc implementation of SHA-1
This patch adds a crypto driver which provides a powerpc accelerated
implementation of SHA-1, accelerated in that it is written in asm.
Original patch by Paul, minor fixups for upstream by moi.
Lightly tested on 64-bit with the test program here:
http://michael.ellerman.id.au/files/junkcode/sha1test.c
Seems to work, and is "not slower" than the generic version.
Needs testing on 32-bit.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/crypto/sha1-powerpc-asm.S')
-rw-r--r-- | arch/powerpc/crypto/sha1-powerpc-asm.S | 179 |
1 files changed, 179 insertions, 0 deletions
diff --git a/arch/powerpc/crypto/sha1-powerpc-asm.S b/arch/powerpc/crypto/sha1-powerpc-asm.S new file mode 100644 index 000000000000..a5f8264d2d3c --- /dev/null +++ b/arch/powerpc/crypto/sha1-powerpc-asm.S @@ -0,0 +1,179 @@ +/* + * SHA-1 implementation for PowerPC. The single entry point, powerpc_sha_transform, runs the 80 steps of one SHA-1 compression: it reads the five 32-bit state words at 0..16(r3) and the sixteen 32-bit message words at 0..60(r4), and stores the updated state back at 0..16(r3). r0, r5, r6, r14 and r15 are scratch or constant registers, r7 - r12 hold the working variables and r16 - r31 hold the message schedule. The prologue and epilogue use SAVE_8GPRS/SAVE_10GPRS and REST_8GPRS/REST_10GPRS, which are defined outside this file; presumably they save and restore r14 - r21 and r22 - r31 in the stack frame - verify against ppc_asm.h. + * + * Copyright (C) 2005 Paul Mackerras <paulus@samba.org> + */ + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + +/* + * We roll the registers for T, A, B, C, D, E around on each + * iteration; T on iteration t is A on iteration t+1, and so on. + * We use registers 7 - 12 for this. Each macro maps a step number t to a register number: RE(t) is (t mod 6)+7 and RD, RC, RB, RA, RT follow it around the ring of six, so RT(t) is the same register as RA(t+1), RB(t+2), RC(t+3), RD(t+4) and RE(t+5). No register-to-register moves are needed between steps; the new T simply overwrites the register that held the E of five steps later. + */ +#define RT(t) ((((t)+5)%6)+7) +#define RA(t) ((((t)+4)%6)+7) +#define RB(t) ((((t)+3)%6)+7) +#define RC(t) ((((t)+2)%6)+7) +#define RD(t) ((((t)+1)%6)+7) +#define RE(t) ((((t)+0)%6)+7) + +/* We use registers 16 - 31 for the W values. W(t) is register (t mod 16)+16, so the sixteen registers form a ring in which W(t) and W(t-16) name the same register. LOADW(t) reads message word t from offset 4t of r4. NOTE(review): the message words are fetched with plain lwz, i.e. in the CPU's native byte order, while SHA-1 defines them as big-endian; this looks like it assumes a big-endian kernel - confirm. Each STEPD0/D1/D2 macro below performs one step with r15 holding the round constant K and r0, r6 as scratch (plus r14 in STEPD0_LOAD and r5 in the _UPDATE forms): it computes T = rotl(A,5) + f(B,C,D) + E + K + W(t) into RT(t) and rotates B left by 30 in place. f is (B and C) or (D and not B) for D0, B xor C xor D for D1, and the majority (B and C) or (B and D) or (C and D) for D2. STEPD0_LOAD additionally loads message word t+4 from r4 into W(t+4). The _UPDATE forms instead overwrite W(u), with u = t+4, by rotl(W(u-3) xor W(u-8) xor W(u-16) xor W(u-14), 1), producing the schedule word that will be consumed four steps later. STEPD1 does no schedule work. STEP0LD4 expands to four consecutive STEPD0_LOAD steps, STEPUP4 to four consecutive _UPDATE steps and STEPUP20 to twenty; their fn argument (D0, D1 or D2) is token-pasted into the step macro name. */ +#define W(t) (((t)%16)+16) + +#define LOADW(t) \ + lwz W(t),(t)*4(r4) + +#define STEPD0_LOAD(t) \ + andc r0,RD(t),RB(t); \ + and r6,RB(t),RC(t); \ + rotlwi RT(t),RA(t),5; \ + or r6,r6,r0; \ + add r0,RE(t),r15; \ + add RT(t),RT(t),r6; \ + add r14,r0,W(t); \ + lwz W((t)+4),((t)+4)*4(r4); \ + rotlwi RB(t),RB(t),30; \ + add RT(t),RT(t),r14 + +#define STEPD0_UPDATE(t) \ + and r6,RB(t),RC(t); \ + andc r0,RD(t),RB(t); \ + rotlwi RT(t),RA(t),5; \ + rotlwi RB(t),RB(t),30; \ + or r6,r6,r0; \ + add r0,RE(t),r15; \ + xor r5,W((t)+4-3),W((t)+4-8); \ + add RT(t),RT(t),r6; \ + xor W((t)+4),W((t)+4-16),W((t)+4-14); \ + add r0,r0,W(t); \ + xor W((t)+4),W((t)+4),r5; \ + add RT(t),RT(t),r0; \ + rotlwi W((t)+4),W((t)+4),1 + +#define STEPD1(t) \ + xor r6,RB(t),RC(t); \ + rotlwi RT(t),RA(t),5; \ + rotlwi RB(t),RB(t),30; \ + xor r6,r6,RD(t); \ + add r0,RE(t),r15; \ + add RT(t),RT(t),r6; \ + add r0,r0,W(t); \ + add RT(t),RT(t),r0 + +#define STEPD1_UPDATE(t) \ + xor r6,RB(t),RC(t); \ + rotlwi RT(t),RA(t),5; \ + rotlwi RB(t),RB(t),30; \ + xor r6,r6,RD(t); \ + add r0,RE(t),r15; \ + xor r5,W((t)+4-3),W((t)+4-8); \ + add RT(t),RT(t),r6; \ + xor W((t)+4),W((t)+4-16),W((t)+4-14); \ + add r0,r0,W(t); \ + 
xor W((t)+4),W((t)+4),r5; \ + add RT(t),RT(t),r0; \ + rotlwi W((t)+4),W((t)+4),1 + +#define STEPD2_UPDATE(t) \ + and r6,RB(t),RC(t); \ + and r0,RB(t),RD(t); \ + rotlwi RT(t),RA(t),5; \ + or r6,r6,r0; \ + rotlwi RB(t),RB(t),30; \ + and r0,RC(t),RD(t); \ + xor r5,W((t)+4-3),W((t)+4-8); \ + or r6,r6,r0; \ + xor W((t)+4),W((t)+4-16),W((t)+4-14); \ + add r0,RE(t),r15; \ + add RT(t),RT(t),r6; \ + add r0,r0,W(t); \ + xor W((t)+4),W((t)+4),r5; \ + add RT(t),RT(t),r0; \ + rotlwi W((t)+4),W((t)+4),1 + +#define STEP0LD4(t) \ + STEPD0_LOAD(t); \ + STEPD0_LOAD((t)+1); \ + STEPD0_LOAD((t)+2); \ + STEPD0_LOAD((t)+3) + +#define STEPUP4(t, fn) \ + STEP##fn##_UPDATE(t); \ + STEP##fn##_UPDATE((t)+1); \ + STEP##fn##_UPDATE((t)+2); \ + STEP##fn##_UPDATE((t)+3) + +#define STEPUP20(t, fn) \ + STEPUP4(t, fn); \ + STEPUP4((t)+4, fn); \ + STEPUP4((t)+8, fn); \ + STEPUP4((t)+12, fn); \ + STEPUP4((t)+16, fn) + +_GLOBAL(powerpc_sha_transform) + PPC_STLU r1,-STACKFRAMESIZE(r1) + SAVE_8GPRS(14, r1) + SAVE_10GPRS(22, r1) + + /* Load up A - E: the five state words at r3 go into the registers that RA(0)..RE(0) name for step 0; message words 0 - 3 are then preloaded because each STEPD0_LOAD step only fetches the word needed four steps ahead */ + lwz RA(0),0(r3) /* A */ + lwz RB(0),4(r3) /* B */ + lwz RC(0),8(r3) /* C */ + lwz RD(0),12(r3) /* D */ + lwz RE(0),16(r3) /* E */ + + LOADW(0) + LOADW(1) + LOADW(2) + LOADW(3) + + lis r15,0x5a82 /* K0-19: the constant 0x5a827999 is assembled in r15 from two 16-bit halves; steps 0 - 11 use the D0 function and also load message words 4 - 15, steps 12 - 19 use D0 and start extending the schedule */ + ori r15,r15,0x7999 + STEP0LD4(0) + STEP0LD4(4) + STEP0LD4(8) + STEPUP4(12, D0) + STEPUP4(16, D0) + + lis r15,0x6ed9 /* K20-39: constant 0x6ed9eba1, D1 (xor) function */ + ori r15,r15,0xeba1 + STEPUP20(20, D1) + + lis r15,0x8f1b /* K40-59: constant 0x8f1bbcdc, D2 (majority) function */ + ori r15,r15,0xbcdc + STEPUP20(40, D2) + + lis r15,0xca62 /* K60-79: constant 0xca62c1d6, D1 (xor) function again. Steps 60 - 75 still extend the schedule, the last word produced being W(79); steps 76 - 79 use plain STEPD1. The incoming E, D, C, B, A are reloaded from r3 into r20, r19, r18, r17, r16 alongside those last steps; these registers are free because W(76)..W(79) live in r28 - r31 and no further schedule words are computed. In the final additions the E sum is built in r20 and only moved into RE(0) at the end, because RE(0) is the same register as RA(80), which still holds the final A until that has been added. */ + ori r15,r15,0xc1d6 + STEPUP4(60, D1) + STEPUP4(64, D1) + STEPUP4(68, D1) + STEPUP4(72, D1) + lwz r20,16(r3) + STEPD1(76) + lwz r19,12(r3) + STEPD1(77) + lwz r18,8(r3) + STEPD1(78) + lwz r17,4(r3) + STEPD1(79) + + lwz r16,0(r3) + add r20,RE(80),r20 + add RD(0),RD(80),r19 + add RC(0),RC(80),r18 + add RB(0),RB(80),r17 + add RA(0),RA(80),r16 + mr RE(0),r20 + stw RA(0),0(r3) + stw RB(0),4(r3) + stw RC(0),8(r3) + stw RD(0),12(r3) + stw RE(0),16(r3) + + REST_8GPRS(14, r1) + 
REST_10GPRS(22, r1) + addi r1,r1,STACKFRAMESIZE + blr |