diff options
author | Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 2011-09-02 01:45:22 +0300 |
---|---|---|
committer | Herbert Xu <herbert@gondor.apana.org.au> | 2011-09-22 21:25:26 +1000 |
commit | 64b94ceae8c16cd1b2800cac83112d3815be5250 (patch) | |
tree | c7e3384659522cac32dc85a34e4ed722346a0f91 /arch/x86/crypto/blowfish-x86_64-asm_64.S | |
parent | 7d47b86cfef808c6580b7603c3f17fcaf27e9d14 (diff) | |
download | blackbird-op-linux-64b94ceae8c16cd1b2800cac83112d3815be5250.tar.gz blackbird-op-linux-64b94ceae8c16cd1b2800cac83112d3815be5250.zip |
crypto: blowfish - add x86_64 assembly implementation
This patch adds an x86_64 assembly implementation of blowfish. Two sets of assembler
functions are provided. The first set is the regular 'one block at a time'
encrypt/decrypt functions. The second is 'four blocks at a time' functions that
gain a performance increase on out-of-order CPUs. On in-order CPUs, the performance
of the 4-way functions should be equal to that of the 1-way functions.
Summary of the tcrypt benchmarks:
Blowfish assembler vs blowfish C (256bit 8kb block ECB)
encrypt: 2.2x speed
decrypt: 2.3x speed
Blowfish assembler vs blowfish C (256bit 8kb block CBC)
encrypt: 1.12x speed
decrypt: 2.5x speed
Blowfish assembler vs blowfish C (256bit 8kb block CTR)
encrypt: 2.5x speed
Full output:
http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-blowfish-asm-x86_64.txt
http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-blowfish-c-x86_64.txt
Tests were run on:
vendor_id : AuthenticAMD
cpu family : 16
model : 10
model name : AMD Phenom(tm) II X6 1055T Processor
stepping : 0
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/blowfish-x86_64-asm_64.S')
-rw-r--r-- | arch/x86/crypto/blowfish-x86_64-asm_64.S | 392 |
1 files changed, 392 insertions, 0 deletions
/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

/*
 * Syntax: GNU as, AT&T operand order, run through the C preprocessor.
 * All four entry points take their arguments in %rdi (ctx), %rsi (dst),
 * %rdx (src) and, for the encrypt functions, %rcx (xor flag).
 *
 * Only C-style comments are used in this file.
 */

.file "blowfish-x86_64-asm_64.S"
.text

/*
 * Byte offsets into the crypto context: 16 + 2 = 18 round-key words
 * (32 bits each) at offset 0, followed by four S-boxes of 256 32-bit
 * words each.
 *
 * NOTE: p and s0..s3 are plain preprocessor names; do not use them as
 * bare tokens anywhere else in this file.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)

/* register macros */
#define CTX	%rdi		/* context pointer, never modified */
#define RIO	%rsi		/* current I/O pointer (src, later dst) */

/*
 * Block registers.  F() pastes "bl"/"bh"/"d" suffixes onto these names,
 * so only registers with an addressable second byte can be used.
 */
#define RX0	%rax
#define RX1	%rbx
#define RX2	%rcx
#define RX3	%rdx

#define RX0d	%eax
#define RX1d	%ebx
#define RX2d	%ecx
#define RX3d	%edx

#define RX0bl	%al
#define RX1bl	%bl
#define RX2bl	%cl
#define RX3bl	%dl

#define RX0bh	%ah
#define RX1bh	%bh
#define RX2bh	%ch
#define RX3bh	%dh

/*
 * S-box index temporaries.  RT1 is the same register as RIO, so RIO is
 * destroyed by every F(); the functions below keep dst on the stack.
 */
#define RT0	%rbp
#define RT1	%rsi

#define RT0d	%ebp
#define RT1d	%esi

/* per-block accumulators for the F() result */
#define RK0	%r8
#define RK1	%r9
#define RK2	%r10
#define RK3	%r11

#define RK0d	%r8d
#define RK1d	%r9d
#define RK2d	%r10d
#define RK3d	%r11d

/* preloaded 64-bit round-key pair (4-way code only) */
#define RKEY	%r12

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(x, k): one Feistel step on the 64-bit block register x.
 *
 * With a, b, c, d the bytes of the low 32 bits of x (a most significant):
 *	k = ((S0[a] + S1[b]) ^ S2[c]) + S3[d]		(32-bit arithmetic)
 * Then the two 32-bit halves of x are swapped and k is XORed into the
 * new low half.  The 32-bit writes to k zero its upper half, so the
 * final 64-bit xor leaves the high half of x untouched.
 *
 * x must be one of RX0..RX3 and k one of RK0..RK3 (suffix pasting).
 * Clobbers: k, RT0, RT1 (= RIO), flags.
 */
#define F(x, k) \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT1d; \
	rolq $16,		x; \
	movl s0(CTX,RT0,4),	k ## d; \
	addl s1(CTX,RT1,4),	k ## d; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT1d; \
	rolq $32,		x; \
	xorl s2(CTX,RT0,4),	k ## d; \
	addl s3(CTX,RT1,4),	k ## d; \
	xorq k,			x;

/* XOR round-key words n (low half) and n+1 (high half) into RX0. */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/* Two encryption rounds: add key pair n/n+1, then apply F() twice. */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(RX0, RK0); \
	F(RX0, RK0);

/* Same operation as add_roundkey_enc(); not used by the code in this file. */
#define round_final_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/*
 * XOR round-key words n (low half) and n-1 (high half) into RX0.
 * The pair is loaded from word n-1 and rotated by 32 bits to get that
 * order.  Clobbers RT0.
 */
#define add_roundkey_dec(n) \
	movq p+4*(n-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/* Two decryption rounds: add key pair n/n-1, then apply F() twice. */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(RX0, RK0); \
	F(RX0, RK0);

/*
 * Load the 8-byte block at (RIO) into RX0 so that the low half holds
 * bytes 0..3 and the high half bytes 4..7, each read as a big-endian
 * 32-bit word.
 */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;

/*
 * Store RX0 to (RIO): the high half goes to bytes 0..3 and the low half
 * to bytes 4..7, each as a big-endian 32-bit word.  Modifies RX0.
 */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);

/* Like write_block(), but XORs the result into the 8 bytes at (RIO). */
#define xor_block() \
	bswapq			RX0; \
	xorq RX0,		(RIO);

/*
 * __blowfish_enc_blk: encrypt one 8-byte block.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src
 *	%rcx: bool xor; if the low byte is non-zero the result is XORed
 *	      into dst, otherwise it overwrites dst
 *
 * %rbp and %rbx are saved and restored.
 * Clobbers: %rax, %rsi, %r8, flags.
 */
.align 8
.global __blowfish_enc_blk
.type __blowfish_enc_blk,@function

__blowfish_enc_blk:
	pushq %rbp
	pushq %rbx

	pushq %rsi			/* dst; %rsi doubles as RT1 in F() */
	pushq %rcx			/* xor flag */
	movq %rdx, RIO			/* RIO = src */

	read_block()

	round_enc(0)
	round_enc(2)
	round_enc(4)
	round_enc(6)
	round_enc(8)
	round_enc(10)
	round_enc(12)
	round_enc(14)
	add_roundkey_enc(16)

	popq %rbp			/* xor flag */
	popq RIO			/* RIO = dst */

	testb %bpl, %bpl
	jnz .L__enc_xor

	write_block()

.L__enc_ret:
	popq %rbx
	popq %rbp

	ret

.L__enc_xor:
	xor_block()

	jmp .L__enc_ret

.size __blowfish_enc_blk, . - __blowfish_enc_blk

/*
 * blowfish_dec_blk: decrypt one 8-byte block.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src
 *
 * %rbp and %rbx are saved and restored.
 * Clobbers: %rax, %rsi, %r8, flags.
 */
.align 8
.global blowfish_dec_blk
.type blowfish_dec_blk,@function

blowfish_dec_blk:
	pushq %rbp
	pushq %rbx

	pushq %rsi			/* dst; %rsi doubles as RT1 in F() */
	movq %rdx, RIO			/* RIO = src */

	read_block()

	round_dec(17)
	round_dec(15)
	round_dec(13)
	round_dec(11)
	round_dec(9)
	round_dec(7)
	round_dec(5)
	round_dec(3)
	add_roundkey_dec(1)

	popq RIO			/* RIO = dst */
	write_block()

	popq %rbx
	popq %rbp

	ret

.size blowfish_dec_blk, . - blowfish_dec_blk

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/* XOR the round-key pair held in RKEY into all four block registers. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* RKEY = round-key words n (low half) and n+1 (high half). */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/*
 * Apply the key pair already in RKEY, then fetch the pair for the next
 * double round.  The argument n only selects the pair to preload (n + 2).
 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc(n + 2);

/*
 * Two encryption rounds on four blocks.  The F() invocations of the
 * four independent blocks are interleaved.
 */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F(RX0, RK0); \
	F(RX1, RK1); \
	F(RX2, RK2); \
	F(RX3, RK3); \
	\
	F(RX0, RK0); \
	F(RX1, RK1); \
	F(RX2, RK2); \
	F(RX3, RK3);

/* RKEY = round-key words n (low half) and n-1 (high half). */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/*
 * Apply the key pair already in RKEY, then fetch the pair for the next
 * double round.  The argument n only selects the pair to preload (n - 2).
 */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec(n - 2);

/* Two decryption rounds on four blocks, interleaved like round_enc4(). */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F(RX0, RK0); \
	F(RX1, RK1); \
	F(RX2, RK2); \
	F(RX3, RK3); \
	\
	F(RX0, RK0); \
	F(RX1, RK1); \
	F(RX2, RK2); \
	F(RX3, RK3);

/*
 * Load four consecutive 8-byte blocks from (RIO) into RX0..RX3, each
 * converted the same way as in read_block().
 */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq			RX3;

/* Store RX0..RX3 to four consecutive 8-byte blocks at (RIO). */
#define write_block4() \
	bswapq			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	movq RX3,		24(RIO);

/* XOR RX0..RX3 into four consecutive 8-byte blocks at (RIO). */
#define xor_block4() \
	bswapq			RX0; \
	xorq RX0,		(RIO); \
	\
	bswapq			RX1; \
	xorq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	xorq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	xorq RX3,		24(RIO);

/*
 * __blowfish_enc_blk_4way: encrypt four consecutive 8-byte blocks
 * (32 bytes read from src, 32 bytes written to or XORed into dst).
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src
 *	%rcx: bool xor; if the low byte is non-zero the results are XORed
 *	      into dst, otherwise they overwrite dst
 *
 * %rbp, %rbx and %r12 are saved and restored.
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.
 */
.align 8
.global __blowfish_enc_blk_4way
.type __blowfish_enc_blk_4way,@function

__blowfish_enc_blk_4way:
	pushq %rbp
	pushq %rbx
	pushq RKEY
	preload_roundkey_enc(0)

	pushq %rsi			/* dst; %rsi doubles as RT1 in F() */
	pushq %rcx			/* xor flag; %rcx becomes RX2 */
	movq %rdx, RIO			/* RIO = src; %rdx becomes RX3 */

	read_block4()

	round_enc4(0)
	round_enc4(2)
	round_enc4(4)
	round_enc4(6)
	round_enc4(8)
	round_enc4(10)
	round_enc4(12)
	round_enc4(14)
	add_preloaded_roundkey4()	/* key pair 16/17, preloaded above */

	popq %rbp			/* xor flag */
	popq RIO			/* RIO = dst */

	testb %bpl, %bpl
	jnz .L__enc_xor4

	write_block4()

.L__enc_ret4:
	popq RKEY
	popq %rbx
	popq %rbp

	ret

.L__enc_xor4:
	xor_block4()

	jmp .L__enc_ret4

.size __blowfish_enc_blk_4way, . - __blowfish_enc_blk_4way

/*
 * blowfish_dec_blk_4way: decrypt four consecutive 8-byte blocks
 * (32 bytes read from src, 32 bytes written to dst).
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src
 *
 * %rbp, %rbx and %r12 are saved and restored.
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.
 */
.align 8
.global blowfish_dec_blk_4way
.type blowfish_dec_blk_4way,@function

blowfish_dec_blk_4way:
	pushq %rbp
	pushq %rbx
	pushq RKEY
	preload_roundkey_dec(17)

	pushq %rsi			/* dst; %rsi doubles as RT1 in F() */
	movq %rdx, RIO			/* RIO = src; %rdx becomes RX3 */

	read_block4()

	round_dec4(17)
	round_dec4(15)
	round_dec4(13)
	round_dec4(11)
	round_dec4(9)
	round_dec4(7)
	round_dec4(5)
	round_dec4(3)
	add_preloaded_roundkey4()	/* key pair 1/0, preloaded above */

	popq RIO			/* RIO = dst */
	write_block4()

	popq RKEY
	popq %rbx
	popq %rbp

	ret

.size blowfish_dec_blk_4way, . - blowfish_dec_blk_4way