diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-07-05 12:22:23 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-07-05 12:22:23 -0700 |
commit | 8ad06e56dcbc1984ef0ff8f6e3c19982c5809f73 (patch) | |
tree | 74bc746a4f18eeddfc085b76c776ddcf2c798cfa /arch/x86/crypto | |
parent | 59005b0c59a164101b0273e4bda212c809dc2246 (diff) | |
parent | 035f901eac4d2d0fd40f3055026355d55d46949f (diff) | |
download | talos-obmc-linux-8ad06e56dcbc1984ef0ff8f6e3c19982c5809f73.tar.gz talos-obmc-linux-8ad06e56dcbc1984ef0ff8f6e3c19982c5809f73.zip |
Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu:
"Algorithms:
- add private key generation to ecdh
Drivers:
- add generic gcm(aes) to aesni-intel
- add SafeXcel EIP197 crypto engine driver
- add ecb(aes), cfb(aes) and ecb(des3_ede) to cavium
- add support for CNN55XX adapters in cavium
- add ctr mode to chcr
- add support for gcm(aes) to omap"
* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (140 commits)
crypto: testmgr - Reenable sha1/aes in FIPS mode
crypto: ccp - Release locks before returning
crypto: cavium/nitrox - dma_mapping_error() returns bool
crypto: doc - fix typo in docs
Documentation/bindings: Document the SafeXel cryptographic engine driver
crypto: caam - fix gfp allocation flags (part II)
crypto: caam - fix gfp allocation flags (part I)
crypto: drbg - Fixes panic in wait_for_completion call
crypto: caam - make of_device_ids const.
crypto: vmx - remove unnecessary check
crypto: n2 - make of_device_ids const
crypto: inside-secure - use the base_end pointer in ring rollback
crypto: inside-secure - increase the batch size
crypto: inside-secure - only dequeue when needed
crypto: inside-secure - get the backlog before dequeueing the request
crypto: inside-secure - stop requeueing failed requests
crypto: inside-secure - use one queue per hw ring
crypto: inside-secure - update the context and request later
crypto: inside-secure - align the cipher and hash send functions
crypto: inside-secure - optimize DSE bufferability control
...
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r-- | arch/x86/crypto/aes-x86_64-asm_64.S | 47 | ||||
-rw-r--r-- | arch/x86/crypto/aesni-intel_asm.S | 231 | ||||
-rw-r--r-- | arch/x86/crypto/aesni-intel_avx-x86_64.S | 283 | ||||
-rw-r--r-- | arch/x86/crypto/aesni-intel_glue.c | 208 | ||||
-rw-r--r-- | arch/x86/crypto/glue_helper.c | 3 | ||||
-rw-r--r-- | arch/x86/crypto/sha512-mb/sha512_mb.c | 7 |
6 files changed, 565 insertions, 214 deletions
diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S index 910565547163..8739cf7795de 100644 --- a/arch/x86/crypto/aes-x86_64-asm_64.S +++ b/arch/x86/crypto/aes-x86_64-asm_64.S @@ -42,17 +42,15 @@ #define R5E %esi #define R6 %rdi #define R6E %edi -#define R7 %rbp -#define R7E %ebp +#define R7 %r9 /* don't use %rbp; it breaks stack traces */ +#define R7E %r9d #define R8 %r8 -#define R9 %r9 #define R10 %r10 #define R11 %r11 -#define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \ +#define prologue(FUNC,KEY,B128,B192,r1,r2,r5,r6,r7,r8,r9,r10,r11) \ ENTRY(FUNC); \ movq r1,r2; \ - movq r3,r4; \ leaq KEY+48(r8),r9; \ movq r10,r11; \ movl (r7),r5 ## E; \ @@ -70,9 +68,8 @@ je B192; \ leaq 32(r9),r9; -#define epilogue(FUNC,r1,r2,r3,r4,r5,r6,r7,r8,r9) \ +#define epilogue(FUNC,r1,r2,r5,r6,r7,r8,r9) \ movq r1,r2; \ - movq r3,r4; \ movl r5 ## E,(r9); \ movl r6 ## E,4(r9); \ movl r7 ## E,8(r9); \ @@ -88,12 +85,12 @@ movl TAB(,r6,4),r6 ## E; \ roll $16,r2 ## E; \ shrl $16,r4 ## E; \ - movzbl r4 ## H,r7 ## E; \ - movzbl r4 ## L,r4 ## E; \ + movzbl r4 ## L,r7 ## E; \ + movzbl r4 ## H,r4 ## E; \ xorl OFFSET(r8),ra ## E; \ xorl OFFSET+4(r8),rb ## E; \ - xorl TAB+3072(,r7,4),r5 ## E;\ - xorl TAB+2048(,r4,4),r6 ## E;\ + xorl TAB+3072(,r4,4),r5 ## E;\ + xorl TAB+2048(,r7,4),r6 ## E;\ movzbl r1 ## L,r7 ## E; \ movzbl r1 ## H,r4 ## E; \ movl TAB+1024(,r4,4),r4 ## E;\ @@ -101,19 +98,19 @@ roll $16,r1 ## E; \ shrl $16,r3 ## E; \ xorl TAB(,r7,4),r5 ## E; \ - movzbl r3 ## H,r7 ## E; \ - movzbl r3 ## L,r3 ## E; \ - xorl TAB+3072(,r7,4),r4 ## E;\ - xorl TAB+2048(,r3,4),r5 ## E;\ - movzbl r1 ## H,r7 ## E; \ - movzbl r1 ## L,r3 ## E; \ + movzbl r3 ## L,r7 ## E; \ + movzbl r3 ## H,r3 ## E; \ + xorl TAB+3072(,r3,4),r4 ## E;\ + xorl TAB+2048(,r7,4),r5 ## E;\ + movzbl r1 ## L,r7 ## E; \ + movzbl r1 ## H,r3 ## E; \ shrl $16,r1 ## E; \ - xorl TAB+3072(,r7,4),r6 ## E;\ - movl TAB+2048(,r3,4),r3 ## E;\ - movzbl r1 ## H,r7 ## E; \ - movzbl r1 ## L,r1 ## E; \ - xorl TAB+1024(,r7,4),r6 ## E;\ - xorl TAB(,r1,4),r3 ## E; \ + xorl TAB+3072(,r3,4),r6 ## E;\ + movl TAB+2048(,r7,4),r3 ## E;\ + movzbl r1 ## L,r7 ## E; \ + movzbl r1 ## H,r1 ## E; \ + xorl TAB+1024(,r1,4),r6 ## E;\ + xorl TAB(,r7,4),r3 ## E; \ movzbl r2 ## H,r1 ## E; \ movzbl r2 ## L,r7 ## E; \ shrl $16,r2 ## E; \ @@ -131,9 +128,9 @@ movl r4 ## E,r2 ## E; #define entry(FUNC,KEY,B128,B192) \ - prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11) + prologue(FUNC,KEY,B128,B192,R2,R8,R1,R3,R4,R6,R10,R5,R11) -#define return(FUNC) epilogue(FUNC,R8,R2,R9,R7,R5,R6,R3,R4,R11) +#define return(FUNC) epilogue(FUNC,R8,R2,R5,R6,R3,R4,R11) #define encrypt_round(TAB,OFFSET) \ round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \ diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 3c465184ff8a..16627fec80b2 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -89,6 +89,29 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 ALL_F: .octa 0xffffffffffffffffffffffffffffffff .octa 0x00000000000000000000000000000000 +.section .rodata +.align 16 +.type aad_shift_arr, @object +.size aad_shift_arr, 272 +aad_shift_arr: + .octa 0xffffffffffffffffffffffffffffffff + .octa 0xffffffffffffffffffffffffffffff0C + .octa 0xffffffffffffffffffffffffffff0D0C + .octa 0xffffffffffffffffffffffffff0E0D0C + .octa 0xffffffffffffffffffffffff0F0E0D0C + .octa 0xffffffffffffffffffffff0C0B0A0908 + .octa 0xffffffffffffffffffff0D0C0B0A0908 + .octa 0xffffffffffffffffff0E0D0C0B0A0908 + .octa 0xffffffffffffffff0F0E0D0C0B0A0908 + .octa 0xffffffffffffff0C0B0A090807060504 + .octa 0xffffffffffff0D0C0B0A090807060504 + .octa 0xffffffffff0E0D0C0B0A090807060504 + .octa 0xffffffff0F0E0D0C0B0A090807060504 + .octa 0xffffff0C0B0A09080706050403020100 + .octa 0xffff0D0C0B0A09080706050403020100 + .octa 0xff0E0D0C0B0A09080706050403020100 + .octa 0x0F0E0D0C0B0A09080706050403020100 + .text @@ -252,32 +275,66 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation mov arg8, %r12 # %r12 = aadLen mov %r12, %r11 pxor %xmm\i, %xmm\i + pxor \XMM2, \XMM2 -_get_AAD_loop\num_initial_blocks\operation: - movd (%r10), \TMP1 - pslldq $12, \TMP1 - psrldq $4, %xmm\i + cmp $16, %r11 + jl _get_AAD_rest8\num_initial_blocks\operation +_get_AAD_blocks\num_initial_blocks\operation: + movdqu (%r10), %xmm\i + PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + pxor %xmm\i, \XMM2 + GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\num_initial_blocks\operation + + movdqu \XMM2, %xmm\i + cmp $0, %r11 + je _get_AAD_done\num_initial_blocks\operation + + pxor %xmm\i,%xmm\i + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some CT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\num_initial_blocks\operation: + cmp $4, %r11 + jle _get_AAD_rest4\num_initial_blocks\operation + movq (%r10), \TMP1 + add $8, %r10 + sub $8, %r11 + pslldq $8, \TMP1 + psrldq $8, %xmm\i pxor \TMP1, %xmm\i + jmp _get_AAD_rest8\num_initial_blocks\operation +_get_AAD_rest4\num_initial_blocks\operation: + cmp $0, %r11 + jle _get_AAD_rest0\num_initial_blocks\operation + mov (%r10), %eax + movq %rax, \TMP1 add $4, %r10 - sub $4, %r12 - jne _get_AAD_loop\num_initial_blocks\operation - - cmp $16, %r11 - je _get_AAD_loop2_done\num_initial_blocks\operation - - mov $16, %r12 -_get_AAD_loop2\num_initial_blocks\operation: + sub $4, %r10 + pslldq $12, \TMP1 psrldq $4, %xmm\i - sub $4, %r12 - cmp %r11, %r12 - jne _get_AAD_loop2\num_initial_blocks\operation - -_get_AAD_loop2_done\num_initial_blocks\operation: + pxor \TMP1, %xmm\i +_get_AAD_rest0\num_initial_blocks\operation: + /* finalize: shift out the extra bytes we read, and align + left. since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \TMP1 + PSHUFB_XMM \TMP1, %xmm\i +_get_AAD_rest_final\num_initial_blocks\operation: PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + pxor \XMM2, %xmm\i + GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +_get_AAD_done\num_initial_blocks\operation: xor %r11, %r11 # initialise the data pointer offset as zero - - # start AES for num_initial_blocks blocks + # start AES for num_initial_blocks blocks mov %arg5, %rax # %rax = *Y0 movdqu (%rax), \XMM0 # XMM0 = Y0 @@ -322,7 +379,7 @@ aes_loop_initial_dec\num_initial_blocks: # prepare plaintext/ciphertext for GHASH computation .endr .endif - GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + # apply GHASH on num_initial_blocks blocks .if \i == 5 @@ -477,28 +534,66 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation mov arg8, %r12 # %r12 = aadLen mov %r12, %r11 pxor %xmm\i, %xmm\i -_get_AAD_loop\num_initial_blocks\operation: - movd (%r10), \TMP1 - pslldq $12, \TMP1 - psrldq $4, %xmm\i + pxor \XMM2, \XMM2 + + cmp $16, %r11 + jl _get_AAD_rest8\num_initial_blocks\operation +_get_AAD_blocks\num_initial_blocks\operation: + movdqu (%r10), %xmm\i + PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + pxor %xmm\i, \XMM2 + GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\num_initial_blocks\operation + + movdqu \XMM2, %xmm\i + cmp $0, %r11 + je _get_AAD_done\num_initial_blocks\operation + + pxor %xmm\i,%xmm\i + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some PT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\num_initial_blocks\operation: + cmp $4, %r11 + jle _get_AAD_rest4\num_initial_blocks\operation + movq (%r10), \TMP1 + add $8, %r10 + sub $8, %r11 + pslldq $8, \TMP1 + psrldq $8, %xmm\i pxor \TMP1, %xmm\i + jmp _get_AAD_rest8\num_initial_blocks\operation +_get_AAD_rest4\num_initial_blocks\operation: + cmp $0, %r11 + jle _get_AAD_rest0\num_initial_blocks\operation + mov (%r10), %eax + movq %rax, \TMP1 add $4, %r10 - sub $4, %r12 - jne _get_AAD_loop\num_initial_blocks\operation - cmp $16, %r11 - je _get_AAD_loop2_done\num_initial_blocks\operation - mov $16, %r12 -_get_AAD_loop2\num_initial_blocks\operation: + sub $4, %r10 + pslldq $12, \TMP1 psrldq $4, %xmm\i - sub $4, %r12 - cmp %r11, %r12 - jne _get_AAD_loop2\num_initial_blocks\operation -_get_AAD_loop2_done\num_initial_blocks\operation: + pxor \TMP1, %xmm\i +_get_AAD_rest0\num_initial_blocks\operation: + /* finalize: shift out the extra bytes we read, and align + left. since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \TMP1 + PSHUFB_XMM \TMP1, %xmm\i +_get_AAD_rest_final\num_initial_blocks\operation: PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + pxor \XMM2, %xmm\i + GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +_get_AAD_done\num_initial_blocks\operation: xor %r11, %r11 # initialise the data pointer offset as zero - - # start AES for num_initial_blocks blocks + # start AES for num_initial_blocks blocks mov %arg5, %rax # %rax = *Y0 movdqu (%rax), \XMM0 # XMM0 = Y0 @@ -543,7 +638,7 @@ aes_loop_initial_enc\num_initial_blocks: # prepare plaintext/ciphertext for GHASH computation .endr .endif - GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + # apply GHASH on num_initial_blocks blocks .if \i == 5 @@ -1454,18 +1549,35 @@ _return_T_decrypt: mov arg10, %r11 # %r11 = auth_tag_len cmp $16, %r11 je _T_16_decrypt - cmp $12, %r11 - je _T_12_decrypt + cmp $8, %r11 + jl _T_4_decrypt _T_8_decrypt: MOVQ_R64_XMM %xmm0, %rax mov %rax, (%r10) - jmp _return_T_done_decrypt -_T_12_decrypt: - MOVQ_R64_XMM %xmm0, %rax - mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 psrldq $8, %xmm0 + cmp $0, %r11 + je _return_T_done_decrypt +_T_4_decrypt: + movd %xmm0, %eax + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + psrldq $4, %xmm0 + cmp $0, %r11 + je _return_T_done_decrypt +_T_123_decrypt: movd %xmm0, %eax - mov %eax, 8(%r10) + cmp $2, %r11 + jl _T_1_decrypt + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done_decrypt + add $2, %r10 + sar $16, %eax +_T_1_decrypt: + mov %al, (%r10) jmp _return_T_done_decrypt _T_16_decrypt: movdqu %xmm0, (%r10) @@ -1718,18 +1830,35 @@ _return_T_encrypt: mov arg10, %r11 # %r11 = auth_tag_len cmp $16, %r11 je _T_16_encrypt - cmp $12, %r11 - je _T_12_encrypt + cmp $8, %r11 + jl _T_4_encrypt _T_8_encrypt: MOVQ_R64_XMM %xmm0, %rax mov %rax, (%r10) - jmp _return_T_done_encrypt -_T_12_encrypt: - MOVQ_R64_XMM %xmm0, %rax - mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 psrldq $8, %xmm0 + cmp $0, %r11 + je _return_T_done_encrypt +_T_4_encrypt: + movd %xmm0, %eax + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + psrldq $4, %xmm0 + cmp $0, %r11 + je _return_T_done_encrypt +_T_123_encrypt: movd %xmm0, %eax - mov %eax, 8(%r10) + cmp $2, %r11 + jl _T_1_encrypt + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done_encrypt + add $2, %r10 + sar $16, %eax +_T_1_encrypt: + mov %al, (%r10) jmp _return_T_done_encrypt _T_16_encrypt: movdqu %xmm0, (%r10) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index d664382c6e56..faecb1518bf8 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -155,6 +155,30 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 ALL_F: .octa 0xffffffffffffffffffffffffffffffff .octa 0x00000000000000000000000000000000 +.section .rodata +.align 16 +.type aad_shift_arr, @object +.size aad_shift_arr, 272 +aad_shift_arr: + .octa 0xffffffffffffffffffffffffffffffff + .octa 0xffffffffffffffffffffffffffffff0C + .octa 0xffffffffffffffffffffffffffff0D0C + .octa 0xffffffffffffffffffffffffff0E0D0C + .octa 0xffffffffffffffffffffffff0F0E0D0C + .octa 0xffffffffffffffffffffff0C0B0A0908 + .octa 0xffffffffffffffffffff0D0C0B0A0908 + .octa 0xffffffffffffffffff0E0D0C0B0A0908 + .octa 0xffffffffffffffff0F0E0D0C0B0A0908 + .octa 0xffffffffffffff0C0B0A090807060504 + .octa 0xffffffffffff0D0C0B0A090807060504 + .octa 0xffffffffff0E0D0C0B0A090807060504 + .octa 0xffffffff0F0E0D0C0B0A090807060504 + .octa 0xffffff0C0B0A09080706050403020100 + .octa 0xffff0D0C0B0A09080706050403020100 + .octa 0xff0E0D0C0B0A09080706050403020100 + .octa 0x0F0E0D0C0B0A09080706050403020100 + + .text @@ -372,41 +396,72 @@ VARIABLE_OFFSET = 16*8 .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC i = (8-\num_initial_blocks) + j = 0 setreg - mov arg6, %r10 # r10 = AAD - mov arg7, %r12 # r12 = aadLen - - - mov %r12, %r11 - - vpxor reg_i, reg_i, reg_i -_get_AAD_loop\@: - vmovd (%r10), \T1 - vpslldq $12, \T1, \T1 - vpsrldq $4, reg_i, reg_i - vpxor \T1, reg_i, reg_i - - add $4, %r10 - sub $4, %r12 - jg _get_AAD_loop\@ - - - cmp $16, %r11 - je _get_AAD_loop2_done\@ - mov $16, %r12 - -_get_AAD_loop2\@: - vpsrldq $4, reg_i, reg_i - sub $4, %r12 - cmp %r11, %r12 - jg _get_AAD_loop2\@ - -_get_AAD_loop2_done\@: - - #byte-reflect the AAD data - vpshufb SHUF_MASK(%rip), reg_i, reg_i - + mov arg6, %r10 # r10 = AAD + mov arg7, %r12 # r12 = aadLen + + + mov %r12, %r11 + + vpxor reg_j, reg_j, reg_j + vpxor reg_i, reg_i, reg_i + cmp $16, %r11 + jl _get_AAD_rest8\@ +_get_AAD_blocks\@: + vmovdqu (%r10), reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\@ + vmovdqu reg_j, reg_i + cmp $0, %r11 + je _get_AAD_done\@ + + vpxor reg_i, reg_i, reg_i + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some CT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\@: + cmp $4, %r11 + jle _get_AAD_rest4\@ + movq (%r10), \T1 + add $8, %r10 + sub $8, %r11 + vpslldq $8, \T1, \T1 + vpsrldq $8, reg_i, reg_i + vpxor \T1, reg_i, reg_i + jmp _get_AAD_rest8\@ +_get_AAD_rest4\@: + cmp $0, %r11 + jle _get_AAD_rest0\@ + mov (%r10), %eax + movq %rax, \T1 + add $4, %r10 + sub $4, %r11 + vpslldq $12, \T1, \T1 + vpsrldq $4, reg_i, reg_i + vpxor \T1, reg_i, reg_i +_get_AAD_rest0\@: + /* finalize: shift out the extra bytes we read, and align + left. since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \T1 + vpshufb \T1, reg_i, reg_i +_get_AAD_rest_final\@: + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_j, reg_i, reg_i + GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 + +_get_AAD_done\@: # initialize the data pointer offset as zero xor %r11, %r11 @@ -480,7 +535,6 @@ _get_AAD_loop2_done\@: i = (8-\num_initial_blocks) j = (9-\num_initial_blocks) setreg - GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 .rep \num_initial_blocks vpxor reg_i, reg_j, reg_j @@ -1427,19 +1481,36 @@ _return_T\@: cmp $16, %r11 je _T_16\@ - cmp $12, %r11 - je _T_12\@ + cmp $8, %r11 + jl _T_4\@ _T_8\@: vmovq %xmm9, %rax mov %rax, (%r10) - jmp _return_T_done\@ -_T_12\@: - vmovq %xmm9, %rax - mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 vpsrldq $8, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_4\@: vmovd %xmm9, %eax - mov %eax, 8(%r10) + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + vpsrldq $4, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_123\@: + vmovd %xmm9, %eax + cmp $2, %r11 + jl _T_1\@ + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done\@ + add $2, %r10 + sar $16, %eax +_T_1\@: + mov %al, (%r10) jmp _return_T_done\@ _T_16\@: @@ -1631,41 +1702,73 @@ ENDPROC(aesni_gcm_dec_avx_gen2) .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER i = (8-\num_initial_blocks) + j = 0 setreg - mov arg6, %r10 # r10 = AAD - mov arg7, %r12 # r12 = aadLen - - - mov %r12, %r11 - - vpxor reg_i, reg_i, reg_i -_get_AAD_loop\@: - vmovd (%r10), \T1 - vpslldq $12, \T1, \T1 - vpsrldq $4, reg_i, reg_i - vpxor \T1, reg_i, reg_i - - add $4, %r10 - sub $4, %r12 - jg _get_AAD_loop\@ - - - cmp $16, %r11 - je _get_AAD_loop2_done\@ - mov $16, %r12 - -_get_AAD_loop2\@: - vpsrldq $4, reg_i, reg_i - sub $4, %r12 - cmp %r11, %r12 - jg _get_AAD_loop2\@ - -_get_AAD_loop2_done\@: - - #byte-reflect the AAD data - vpshufb SHUF_MASK(%rip), reg_i, reg_i - + mov arg6, %r10 # r10 = AAD + mov arg7, %r12 # r12 = aadLen + + + mov %r12, %r11 + + vpxor reg_j, reg_j, reg_j + vpxor reg_i, reg_i, reg_i + + cmp $16, %r11 + jl _get_AAD_rest8\@ +_get_AAD_blocks\@: + vmovdqu (%r10), reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\@ + vmovdqu reg_j, reg_i + cmp $0, %r11 + je _get_AAD_done\@ + + vpxor reg_i, reg_i, reg_i + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some CT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\@: + cmp $4, %r11 + jle _get_AAD_rest4\@ + movq (%r10), \T1 + add $8, %r10 + sub $8, %r11 + vpslldq $8, \T1, \T1 + vpsrldq $8, reg_i, reg_i + vpxor \T1, reg_i, reg_i + jmp _get_AAD_rest8\@ +_get_AAD_rest4\@: + cmp $0, %r11 + jle _get_AAD_rest0\@ + mov (%r10), %eax + movq %rax, \T1 + add $4, %r10 + sub $4, %r11 + vpslldq $12, \T1, \T1 + vpsrldq $4, reg_i, reg_i + vpxor \T1, reg_i, reg_i +_get_AAD_rest0\@: + /* finalize: shift out the extra bytes we read, and align + left. since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \T1 + vpshufb \T1, reg_i, reg_i +_get_AAD_rest_final\@: + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_j, reg_i, reg_i + GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 + +_get_AAD_done\@: # initialize the data pointer offset as zero xor %r11, %r11 @@ -1740,7 +1843,6 @@ _get_AAD_loop2_done\@: i = (8-\num_initial_blocks) j = (9-\num_initial_blocks) setreg - GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 .rep \num_initial_blocks vpxor reg_i, reg_j, reg_j @@ -2702,19 +2804,36 @@ _return_T\@: cmp $16, %r11 je _T_16\@ - cmp $12, %r11 - je _T_12\@ + cmp $8, %r11 + jl _T_4\@ _T_8\@: vmovq %xmm9, %rax mov %rax, (%r10) - jmp _return_T_done\@ -_T_12\@: - vmovq %xmm9, %rax - mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 vpsrldq $8, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_4\@: vmovd %xmm9, %eax - mov %eax, 8(%r10) + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + vpsrldq $4, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_123\@: + vmovd %xmm9, %eax + cmp $2, %r11 + jl _T_1\@ + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done\@ + add $2, %r10 + sar $16, %eax +_T_1\@: + mov %al, (%r10) jmp _return_T_done\@ _T_16\@: diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 93de8ea51548..4a55cdcdc008 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -61,6 +61,11 @@ struct aesni_rfc4106_gcm_ctx { u8 nonce[4]; }; +struct generic_gcmaes_ctx { + u8 hash_subkey[16] AESNI_ALIGN_ATTR; + struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; +}; + struct aesni_xts_ctx { u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR; u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR; @@ -102,13 +107,11 @@ asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out, * u8 *out, Ciphertext output. Encrypt in-place is allowed. * const u8 *in, Plaintext input * unsigned long plaintext_len, Length of data in bytes for encryption. - * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association) - * concatenated with 8 byte Initialisation Vector (from IPSec ESP - * Payload) concatenated with 0x00000001. 16-byte aligned pointer. + * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001. + * 16-byte aligned pointer. * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. * const u8 *aad, Additional Authentication Data (AAD) - * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this - * is going to be 8 or 12 bytes + * unsigned long aad_len, Length of AAD in bytes. * u8 *auth_tag, Authenticated Tag output. * unsigned long auth_tag_len), Authenticated Tag Length in bytes. * Valid values are 16 (most likely), 12 or 8. @@ -123,9 +126,8 @@ asmlinkage void aesni_gcm_enc(void *ctx, u8 *out, * u8 *out, Plaintext output. Decrypt in-place is allowed. * const u8 *in, Ciphertext input * unsigned long ciphertext_len, Length of data in bytes for decryption. - * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association) - * concatenated with 8 byte Initialisation Vector (from IPSec ESP - * Payload) concatenated with 0x00000001. 16-byte aligned pointer. + * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001. + * 16-byte aligned pointer. * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. * const u8 *aad, Additional Authentication Data (AAD) * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going @@ -275,6 +277,16 @@ aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) align = 1; return PTR_ALIGN(crypto_aead_ctx(tfm), align); } + +static inline struct +generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm) +{ + unsigned long align = AESNI_ALIGN; + + if (align <= crypto_tfm_ctx_alignment()) + align = 1; + return PTR_ALIGN(crypto_aead_ctx(tfm), align); +} #endif static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) @@ -712,32 +724,34 @@ static int rfc4106_set_authsize(struct crypto_aead *parent, return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); } -static int helper_rfc4106_encrypt(struct aead_request *req) +static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + switch (authsize) { + case 4: + case 8: + case 12: + case 13: + case 14: + case 15: + case 16: + break; + default: + return -EINVAL; + } + + return 0; +} + +static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, + u8 *hash_subkey, u8 *iv, void *aes_ctx) { u8 one_entry_in_sg = 0; u8 *src, *dst, *assoc; - __be32 counter = cpu_to_be32(1); struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); struct scatter_walk src_sg_walk; struct scatter_walk dst_sg_walk = {}; - unsigned int i; - - /* Assuming we are supporting rfc4106 64-bit extended */ - /* sequence numbers We need to have the AAD length equal */ - /* to 16 or 20 bytes */ - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) - return -EINVAL; - - /* IV below built */ - for (i = 0; i < 4; i++) - *(iv+i) = ctx->nonce[i]; - for (i = 0; i < 8; i++) - *(iv+4+i) = req->iv[i]; - *((__be32 *)(iv+12)) = counter; if (sg_is_last(req->src) && (!PageHighMem(sg_page(req->src)) || @@ -768,7 +782,7 @@ static int helper_rfc4106_encrypt(struct aead_request *req) kernel_fpu_begin(); aesni_gcm_enc_tfm(aes_ctx, dst, src, req->cryptlen, iv, - ctx->hash_subkey, assoc, req->assoclen - 8, + hash_subkey, assoc, assoclen, dst + req->cryptlen, auth_tag_len); kernel_fpu_end(); @@ -791,37 +805,20 @@ static int helper_rfc4106_encrypt(struct aead_request *req) return 0; } -static int helper_rfc4106_decrypt(struct aead_request *req) +static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, + u8 *hash_subkey, u8 *iv, void *aes_ctx) { u8 one_entry_in_sg = 0; u8 *src, *dst, *assoc; unsigned long tempCipherLen = 0; - __be32 counter = cpu_to_be32(1); - int retval = 0; struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); u8 authTag[16]; struct scatter_walk src_sg_walk; struct scatter_walk dst_sg_walk = {}; - unsigned int i; - - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) - return -EINVAL; - - /* Assuming we are supporting rfc4106 64-bit extended */ - /* sequence numbers We need to have the AAD length */ - /* equal to 16 or 20 bytes */ + int retval = 0; tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len); - /* IV below built */ - for (i = 0; i < 4; i++) - *(iv+i) = ctx->nonce[i]; - for (i = 0; i < 8; i++) - *(iv+4+i) = req->iv[i]; - *((__be32 *)(iv+12)) = counter; if (sg_is_last(req->src) && (!PageHighMem(sg_page(req->src)) || @@ -838,7 +835,6 @@ static int helper_rfc4106_decrypt(struct aead_request *req) scatterwalk_start(&dst_sg_walk, req->dst); dst = scatterwalk_map(&dst_sg_walk) + req->assoclen; } - } else { /* Allocate memory for src, dst, assoc */ assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); @@ -850,9 +846,10 @@ static int helper_rfc4106_decrypt(struct aead_request *req) dst = src; } + kernel_fpu_begin(); aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv, - ctx->hash_subkey, assoc, req->assoclen - 8, + hash_subkey, assoc, assoclen, authTag, auth_tag_len); kernel_fpu_end(); @@ -875,6 +872,60 @@ static int helper_rfc4106_decrypt(struct aead_request *req) kfree(assoc); } return retval; + +} + +static int helper_rfc4106_encrypt(struct aead_request *req) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + unsigned int i; + __be32 counter = cpu_to_be32(1); + + /* Assuming we are supporting rfc4106 64-bit extended */ + /* sequence numbers We need to have the AAD length equal */ + /* to 16 or 20 bytes */ + if (unlikely(req->assoclen != 16 && req->assoclen != 20)) + return -EINVAL; + + /* IV below built */ + for (i = 0; i < 4; i++) + *(iv+i) = ctx->nonce[i]; + for (i = 0; i < 8; i++) + *(iv+4+i) = req->iv[i]; + *((__be32 *)(iv+12)) = counter; + + return gcmaes_encrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, + aes_ctx); +} + +static int helper_rfc4106_decrypt(struct aead_request *req) +{ + __be32 counter = cpu_to_be32(1); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + unsigned int i; + + if (unlikely(req->assoclen != 16 && req->assoclen != 20)) + return -EINVAL; + + /* Assuming we are supporting rfc4106 64-bit extended */ + /* sequence numbers We need to have the AAD length */ + /* equal to 16 or 20 bytes */ + + /* IV below built */ + for (i = 0; i < 4; i++) + *(iv+i) = ctx->nonce[i]; + for (i = 0; i < 8; i++) + *(iv+4+i) = req->iv[i]; + *((__be32 *)(iv+12)) = counter; + + return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, + aes_ctx); } static int rfc4106_encrypt(struct aead_request *req) @@ -1035,6 +1086,46 @@ struct { }; #ifdef CONFIG_X86_64 +static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, + unsigned int key_len) +{ + struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead); + + return aes_set_key_common(crypto_aead_tfm(aead), + &ctx->aes_key_expanded, key, key_len) ?: + rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); +} + +static int generic_gcmaes_encrypt(struct aead_request *req) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + __be32 counter = cpu_to_be32(1); + + memcpy(iv, req->iv, 12); + *((__be32 *)(iv+12)) = counter; + + return gcmaes_encrypt(req, req->assoclen, ctx->hash_subkey, iv, + aes_ctx); +} + +static int generic_gcmaes_decrypt(struct aead_request *req) +{ + __be32 counter = cpu_to_be32(1); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + + memcpy(iv, req->iv, 12); + *((__be32 *)(iv+12)) = counter; + + return gcmaes_decrypt(req, req->assoclen, ctx->hash_subkey, iv, + aes_ctx); +} + static struct aead_alg aesni_aead_algs[] = { { .setkey = common_rfc4106_set_key, .setauthsize = common_rfc4106_set_authsize, @@ -1069,6 +1160,23 @@ static struct aead_alg aesni_aead_algs[] = { { .cra_ctxsize = sizeof(struct cryptd_aead *), .cra_module = THIS_MODULE, }, +}, { + .setkey = generic_gcmaes_set_key, + .setauthsize = generic_gcmaes_set_authsize, + .encrypt = generic_gcmaes_encrypt, + .decrypt = generic_gcmaes_decrypt, + .ivsize = 12, + .maxauthsize = 16, + .base = { + .cra_name = "gcm(aes)", + .cra_driver_name = "generic-gcm-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct generic_gcmaes_ctx), + .cra_alignmask = AESNI_ALIGN - 1, + .cra_module = THIS_MODULE, + }, } }; #else static struct aead_alg aesni_aead_algs[0]; diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 24ac9fad832d..d61e57960fe0 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -176,9 +176,6 @@ __glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, src -= 1; dst -= 1; } while (nbytes >= func_bytes); - - if (nbytes < bsize) - goto done; } } diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c index 2dd3674b5a1e..458409b7568d 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb.c +++ b/arch/x86/crypto/sha512-mb/sha512_mb.c @@ -269,19 +269,19 @@ static struct sha512_hash_ctx * LAST */ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; - return ctx; + goto unlock; } if (ctx->status & HASH_CTX_STS_PROCESSING) { /* Cannot submit to a currently processing job. */ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; - return ctx; + goto unlock; } if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { /* Cannot update a finished job. */ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; - return ctx; + goto unlock; } @@ -363,6 +363,7 @@ static struct sha512_hash_ctx } ctx = sha512_ctx_mgr_resubmit(mgr, ctx); +unlock: spin_unlock_irqrestore(&cstate->work_lock, irqflags); return ctx; } |