diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-15 16:01:47 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-15 16:01:47 -0700 |
commit | dafa5f6577a9eecd2941add553d1672c30b02364 (patch) | |
tree | ff9d3d2dffafd6eba1b6ac21ba50623812041b70 /arch/arm64/crypto | |
parent | 9a76aba02a37718242d7cdc294f0a3901928aa57 (diff) | |
parent | 22240df7ac6d76a271197571a7be45addef2ba15 (diff) | |
download | talos-obmc-linux-dafa5f6577a9eecd2941add553d1672c30b02364.tar.gz talos-obmc-linux-dafa5f6577a9eecd2941add553d1672c30b02364.zip |
Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu:
"API:
- Fix dcache flushing crash in skcipher.
- Add hash finup self-tests.
- Reschedule during speed tests.
Algorithms:
- Remove insecure vmac and replace it with vmac64.
- Add public key verification for DH/ECDH.
Drivers:
- Decrease priority of sha-mb on x86.
- Improve NEON latency/throughput on ARM64.
- Add md5/sha384/sha512/des/3des to inside-secure.
- Support eip197d in inside-secure.
- Only register algorithms supported by the host in virtio.
- Add cts and remove incompatible cts1 from ccree.
- Add hisilicon SEC security accelerator driver.
- Replace msm hwrng driver with qcom pseudo rng driver.
Misc:
- Centralize CRC polynomials"
* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (121 commits)
crypto: arm64/ghash-ce - implement 4-way aggregation
crypto: arm64/ghash-ce - replace NEON yield check with block limit
crypto: hisilicon - sec_send_request() can be static
lib/mpi: remove redundant variable esign
crypto: arm64/aes-ce-gcm - don't reload key schedule if avoidable
crypto: arm64/aes-ce-gcm - implement 2-way aggregation
crypto: arm64/aes-ce-gcm - operate on two input blocks at a time
crypto: dh - make crypto_dh_encode_key() make robust
crypto: dh - fix calculating encoded key size
crypto: ccp - Check for NULL PSP pointer at module unload
crypto: arm/chacha20 - always use vrev for 16-bit rotates
crypto: ccree - allow bigger than sector XTS op
crypto: ccree - zero all of request ctx before use
crypto: ccree - remove cipher ivgen left overs
crypto: ccree - drop useless type flag during reg
crypto: ablkcipher - fix crash flushing dcache in error path
crypto: blkcipher - fix crash flushing dcache in error path
crypto: skcipher - fix crash flushing dcache in error path
crypto: skcipher - remove unnecessary setting of walk->nbytes
crypto: scatterwalk - remove scatterwalk_samebuf()
...
Diffstat (limited to 'arch/arm64/crypto')
-rw-r--r-- | arch/arm64/crypto/aes-glue.c | 3 | ||||
-rw-r--r-- | arch/arm64/crypto/ghash-ce-core.S | 271 | ||||
-rw-r--r-- | arch/arm64/crypto/ghash-ce-glue.c | 204 | ||||
-rw-r--r-- | arch/arm64/crypto/sha1-ce-glue.c | 1 | ||||
-rw-r--r-- | arch/arm64/crypto/sha2-ce-glue.c | 2 | ||||
-rw-r--r-- | arch/arm64/crypto/sha256-glue.c | 8 | ||||
-rw-r--r-- | arch/arm64/crypto/sha3-ce-glue.c | 4 | ||||
-rw-r--r-- | arch/arm64/crypto/sha512-ce-glue.c | 2 | ||||
-rw-r--r-- | arch/arm64/crypto/sha512-glue.c | 2 | ||||
-rw-r--r-- | arch/arm64/crypto/sm3-ce-glue.c | 1 |
10 files changed, 322 insertions, 176 deletions
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index e3e50950a863..adcb83eb683c 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -567,7 +567,6 @@ static struct shash_alg mac_algs[] = { { .base.cra_name = "cmac(aes)", .base.cra_driver_name = "cmac-aes-" MODE, .base.cra_priority = PRIO, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + 2 * AES_BLOCK_SIZE, @@ -583,7 +582,6 @@ static struct shash_alg mac_algs[] = { { .base.cra_name = "xcbc(aes)", .base.cra_driver_name = "xcbc-aes-" MODE, .base.cra_priority = PRIO, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + 2 * AES_BLOCK_SIZE, @@ -599,7 +597,6 @@ static struct shash_alg mac_algs[] = { { .base.cra_name = "cbcmac(aes)", .base.cra_driver_name = "cbcmac-aes-" MODE, .base.cra_priority = PRIO, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx), .base.cra_module = THIS_MODULE, diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index c723647b37db..1b319b716d5e 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -1,7 +1,7 @@ /* * Accelerated GHASH implementation with ARMv8 PMULL instructions. * - * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -46,6 +46,19 @@ ss3 .req v26 ss4 .req v27 + XL2 .req v8 + XM2 .req v9 + XH2 .req v10 + XL3 .req v11 + XM3 .req v12 + XH3 .req v13 + TT3 .req v14 + TT4 .req v15 + HH .req v16 + HH3 .req v17 + HH4 .req v18 + HH34 .req v19 + .text .arch armv8-a+crypto @@ -134,11 +147,25 @@ .endm .macro __pmull_pre_p64 + add x8, x3, #16 + ld1 {HH.2d-HH4.2d}, [x8] + + trn1 SHASH2.2d, SHASH.2d, HH.2d + trn2 T1.2d, SHASH.2d, HH.2d + eor SHASH2.16b, SHASH2.16b, T1.16b + + trn1 HH34.2d, HH3.2d, HH4.2d + trn2 T1.2d, HH3.2d, HH4.2d + eor HH34.16b, HH34.16b, T1.16b + movi MASK.16b, #0xe1 shl MASK.2d, MASK.2d, #57 .endm .macro __pmull_pre_p8 + ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 + eor SHASH2.16b, SHASH2.16b, SHASH.16b + // k00_16 := 0x0000000000000000_000000000000ffff // k32_48 := 0x00000000ffffffff_0000ffffffffffff movi k32_48.2d, #0xffffffff @@ -213,31 +240,88 @@ .endm .macro __pmull_ghash, pn - frame_push 5 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - -0: ld1 {SHASH.2d}, [x22] - ld1 {XL.2d}, [x20] - ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 - eor SHASH2.16b, SHASH2.16b, SHASH.16b + ld1 {SHASH.2d}, [x3] + ld1 {XL.2d}, [x1] __pmull_pre_\pn /* do the head block first, if supplied */ - cbz x23, 1f - ld1 {T1.2d}, [x23] - mov x23, xzr - b 2f + cbz x4, 0f + ld1 {T1.2d}, [x4] + mov x4, xzr + b 3f + +0: .ifc \pn, p64 + tbnz w0, #0, 2f // skip until #blocks is a + tbnz w0, #1, 2f // round multiple of 4 + +1: ld1 {XM3.16b-TT4.16b}, [x2], #64 + + sub w0, w0, #4 + + rev64 T1.16b, XM3.16b + rev64 T2.16b, XH3.16b + rev64 TT4.16b, TT4.16b + rev64 TT3.16b, TT3.16b + + ext IN1.16b, TT4.16b, TT4.16b, #8 + ext XL3.16b, TT3.16b, TT3.16b, #8 + + eor TT4.16b, TT4.16b, IN1.16b + pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1 + pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0 + pmull XM2.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0) + + eor TT3.16b, TT3.16b, XL3.16b + pmull2 XH3.1q, HH.2d, XL3.2d // a1 * b1 + pmull XL3.1q, HH.1d, XL3.1d // a0 * b0 + pmull2 XM3.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0) -1: ld1 {T1.2d}, [x21], #16 - sub w19, w19, #1 + ext IN1.16b, T2.16b, T2.16b, #8 + eor XL2.16b, XL2.16b, XL3.16b + eor XH2.16b, XH2.16b, XH3.16b + eor XM2.16b, XM2.16b, XM3.16b + + eor T2.16b, T2.16b, IN1.16b + pmull2 XH3.1q, HH3.2d, IN1.2d // a1 * b1 + pmull XL3.1q, HH3.1d, IN1.1d // a0 * b0 + pmull XM3.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0) + + eor XL2.16b, XL2.16b, XL3.16b + eor XH2.16b, XH2.16b, XH3.16b + eor XM2.16b, XM2.16b, XM3.16b + + ext IN1.16b, T1.16b, T1.16b, #8 + ext TT3.16b, XL.16b, XL.16b, #8 + eor XL.16b, XL.16b, IN1.16b + eor T1.16b, T1.16b, TT3.16b + + pmull2 XH.1q, HH4.2d, XL.2d // a1 * b1 + eor T1.16b, T1.16b, XL.16b + pmull XL.1q, HH4.1d, XL.1d // a0 * b0 + pmull2 XM.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0) + + eor XL.16b, XL.16b, XL2.16b + eor XH.16b, XH.16b, XH2.16b + eor XM.16b, XM.16b, XM2.16b + + eor T2.16b, XL.16b, XH.16b + ext T1.16b, XL.16b, XH.16b, #8 + eor XM.16b, XM.16b, T2.16b + + __pmull_reduce_p64 + + eor T2.16b, T2.16b, XH.16b + eor XL.16b, XL.16b, T2.16b + + cbz w0, 5f + b 1b + .endif -2: /* multiply XL by SHASH in GF(2^128) */ +2: ld1 {T1.2d}, [x2], #16 + sub w0, w0, #1 + +3: /* multiply XL by SHASH in GF(2^128) */ CPU_LE( rev64 T1.16b, T1.16b ) ext T2.16b, XL.16b, XL.16b, #8 @@ -250,7 +334,7 @@ CPU_LE( rev64 T1.16b, T1.16b ) __pmull_\pn XL, XL, SHASH // a0 * b0 __pmull_\pn XM, T1, SHASH2 // (a1 + a0)(b1 + b0) - eor T2.16b, XL.16b, XH.16b +4: eor T2.16b, XL.16b, XH.16b ext T1.16b, XL.16b, XH.16b, #8 eor XM.16b, XM.16b, T2.16b @@ -259,18 +343,9 @@ CPU_LE( rev64 T1.16b, T1.16b ) eor T2.16b, T2.16b, XH.16b eor XL.16b, XL.16b, T2.16b - cbz w19, 3f - - if_will_cond_yield_neon - st1 {XL.2d}, [x20] - do_cond_yield_neon - b 0b - endif_yield_neon - - b 1b + cbnz w0, 0b -3: st1 {XL.2d}, [x20] - frame_pop +5: st1 {XL.2d}, [x1] ret .endm @@ -286,9 +361,10 @@ ENTRY(pmull_ghash_update_p8) __pmull_ghash p8 ENDPROC(pmull_ghash_update_p8) - KS .req v8 - CTR .req v9 - INP .req v10 + KS0 .req v12 + KS1 .req v13 + INP0 .req v14 + INP1 .req v15 .macro load_round_keys, rounds, rk cmp \rounds, #12 @@ -322,98 +398,128 @@ ENDPROC(pmull_ghash_update_p8) .endm .macro pmull_gcm_do_crypt, enc - ld1 {SHASH.2d}, [x4] + ld1 {SHASH.2d}, [x4], #16 + ld1 {HH.2d}, [x4] ld1 {XL.2d}, [x1] ldr x8, [x5, #8] // load lower counter - load_round_keys w7, x6 - movi MASK.16b, #0xe1 - ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 + trn1 SHASH2.2d, SHASH.2d, HH.2d + trn2 T1.2d, SHASH.2d, HH.2d CPU_LE( rev x8, x8 ) shl MASK.2d, MASK.2d, #57 - eor SHASH2.16b, SHASH2.16b, SHASH.16b + eor SHASH2.16b, SHASH2.16b, T1.16b .if \enc == 1 ldr x10, [sp] - ld1 {KS.16b}, [x10] + ld1 {KS0.16b-KS1.16b}, [x10] .endif -0: ld1 {CTR.8b}, [x5] // load upper counter - ld1 {INP.16b}, [x3], #16 + cbnz x6, 4f + +0: ld1 {INP0.16b-INP1.16b}, [x3], #32 + rev x9, x8 - add x8, x8, #1 - sub w0, w0, #1 - ins CTR.d[1], x9 // set lower counter + add x11, x8, #1 + add x8, x8, #2 .if \enc == 1 - eor INP.16b, INP.16b, KS.16b // encrypt input - st1 {INP.16b}, [x2], #16 + eor INP0.16b, INP0.16b, KS0.16b // encrypt input + eor INP1.16b, INP1.16b, KS1.16b .endif - rev64 T1.16b, INP.16b + ld1 {KS0.8b}, [x5] // load upper counter + rev x11, x11 + sub w0, w0, #2 + mov KS1.8b, KS0.8b + ins KS0.d[1], x9 // set lower counter + ins KS1.d[1], x11 + + rev64 T1.16b, INP1.16b cmp w7, #12 b.ge 2f // AES-192/256? -1: enc_round CTR, v21 - - ext T2.16b, XL.16b, XL.16b, #8 +1: enc_round KS0, v21 ext IN1.16b, T1.16b, T1.16b, #8 - enc_round CTR, v22 + enc_round KS1, v21 + pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1 + + enc_round KS0, v22 + eor T1.16b, T1.16b, IN1.16b + + enc_round KS1, v22 + pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0 + enc_round KS0, v23 + pmull XM2.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) + + enc_round KS1, v23 + rev64 T1.16b, INP0.16b + ext T2.16b, XL.16b, XL.16b, #8 + + enc_round KS0, v24 + ext IN1.16b, T1.16b, T1.16b, #8 eor T1.16b, T1.16b, T2.16b - eor XL.16b, XL.16b, IN1.16b - enc_round CTR, v23 + enc_round KS1, v24 + eor XL.16b, XL.16b, IN1.16b - pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1 + enc_round KS0, v25 eor T1.16b, T1.16b, XL.16b - enc_round CTR, v24 + enc_round KS1, v25 + pmull2 XH.1q, HH.2d, XL.2d // a1 * b1 + + enc_round KS0, v26 + pmull XL.1q, HH.1d, XL.1d // a0 * b0 - pmull XL.1q, SHASH.1d, XL.1d // a0 * b0 - pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) + enc_round KS1, v26 + pmull2 XM.1q, SHASH2.2d, T1.2d // (a1 + a0)(b1 + b0) - enc_round CTR, v25 + enc_round KS0, v27 + eor XL.16b, XL.16b, XL2.16b + eor XH.16b, XH.16b, XH2.16b + enc_round KS1, v27 + eor XM.16b, XM.16b, XM2.16b ext T1.16b, XL.16b, XH.16b, #8 + + enc_round KS0, v28 eor T2.16b, XL.16b, XH.16b eor XM.16b, XM.16b, T1.16b - enc_round CTR, v26 - + enc_round KS1, v28 eor XM.16b, XM.16b, T2.16b - pmull T2.1q, XL.1d, MASK.1d - enc_round CTR, v27 + enc_round KS0, v29 + pmull T2.1q, XL.1d, MASK.1d + enc_round KS1, v29 mov XH.d[0], XM.d[1] mov XM.d[1], XL.d[0] - enc_round CTR, v28 - + aese KS0.16b, v30.16b eor XL.16b, XM.16b, T2.16b - enc_round CTR, v29 - + aese KS1.16b, v30.16b ext T2.16b, XL.16b, XL.16b, #8 - aese CTR.16b, v30.16b - + eor KS0.16b, KS0.16b, v31.16b pmull XL.1q, XL.1d, MASK.1d eor T2.16b, T2.16b, XH.16b - eor KS.16b, CTR.16b, v31.16b - + eor KS1.16b, KS1.16b, v31.16b eor XL.16b, XL.16b, T2.16b .if \enc == 0 - eor INP.16b, INP.16b, KS.16b - st1 {INP.16b}, [x2], #16 + eor INP0.16b, INP0.16b, KS0.16b + eor INP1.16b, INP1.16b, KS1.16b .endif + st1 {INP0.16b-INP1.16b}, [x2], #32 + cbnz w0, 0b CPU_LE( rev x8, x8 ) @@ -421,17 +527,24 @@ CPU_LE( rev x8, x8 ) str x8, [x5, #8] // store lower counter .if \enc == 1 - st1 {KS.16b}, [x10] + st1 {KS0.16b-KS1.16b}, [x10] .endif ret 2: b.eq 3f // AES-192? - enc_round CTR, v17 - enc_round CTR, v18 -3: enc_round CTR, v19 - enc_round CTR, v20 + enc_round KS0, v17 + enc_round KS1, v17 + enc_round KS0, v18 + enc_round KS1, v18 +3: enc_round KS0, v19 + enc_round KS1, v19 + enc_round KS0, v20 + enc_round KS1, v20 b 1b + +4: load_round_keys w7, x6 + b 0b .endm /* diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 8a10f1d7199a..6e9f33d14930 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -1,7 +1,7 @@ /* * Accelerated GHASH implementation with ARMv8 PMULL instructions. * - * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -33,9 +33,12 @@ MODULE_ALIAS_CRYPTO("ghash"); #define GCM_IV_SIZE 12 struct ghash_key { - u64 a; - u64 b; - be128 k; + u64 h[2]; + u64 h2[2]; + u64 h3[2]; + u64 h4[2]; + + be128 k; }; struct ghash_desc_ctx { @@ -113,6 +116,9 @@ static void ghash_do_update(int blocks, u64 dg[], const char *src, } } +/* avoid hogging the CPU for too long */ +#define MAX_BLOCKS (SZ_64K / GHASH_BLOCK_SIZE) + static int ghash_update(struct shash_desc *desc, const u8 *src, unsigned int len) { @@ -136,11 +142,16 @@ static int ghash_update(struct shash_desc *desc, const u8 *src, blocks = len / GHASH_BLOCK_SIZE; len %= GHASH_BLOCK_SIZE; - ghash_do_update(blocks, ctx->digest, src, key, - partial ? ctx->buf : NULL); + do { + int chunk = min(blocks, MAX_BLOCKS); - src += blocks * GHASH_BLOCK_SIZE; - partial = 0; + ghash_do_update(chunk, ctx->digest, src, key, + partial ? ctx->buf : NULL); + + blocks -= chunk; + src += chunk * GHASH_BLOCK_SIZE; + partial = 0; + } while (unlikely(blocks > 0)); } if (len) memcpy(ctx->buf + partial, src, len); @@ -166,23 +177,36 @@ static int ghash_final(struct shash_desc *desc, u8 *dst) return 0; } +static void ghash_reflect(u64 h[], const be128 *k) +{ + u64 carry = be64_to_cpu(k->a) & BIT(63) ? 1 : 0; + + h[0] = (be64_to_cpu(k->b) << 1) | carry; + h[1] = (be64_to_cpu(k->a) << 1) | (be64_to_cpu(k->b) >> 63); + + if (carry) + h[1] ^= 0xc200000000000000UL; +} + static int __ghash_setkey(struct ghash_key *key, const u8 *inkey, unsigned int keylen) { - u64 a, b; + be128 h; /* needed for the fallback */ memcpy(&key->k, inkey, GHASH_BLOCK_SIZE); - /* perform multiplication by 'x' in GF(2^128) */ - b = get_unaligned_be64(inkey); - a = get_unaligned_be64(inkey + 8); + ghash_reflect(key->h, &key->k); - key->a = (a << 1) | (b >> 63); - key->b = (b << 1) | (a >> 63); + h = key->k; + gf128mul_lle(&h, &key->k); + ghash_reflect(key->h2, &h); - if (b >> 63) - key->b ^= 0xc200000000000000UL; + gf128mul_lle(&h, &key->k); + ghash_reflect(key->h3, &h); + + gf128mul_lle(&h, &key->k); + ghash_reflect(key->h4, &h); return 0; } @@ -204,7 +228,6 @@ static struct shash_alg ghash_alg = { .base.cra_name = "ghash", .base.cra_driver_name = "ghash-ce", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = GHASH_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct ghash_key), .base.cra_module = THIS_MODULE, @@ -245,7 +268,7 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, __aes_arm64_encrypt(ctx->aes_key.key_enc, key, (u8[AES_BLOCK_SIZE]){}, num_rounds(&ctx->aes_key)); - return __ghash_setkey(&ctx->ghash_key, key, sizeof(key)); + return __ghash_setkey(&ctx->ghash_key, key, sizeof(be128)); } static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) @@ -349,9 +372,10 @@ static int gcm_encrypt(struct aead_request *req) struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); struct skcipher_walk walk; u8 iv[AES_BLOCK_SIZE]; - u8 ks[AES_BLOCK_SIZE]; + u8 ks[2 * AES_BLOCK_SIZE]; u8 tag[AES_BLOCK_SIZE]; u64 dg[2] = {}; + int nrounds = num_rounds(&ctx->aes_key); int err; if (req->assoclen) @@ -360,39 +384,39 @@ static int gcm_encrypt(struct aead_request *req) memcpy(iv, req->iv, GCM_IV_SIZE); put_unaligned_be32(1, iv + GCM_IV_SIZE); - if (likely(may_use_simd())) { - kernel_neon_begin(); + err = skcipher_walk_aead_encrypt(&walk, req, false); - pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key)); + if (likely(may_use_simd() && walk.total >= 2 * AES_BLOCK_SIZE)) { + u32 const *rk = NULL; + + kernel_neon_begin(); + pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); - pmull_gcm_encrypt_block(ks, iv, NULL, - num_rounds(&ctx->aes_key)); + pmull_gcm_encrypt_block(ks, iv, NULL, nrounds); put_unaligned_be32(3, iv + GCM_IV_SIZE); - kernel_neon_end(); + pmull_gcm_encrypt_block(ks + AES_BLOCK_SIZE, iv, NULL, nrounds); + put_unaligned_be32(4, iv + GCM_IV_SIZE); - err = skcipher_walk_aead_encrypt(&walk, req, false); + do { + int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; - while (walk.nbytes >= AES_BLOCK_SIZE) { - int blocks = walk.nbytes / AES_BLOCK_SIZE; + if (rk) + kernel_neon_begin(); - kernel_neon_begin(); pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr, walk.src.virt.addr, &ctx->ghash_key, - iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key), ks); + iv, rk, nrounds, ks); kernel_neon_end(); err = skcipher_walk_done(&walk, - walk.nbytes % AES_BLOCK_SIZE); - } + walk.nbytes % (2 * AES_BLOCK_SIZE)); + + rk = ctx->aes_key.key_enc; + } while (walk.nbytes >= 2 * AES_BLOCK_SIZE); } else { - __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, - num_rounds(&ctx->aes_key)); + __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); - err = skcipher_walk_aead_encrypt(&walk, req, false); - while (walk.nbytes >= AES_BLOCK_SIZE) { int blocks = walk.nbytes / AES_BLOCK_SIZE; u8 *dst = walk.dst.virt.addr; @@ -400,8 +424,7 @@ static int gcm_encrypt(struct aead_request *req) do { __aes_arm64_encrypt(ctx->aes_key.key_enc, - ks, iv, - num_rounds(&ctx->aes_key)); + ks, iv, nrounds); crypto_xor_cpy(dst, src, ks, AES_BLOCK_SIZE); crypto_inc(iv, AES_BLOCK_SIZE); @@ -418,19 +441,28 @@ static int gcm_encrypt(struct aead_request *req) } if (walk.nbytes) __aes_arm64_encrypt(ctx->aes_key.key_enc, ks, iv, - num_rounds(&ctx->aes_key)); + nrounds); } /* handle the tail */ if (walk.nbytes) { u8 buf[GHASH_BLOCK_SIZE]; + unsigned int nbytes = walk.nbytes; + u8 *dst = walk.dst.virt.addr; + u8 *head = NULL; crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, ks, walk.nbytes); - memcpy(buf, walk.dst.virt.addr, walk.nbytes); - memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes); - ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL); + if (walk.nbytes > GHASH_BLOCK_SIZE) { + head = dst; + dst += GHASH_BLOCK_SIZE; + nbytes %= GHASH_BLOCK_SIZE; + } + + memcpy(buf, dst, nbytes); + memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes); + ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head); err = skcipher_walk_done(&walk, 0); } @@ -453,10 +485,11 @@ static int gcm_decrypt(struct aead_request *req) struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); unsigned int authsize = crypto_aead_authsize(aead); struct skcipher_walk walk; - u8 iv[AES_BLOCK_SIZE]; + u8 iv[2 * AES_BLOCK_SIZE]; u8 tag[AES_BLOCK_SIZE]; - u8 buf[GHASH_BLOCK_SIZE]; + u8 buf[2 * GHASH_BLOCK_SIZE]; u64 dg[2] = {}; + int nrounds = num_rounds(&ctx->aes_key); int err; if (req->assoclen) @@ -465,43 +498,53 @@ static int gcm_decrypt(struct aead_request *req) memcpy(iv, req->iv, GCM_IV_SIZE); put_unaligned_be32(1, iv + GCM_IV_SIZE); - if (likely(may_use_simd())) { - kernel_neon_begin(); + err = skcipher_walk_aead_decrypt(&walk, req, false); + + if (likely(may_use_simd() && walk.total >= 2 * AES_BLOCK_SIZE)) { + u32 const *rk = NULL; - pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key)); + kernel_neon_begin(); + pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); - kernel_neon_end(); - err = skcipher_walk_aead_decrypt(&walk, req, false); + do { + int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; + int rem = walk.total - blocks * AES_BLOCK_SIZE; - while (walk.nbytes >= AES_BLOCK_SIZE) { - int blocks = walk.nbytes / AES_BLOCK_SIZE; + if (rk) + kernel_neon_begin(); - kernel_neon_begin(); pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr, walk.src.virt.addr, &ctx->ghash_key, - iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key)); + iv, rk, nrounds); + + /* check if this is the final iteration of the loop */ + if (rem < (2 * AES_BLOCK_SIZE)) { + u8 *iv2 = iv + AES_BLOCK_SIZE; + + if (rem > AES_BLOCK_SIZE) { + memcpy(iv2, iv, AES_BLOCK_SIZE); + crypto_inc(iv2, AES_BLOCK_SIZE); + } + + pmull_gcm_encrypt_block(iv, iv, NULL, nrounds); + + if (rem > AES_BLOCK_SIZE) + pmull_gcm_encrypt_block(iv2, iv2, NULL, + nrounds); + } + kernel_neon_end(); err = skcipher_walk_done(&walk, - walk.nbytes % AES_BLOCK_SIZE); - } - if (walk.nbytes) { - kernel_neon_begin(); - pmull_gcm_encrypt_block(iv, iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key)); - kernel_neon_end(); - } + walk.nbytes % (2 * AES_BLOCK_SIZE)); + rk = ctx->aes_key.key_enc; + } while (walk.nbytes >= 2 * AES_BLOCK_SIZE); } else { - __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, - num_rounds(&ctx->aes_key)); + __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); - err = skcipher_walk_aead_decrypt(&walk, req, false); - while (walk.nbytes >= AES_BLOCK_SIZE) { int blocks = walk.nbytes / AES_BLOCK_SIZE; u8 *dst = walk.dst.virt.addr; @@ -512,8 +555,7 @@ static int gcm_decrypt(struct aead_request *req) do { __aes_arm64_encrypt(ctx->aes_key.key_enc, - buf, iv, - num_rounds(&ctx->aes_key)); + buf, iv, nrounds); crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE); crypto_inc(iv, AES_BLOCK_SIZE); @@ -526,14 +568,24 @@ static int gcm_decrypt(struct aead_request *req) } if (walk.nbytes) __aes_arm64_encrypt(ctx->aes_key.key_enc, iv, iv, - num_rounds(&ctx->aes_key)); + nrounds); } /* handle the tail */ if (walk.nbytes) { - memcpy(buf, walk.src.virt.addr, walk.nbytes); - memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes); - ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL); + const u8 *src = walk.src.virt.addr; + const u8 *head = NULL; + unsigned int nbytes = walk.nbytes; + + if (walk.nbytes > GHASH_BLOCK_SIZE) { + head = src; + src += GHASH_BLOCK_SIZE; + nbytes %= GHASH_BLOCK_SIZE; + } + + memcpy(buf, src, nbytes); + memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes); + ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head); crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, iv, walk.nbytes); @@ -558,7 +610,7 @@ static int gcm_decrypt(struct aead_request *req) static struct aead_alg gcm_aes_alg = { .ivsize = GCM_IV_SIZE, - .chunksize = AES_BLOCK_SIZE, + .chunksize = 2 * AES_BLOCK_SIZE, .maxauthsize = AES_BLOCK_SIZE, .setkey = gcm_setkey, .setauthsize = gcm_setauthsize, diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index efbeb3e0dcfb..17fac2889f56 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -99,7 +99,6 @@ static struct shash_alg alg = { .cra_name = "sha1", .cra_driver_name = "sha1-ce", .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index fd1ff2b13dfa..261f5195cab7 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -114,7 +114,6 @@ static struct shash_alg algs[] = { { .cra_name = "sha224", .cra_driver_name = "sha224-ce", .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -129,7 +128,6 @@ static struct shash_alg algs[] = { { .cra_name = "sha256", .cra_driver_name = "sha256-ce", .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c index e8880ccdc71f..4aedeaefd61f 100644 --- a/arch/arm64/crypto/sha256-glue.c +++ b/arch/arm64/crypto/sha256-glue.c @@ -67,8 +67,7 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha256_state), .base.cra_name = "sha256", .base.cra_driver_name = "sha256-arm64", - .base.cra_priority = 100, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_priority = 125, .base.cra_blocksize = SHA256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -80,8 +79,7 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha256_state), .base.cra_name = "sha224", .base.cra_driver_name = "sha224-arm64", - .base.cra_priority = 100, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_priority = 125, .base.cra_blocksize = SHA224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; @@ -153,7 +151,6 @@ static struct shash_alg neon_algs[] = { { .base.cra_name = "sha256", .base.cra_driver_name = "sha256-arm64-neon", .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -166,7 +163,6 @@ static struct shash_alg neon_algs[] = { { .base.cra_name = "sha224", .base.cra_driver_name = "sha224-arm64-neon", .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c index da8222e528bd..a336feac0f59 100644 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -105,7 +105,6 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha3_state), .base.cra_name = "sha3-224", .base.cra_driver_name = "sha3-224-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA3_224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, @@ -117,7 +116,6 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha3_state), .base.cra_name = "sha3-256", .base.cra_driver_name = "sha3-256-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA3_256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, @@ -129,7 +127,6 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha3_state), .base.cra_name = "sha3-384", .base.cra_driver_name = "sha3-384-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA3_384_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, @@ -141,7 +138,6 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha3_state), .base.cra_name = "sha3-512", .base.cra_driver_name = "sha3-512-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA3_512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c index a77c8632a589..f2c5f28c622a 100644 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -87,7 +87,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha384", .base.cra_driver_name = "sha384-ce", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -100,7 +99,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha512", .base.cra_driver_name = "sha512-ce", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c index 27db4851e380..325b23b43a9b 100644 --- a/arch/arm64/crypto/sha512-glue.c +++ b/arch/arm64/crypto/sha512-glue.c @@ -63,7 +63,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha512", .base.cra_driver_name = "sha512-arm64", .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -76,7 +75,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha384", .base.cra_driver_name = "sha384-arm64", .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA384_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c index 3b4948f7e26f..88938a20d9b2 100644 --- a/arch/arm64/crypto/sm3-ce-glue.c +++ b/arch/arm64/crypto/sm3-ce-glue.c @@ -72,7 +72,6 @@ static struct shash_alg sm3_alg = { .descsize = sizeof(struct sm3_state), .base.cra_name = "sm3", .base.cra_driver_name = "sm3-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SM3_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, |