diff options
Diffstat (limited to 'arch/arm64/crypto')
-rw-r--r-- | arch/arm64/crypto/aes-ce-ccm-core.S | 150 | ||||
-rw-r--r-- | arch/arm64/crypto/aes-glue.c | 3 | ||||
-rw-r--r-- | arch/arm64/crypto/ghash-ce-core.S | 319 | ||||
-rw-r--r-- | arch/arm64/crypto/ghash-ce-glue.c | 202 | ||||
-rw-r--r-- | arch/arm64/crypto/sha1-ce-glue.c | 1 | ||||
-rw-r--r-- | arch/arm64/crypto/sha2-ce-glue.c | 2 | ||||
-rw-r--r-- | arch/arm64/crypto/sha256-glue.c | 8 | ||||
-rw-r--r-- | arch/arm64/crypto/sha3-ce-glue.c | 4 | ||||
-rw-r--r-- | arch/arm64/crypto/sha512-ce-glue.c | 2 | ||||
-rw-r--r-- | arch/arm64/crypto/sha512-glue.c | 2 | ||||
-rw-r--r-- | arch/arm64/crypto/sm3-ce-glue.c | 1 |
11 files changed, 389 insertions, 305 deletions
diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S index 88f5aef7934c..e3a375c4cb83 100644 --- a/arch/arm64/crypto/aes-ce-ccm-core.S +++ b/arch/arm64/crypto/aes-ce-ccm-core.S @@ -19,33 +19,24 @@ * u32 *macp, u8 const rk[], u32 rounds); */ ENTRY(ce_aes_ccm_auth_data) - frame_push 7 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - mov x24, x5 - - ldr w25, [x22] /* leftover from prev round? */ + ldr w8, [x3] /* leftover from prev round? */ ld1 {v0.16b}, [x0] /* load mac */ - cbz w25, 1f - sub w25, w25, #16 + cbz w8, 1f + sub w8, w8, #16 eor v1.16b, v1.16b, v1.16b -0: ldrb w7, [x20], #1 /* get 1 byte of input */ - subs w21, w21, #1 - add w25, w25, #1 +0: ldrb w7, [x1], #1 /* get 1 byte of input */ + subs w2, w2, #1 + add w8, w8, #1 ins v1.b[0], w7 ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */ beq 8f /* out of input? */ - cbnz w25, 0b + cbnz w8, 0b eor v0.16b, v0.16b, v1.16b -1: ld1 {v3.4s}, [x23] /* load first round key */ - prfm pldl1strm, [x20] - cmp w24, #12 /* which key size? */ - add x6, x23, #16 - sub w7, w24, #2 /* modified # of rounds */ +1: ld1 {v3.4s}, [x4] /* load first round key */ + prfm pldl1strm, [x1] + cmp w5, #12 /* which key size? */ + add x6, x4, #16 + sub w7, w5, #2 /* modified # of rounds */ bmi 2f bne 5f mov v5.16b, v3.16b @@ -64,43 +55,33 @@ ENTRY(ce_aes_ccm_auth_data) ld1 {v5.4s}, [x6], #16 /* load next round key */ bpl 3b aese v0.16b, v4.16b - subs w21, w21, #16 /* last data? */ + subs w2, w2, #16 /* last data? */ eor v0.16b, v0.16b, v5.16b /* final round */ bmi 6f - ld1 {v1.16b}, [x20], #16 /* load next input block */ + ld1 {v1.16b}, [x1], #16 /* load next input block */ eor v0.16b, v0.16b, v1.16b /* xor with mac */ - beq 6f - - if_will_cond_yield_neon - st1 {v0.16b}, [x19] /* store mac */ - do_cond_yield_neon - ld1 {v0.16b}, [x19] /* reload mac */ - endif_yield_neon - - b 1b -6: st1 {v0.16b}, [x19] /* store mac */ + bne 1b +6: st1 {v0.16b}, [x0] /* store mac */ beq 10f - adds w21, w21, #16 + adds w2, w2, #16 beq 10f - mov w25, w21 -7: ldrb w7, [x20], #1 + mov w8, w2 +7: ldrb w7, [x1], #1 umov w6, v0.b[0] eor w6, w6, w7 - strb w6, [x19], #1 - subs w21, w21, #1 + strb w6, [x0], #1 + subs w2, w2, #1 beq 10f ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */ b 7b -8: mov w7, w25 - add w25, w25, #16 +8: mov w7, w8 + add w8, w8, #16 9: ext v1.16b, v1.16b, v1.16b, #1 adds w7, w7, #1 bne 9b eor v0.16b, v0.16b, v1.16b - st1 {v0.16b}, [x19] -10: str w25, [x22] - - frame_pop + st1 {v0.16b}, [x0] +10: str w8, [x3] ret ENDPROC(ce_aes_ccm_auth_data) @@ -145,29 +126,19 @@ ENTRY(ce_aes_ccm_final) ENDPROC(ce_aes_ccm_final) .macro aes_ccm_do_crypt,enc - frame_push 8 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - mov x24, x5 - mov x25, x6 - - ldr x26, [x25, #8] /* load lower ctr */ - ld1 {v0.16b}, [x24] /* load mac */ -CPU_LE( rev x26, x26 ) /* keep swabbed ctr in reg */ + ldr x8, [x6, #8] /* load lower ctr */ + ld1 {v0.16b}, [x5] /* load mac */ +CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */ 0: /* outer loop */ - ld1 {v1.8b}, [x25] /* load upper ctr */ - prfm pldl1strm, [x20] - add x26, x26, #1 - rev x9, x26 - cmp w23, #12 /* which key size? */ - sub w7, w23, #2 /* get modified # of rounds */ + ld1 {v1.8b}, [x6] /* load upper ctr */ + prfm pldl1strm, [x1] + add x8, x8, #1 + rev x9, x8 + cmp w4, #12 /* which key size? */ + sub w7, w4, #2 /* get modified # of rounds */ ins v1.d[1], x9 /* no carry in lower ctr */ - ld1 {v3.4s}, [x22] /* load first round key */ - add x10, x22, #16 + ld1 {v3.4s}, [x3] /* load first round key */ + add x10, x3, #16 bmi 1f bne 4f mov v5.16b, v3.16b @@ -194,9 +165,9 @@ CPU_LE( rev x26, x26 ) /* keep swabbed ctr in reg */ bpl 2b aese v0.16b, v4.16b aese v1.16b, v4.16b - subs w21, w21, #16 - bmi 7f /* partial block? */ - ld1 {v2.16b}, [x20], #16 /* load next input block */ + subs w2, w2, #16 + bmi 6f /* partial block? */ + ld1 {v2.16b}, [x1], #16 /* load next input block */ .if \enc == 1 eor v2.16b, v2.16b, v5.16b /* final round enc+mac */ eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */ @@ -205,29 +176,18 @@ CPU_LE( rev x26, x26 ) /* keep swabbed ctr in reg */ eor v1.16b, v2.16b, v5.16b /* final round enc */ .endif eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */ - st1 {v1.16b}, [x19], #16 /* write output block */ - beq 5f - - if_will_cond_yield_neon - st1 {v0.16b}, [x24] /* store mac */ - do_cond_yield_neon - ld1 {v0.16b}, [x24] /* reload mac */ - endif_yield_neon - - b 0b -5: -CPU_LE( rev x26, x26 ) - st1 {v0.16b}, [x24] /* store mac */ - str x26, [x25, #8] /* store lsb end of ctr (BE) */ - -6: frame_pop - ret - -7: eor v0.16b, v0.16b, v5.16b /* final round mac */ + st1 {v1.16b}, [x0], #16 /* write output block */ + bne 0b +CPU_LE( rev x8, x8 ) + st1 {v0.16b}, [x5] /* store mac */ + str x8, [x6, #8] /* store lsb end of ctr (BE) */ +5: ret + +6: eor v0.16b, v0.16b, v5.16b /* final round mac */ eor v1.16b, v1.16b, v5.16b /* final round enc */ - st1 {v0.16b}, [x24] /* store mac */ - add w21, w21, #16 /* process partial tail block */ -8: ldrb w9, [x20], #1 /* get 1 byte of input */ + st1 {v0.16b}, [x5] /* store mac */ + add w2, w2, #16 /* process partial tail block */ +7: ldrb w9, [x1], #1 /* get 1 byte of input */ umov w6, v1.b[0] /* get top crypted ctr byte */ umov w7, v0.b[0] /* get top mac byte */ .if \enc == 1 @@ -237,13 +197,13 @@ CPU_LE( rev x26, x26 ) eor w9, w9, w6 eor w7, w7, w9 .endif - strb w9, [x19], #1 /* store out byte */ - strb w7, [x24], #1 /* store mac byte */ - subs w21, w21, #1 - beq 6b + strb w9, [x0], #1 /* store out byte */ + strb w7, [x5], #1 /* store mac byte */ + subs w2, w2, #1 + beq 5b ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */ ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */ - b 8b + b 7b .endm /* diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index e3e50950a863..adcb83eb683c 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -567,7 +567,6 @@ static struct shash_alg mac_algs[] = { { .base.cra_name = "cmac(aes)", .base.cra_driver_name = "cmac-aes-" MODE, .base.cra_priority = PRIO, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + 2 * AES_BLOCK_SIZE, @@ -583,7 +582,6 @@ static struct shash_alg mac_algs[] = { { .base.cra_name = "xcbc(aes)", .base.cra_driver_name = "xcbc-aes-" MODE, .base.cra_priority = PRIO, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + 2 * AES_BLOCK_SIZE, @@ -599,7 +597,6 @@ static struct shash_alg mac_algs[] = { { .base.cra_name = "cbcmac(aes)", .base.cra_driver_name = "cbcmac-aes-" MODE, .base.cra_priority = PRIO, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx), .base.cra_module = THIS_MODULE, diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index dcffb9e77589..1b319b716d5e 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -1,7 +1,7 @@ /* * Accelerated GHASH implementation with ARMv8 PMULL instructions. * - * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -46,6 +46,19 @@ ss3 .req v26 ss4 .req v27 + XL2 .req v8 + XM2 .req v9 + XH2 .req v10 + XL3 .req v11 + XM3 .req v12 + XH3 .req v13 + TT3 .req v14 + TT4 .req v15 + HH .req v16 + HH3 .req v17 + HH4 .req v18 + HH34 .req v19 + .text .arch armv8-a+crypto @@ -134,11 +147,25 @@ .endm .macro __pmull_pre_p64 + add x8, x3, #16 + ld1 {HH.2d-HH4.2d}, [x8] + + trn1 SHASH2.2d, SHASH.2d, HH.2d + trn2 T1.2d, SHASH.2d, HH.2d + eor SHASH2.16b, SHASH2.16b, T1.16b + + trn1 HH34.2d, HH3.2d, HH4.2d + trn2 T1.2d, HH3.2d, HH4.2d + eor HH34.16b, HH34.16b, T1.16b + movi MASK.16b, #0xe1 shl MASK.2d, MASK.2d, #57 .endm .macro __pmull_pre_p8 + ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 + eor SHASH2.16b, SHASH2.16b, SHASH.16b + // k00_16 := 0x0000000000000000_000000000000ffff // k32_48 := 0x00000000ffffffff_0000ffffffffffff movi k32_48.2d, #0xffffffff @@ -213,31 +240,88 @@ .endm .macro __pmull_ghash, pn - frame_push 5 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - -0: ld1 {SHASH.2d}, [x22] - ld1 {XL.2d}, [x20] - ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 - eor SHASH2.16b, SHASH2.16b, SHASH.16b + ld1 {SHASH.2d}, [x3] + ld1 {XL.2d}, [x1] __pmull_pre_\pn /* do the head block first, if supplied */ - cbz x23, 1f - ld1 {T1.2d}, [x23] - mov x23, xzr - b 2f + cbz x4, 0f + ld1 {T1.2d}, [x4] + mov x4, xzr + b 3f + +0: .ifc \pn, p64 + tbnz w0, #0, 2f // skip until #blocks is a + tbnz w0, #1, 2f // round multiple of 4 + +1: ld1 {XM3.16b-TT4.16b}, [x2], #64 + + sub w0, w0, #4 + + rev64 T1.16b, XM3.16b + rev64 T2.16b, XH3.16b + rev64 TT4.16b, TT4.16b + rev64 TT3.16b, TT3.16b + + ext IN1.16b, TT4.16b, TT4.16b, #8 + ext XL3.16b, TT3.16b, TT3.16b, #8 + + eor TT4.16b, TT4.16b, IN1.16b + pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1 + pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0 + pmull XM2.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0) + + eor TT3.16b, TT3.16b, XL3.16b + pmull2 XH3.1q, HH.2d, XL3.2d // a1 * b1 + pmull XL3.1q, HH.1d, XL3.1d // a0 * b0 + pmull2 XM3.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0) + + ext IN1.16b, T2.16b, T2.16b, #8 + eor XL2.16b, XL2.16b, XL3.16b + eor XH2.16b, XH2.16b, XH3.16b + eor XM2.16b, XM2.16b, XM3.16b + + eor T2.16b, T2.16b, IN1.16b + pmull2 XH3.1q, HH3.2d, IN1.2d // a1 * b1 + pmull XL3.1q, HH3.1d, IN1.1d // a0 * b0 + pmull XM3.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0) -1: ld1 {T1.2d}, [x21], #16 - sub w19, w19, #1 + eor XL2.16b, XL2.16b, XL3.16b + eor XH2.16b, XH2.16b, XH3.16b + eor XM2.16b, XM2.16b, XM3.16b -2: /* multiply XL by SHASH in GF(2^128) */ + ext IN1.16b, T1.16b, T1.16b, #8 + ext TT3.16b, XL.16b, XL.16b, #8 + eor XL.16b, XL.16b, IN1.16b + eor T1.16b, T1.16b, TT3.16b + + pmull2 XH.1q, HH4.2d, XL.2d // a1 * b1 + eor T1.16b, T1.16b, XL.16b + pmull XL.1q, HH4.1d, XL.1d // a0 * b0 + pmull2 XM.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0) + + eor XL.16b, XL.16b, XL2.16b + eor XH.16b, XH.16b, XH2.16b + eor XM.16b, XM.16b, XM2.16b + + eor T2.16b, XL.16b, XH.16b + ext T1.16b, XL.16b, XH.16b, #8 + eor XM.16b, XM.16b, T2.16b + + __pmull_reduce_p64 + + eor T2.16b, T2.16b, XH.16b + eor XL.16b, XL.16b, T2.16b + + cbz w0, 5f + b 1b + .endif + +2: ld1 {T1.2d}, [x2], #16 + sub w0, w0, #1 + +3: /* multiply XL by SHASH in GF(2^128) */ CPU_LE( rev64 T1.16b, T1.16b ) ext T2.16b, XL.16b, XL.16b, #8 @@ -250,7 +334,7 @@ CPU_LE( rev64 T1.16b, T1.16b ) __pmull_\pn XL, XL, SHASH // a0 * b0 __pmull_\pn XM, T1, SHASH2 // (a1 + a0)(b1 + b0) - eor T2.16b, XL.16b, XH.16b +4: eor T2.16b, XL.16b, XH.16b ext T1.16b, XL.16b, XH.16b, #8 eor XM.16b, XM.16b, T2.16b @@ -259,18 +343,9 @@ CPU_LE( rev64 T1.16b, T1.16b ) eor T2.16b, T2.16b, XH.16b eor XL.16b, XL.16b, T2.16b - cbz w19, 3f + cbnz w0, 0b - if_will_cond_yield_neon - st1 {XL.2d}, [x20] - do_cond_yield_neon - b 0b - endif_yield_neon - - b 1b - -3: st1 {XL.2d}, [x20] - frame_pop +5: st1 {XL.2d}, [x1] ret .endm @@ -286,9 +361,10 @@ ENTRY(pmull_ghash_update_p8) __pmull_ghash p8 ENDPROC(pmull_ghash_update_p8) - KS .req v8 - CTR .req v9 - INP .req v10 + KS0 .req v12 + KS1 .req v13 + INP0 .req v14 + INP1 .req v15 .macro load_round_keys, rounds, rk cmp \rounds, #12 @@ -322,142 +398,153 @@ ENDPROC(pmull_ghash_update_p8) .endm .macro pmull_gcm_do_crypt, enc - frame_push 10 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - mov x24, x5 - mov x25, x6 - mov x26, x7 - .if \enc == 1 - ldr x27, [sp, #96] // first stacked arg - .endif - - ldr x28, [x24, #8] // load lower counter -CPU_LE( rev x28, x28 ) - -0: mov x0, x25 - load_round_keys w26, x0 - ld1 {SHASH.2d}, [x23] - ld1 {XL.2d}, [x20] + ld1 {SHASH.2d}, [x4], #16 + ld1 {HH.2d}, [x4] + ld1 {XL.2d}, [x1] + ldr x8, [x5, #8] // load lower counter movi MASK.16b, #0xe1 - ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 + trn1 SHASH2.2d, SHASH.2d, HH.2d + trn2 T1.2d, SHASH.2d, HH.2d +CPU_LE( rev x8, x8 ) shl MASK.2d, MASK.2d, #57 - eor SHASH2.16b, SHASH2.16b, SHASH.16b + eor SHASH2.16b, SHASH2.16b, T1.16b .if \enc == 1 - ld1 {KS.16b}, [x27] + ldr x10, [sp] + ld1 {KS0.16b-KS1.16b}, [x10] .endif -1: ld1 {CTR.8b}, [x24] // load upper counter - ld1 {INP.16b}, [x22], #16 - rev x9, x28 - add x28, x28, #1 - sub w19, w19, #1 - ins CTR.d[1], x9 // set lower counter + cbnz x6, 4f + +0: ld1 {INP0.16b-INP1.16b}, [x3], #32 + + rev x9, x8 + add x11, x8, #1 + add x8, x8, #2 .if \enc == 1 - eor INP.16b, INP.16b, KS.16b // encrypt input - st1 {INP.16b}, [x21], #16 + eor INP0.16b, INP0.16b, KS0.16b // encrypt input + eor INP1.16b, INP1.16b, KS1.16b .endif - rev64 T1.16b, INP.16b + ld1 {KS0.8b}, [x5] // load upper counter + rev x11, x11 + sub w0, w0, #2 + mov KS1.8b, KS0.8b + ins KS0.d[1], x9 // set lower counter + ins KS1.d[1], x11 - cmp w26, #12 - b.ge 4f // AES-192/256? + rev64 T1.16b, INP1.16b -2: enc_round CTR, v21 + cmp w7, #12 + b.ge 2f // AES-192/256? - ext T2.16b, XL.16b, XL.16b, #8 +1: enc_round KS0, v21 ext IN1.16b, T1.16b, T1.16b, #8 - enc_round CTR, v22 + enc_round KS1, v21 + pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1 + + enc_round KS0, v22 + eor T1.16b, T1.16b, IN1.16b + + enc_round KS1, v22 + pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0 + enc_round KS0, v23 + pmull XM2.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) + + enc_round KS1, v23 + rev64 T1.16b, INP0.16b + ext T2.16b, XL.16b, XL.16b, #8 + + enc_round KS0, v24 + ext IN1.16b, T1.16b, T1.16b, #8 eor T1.16b, T1.16b, T2.16b - eor XL.16b, XL.16b, IN1.16b - enc_round CTR, v23 + enc_round KS1, v24 + eor XL.16b, XL.16b, IN1.16b - pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1 + enc_round KS0, v25 eor T1.16b, T1.16b, XL.16b - enc_round CTR, v24 + enc_round KS1, v25 + pmull2 XH.1q, HH.2d, XL.2d // a1 * b1 + + enc_round KS0, v26 + pmull XL.1q, HH.1d, XL.1d // a0 * b0 - pmull XL.1q, SHASH.1d, XL.1d // a0 * b0 - pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) + enc_round KS1, v26 + pmull2 XM.1q, SHASH2.2d, T1.2d // (a1 + a0)(b1 + b0) - enc_round CTR, v25 + enc_round KS0, v27 + eor XL.16b, XL.16b, XL2.16b + eor XH.16b, XH.16b, XH2.16b + enc_round KS1, v27 + eor XM.16b, XM.16b, XM2.16b ext T1.16b, XL.16b, XH.16b, #8 + + enc_round KS0, v28 eor T2.16b, XL.16b, XH.16b eor XM.16b, XM.16b, T1.16b - enc_round CTR, v26 - + enc_round KS1, v28 eor XM.16b, XM.16b, T2.16b - pmull T2.1q, XL.1d, MASK.1d - enc_round CTR, v27 + enc_round KS0, v29 + pmull T2.1q, XL.1d, MASK.1d + enc_round KS1, v29 mov XH.d[0], XM.d[1] mov XM.d[1], XL.d[0] - enc_round CTR, v28 - + aese KS0.16b, v30.16b eor XL.16b, XM.16b, T2.16b - enc_round CTR, v29 - + aese KS1.16b, v30.16b ext T2.16b, XL.16b, XL.16b, #8 - aese CTR.16b, v30.16b - + eor KS0.16b, KS0.16b, v31.16b pmull XL.1q, XL.1d, MASK.1d eor T2.16b, T2.16b, XH.16b - eor KS.16b, CTR.16b, v31.16b - + eor KS1.16b, KS1.16b, v31.16b eor XL.16b, XL.16b, T2.16b .if \enc == 0 - eor INP.16b, INP.16b, KS.16b - st1 {INP.16b}, [x21], #16 + eor INP0.16b, INP0.16b, KS0.16b + eor INP1.16b, INP1.16b, KS1.16b .endif - cbz w19, 3f + st1 {INP0.16b-INP1.16b}, [x2], #32 - if_will_cond_yield_neon - st1 {XL.2d}, [x20] - .if \enc == 1 - st1 {KS.16b}, [x27] - .endif - do_cond_yield_neon - b 0b - endif_yield_neon + cbnz w0, 0b - b 1b +CPU_LE( rev x8, x8 ) + st1 {XL.2d}, [x1] + str x8, [x5, #8] // store lower counter -3: st1 {XL.2d}, [x20] .if \enc == 1 - st1 {KS.16b}, [x27] + st1 {KS0.16b-KS1.16b}, [x10] .endif -CPU_LE( rev x28, x28 ) - str x28, [x24, #8] // store lower counter - - frame_pop ret -4: b.eq 5f // AES-192? - enc_round CTR, v17 - enc_round CTR, v18 -5: enc_round CTR, v19 - enc_round CTR, v20 - b 2b +2: b.eq 3f // AES-192? + enc_round KS0, v17 + enc_round KS1, v17 + enc_round KS0, v18 + enc_round KS1, v18 +3: enc_round KS0, v19 + enc_round KS1, v19 + enc_round KS0, v20 + enc_round KS1, v20 + b 1b + +4: load_round_keys w7, x6 + b 0b .endm /* diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 7cf0b1aa6ea8..6e9f33d14930 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -1,7 +1,7 @@ /* * Accelerated GHASH implementation with ARMv8 PMULL instructions. * - * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -33,9 +33,12 @@ MODULE_ALIAS_CRYPTO("ghash"); #define GCM_IV_SIZE 12 struct ghash_key { - u64 a; - u64 b; - be128 k; + u64 h[2]; + u64 h2[2]; + u64 h3[2]; + u64 h4[2]; + + be128 k; }; struct ghash_desc_ctx { @@ -113,6 +116,9 @@ static void ghash_do_update(int blocks, u64 dg[], const char *src, } } +/* avoid hogging the CPU for too long */ +#define MAX_BLOCKS (SZ_64K / GHASH_BLOCK_SIZE) + static int ghash_update(struct shash_desc *desc, const u8 *src, unsigned int len) { @@ -136,11 +142,16 @@ static int ghash_update(struct shash_desc *desc, const u8 *src, blocks = len / GHASH_BLOCK_SIZE; len %= GHASH_BLOCK_SIZE; - ghash_do_update(blocks, ctx->digest, src, key, - partial ? ctx->buf : NULL); + do { + int chunk = min(blocks, MAX_BLOCKS); - src += blocks * GHASH_BLOCK_SIZE; - partial = 0; + ghash_do_update(chunk, ctx->digest, src, key, + partial ? ctx->buf : NULL); + + blocks -= chunk; + src += chunk * GHASH_BLOCK_SIZE; + partial = 0; + } while (unlikely(blocks > 0)); } if (len) memcpy(ctx->buf + partial, src, len); @@ -166,23 +177,36 @@ static int ghash_final(struct shash_desc *desc, u8 *dst) return 0; } +static void ghash_reflect(u64 h[], const be128 *k) +{ + u64 carry = be64_to_cpu(k->a) & BIT(63) ? 1 : 0; + + h[0] = (be64_to_cpu(k->b) << 1) | carry; + h[1] = (be64_to_cpu(k->a) << 1) | (be64_to_cpu(k->b) >> 63); + + if (carry) + h[1] ^= 0xc200000000000000UL; +} + static int __ghash_setkey(struct ghash_key *key, const u8 *inkey, unsigned int keylen) { - u64 a, b; + be128 h; /* needed for the fallback */ memcpy(&key->k, inkey, GHASH_BLOCK_SIZE); - /* perform multiplication by 'x' in GF(2^128) */ - b = get_unaligned_be64(inkey); - a = get_unaligned_be64(inkey + 8); + ghash_reflect(key->h, &key->k); - key->a = (a << 1) | (b >> 63); - key->b = (b << 1) | (a >> 63); + h = key->k; + gf128mul_lle(&h, &key->k); + ghash_reflect(key->h2, &h); - if (b >> 63) - key->b ^= 0xc200000000000000UL; + gf128mul_lle(&h, &key->k); + ghash_reflect(key->h3, &h); + + gf128mul_lle(&h, &key->k); + ghash_reflect(key->h4, &h); return 0; } @@ -204,7 +228,6 @@ static struct shash_alg ghash_alg = { .base.cra_name = "ghash", .base.cra_driver_name = "ghash-ce", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = GHASH_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct ghash_key), .base.cra_module = THIS_MODULE, @@ -245,7 +268,7 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, __aes_arm64_encrypt(ctx->aes_key.key_enc, key, (u8[AES_BLOCK_SIZE]){}, num_rounds(&ctx->aes_key)); - return __ghash_setkey(&ctx->ghash_key, key, sizeof(key)); + return __ghash_setkey(&ctx->ghash_key, key, sizeof(be128)); } static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) @@ -349,9 +372,10 @@ static int gcm_encrypt(struct aead_request *req) struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); struct skcipher_walk walk; u8 iv[AES_BLOCK_SIZE]; - u8 ks[AES_BLOCK_SIZE]; + u8 ks[2 * AES_BLOCK_SIZE]; u8 tag[AES_BLOCK_SIZE]; u64 dg[2] = {}; + int nrounds = num_rounds(&ctx->aes_key); int err; if (req->assoclen) @@ -360,39 +384,39 @@ static int gcm_encrypt(struct aead_request *req) memcpy(iv, req->iv, GCM_IV_SIZE); put_unaligned_be32(1, iv + GCM_IV_SIZE); - if (likely(may_use_simd())) { - kernel_neon_begin(); + err = skcipher_walk_aead_encrypt(&walk, req, false); - pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key)); + if (likely(may_use_simd() && walk.total >= 2 * AES_BLOCK_SIZE)) { + u32 const *rk = NULL; + + kernel_neon_begin(); + pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); - pmull_gcm_encrypt_block(ks, iv, NULL, - num_rounds(&ctx->aes_key)); + pmull_gcm_encrypt_block(ks, iv, NULL, nrounds); put_unaligned_be32(3, iv + GCM_IV_SIZE); - kernel_neon_end(); + pmull_gcm_encrypt_block(ks + AES_BLOCK_SIZE, iv, NULL, nrounds); + put_unaligned_be32(4, iv + GCM_IV_SIZE); - err = skcipher_walk_aead_encrypt(&walk, req, false); + do { + int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; - while (walk.nbytes >= AES_BLOCK_SIZE) { - int blocks = walk.nbytes / AES_BLOCK_SIZE; + if (rk) + kernel_neon_begin(); - kernel_neon_begin(); pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr, walk.src.virt.addr, &ctx->ghash_key, - iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key), ks); + iv, rk, nrounds, ks); kernel_neon_end(); err = skcipher_walk_done(&walk, - walk.nbytes % AES_BLOCK_SIZE); - } + walk.nbytes % (2 * AES_BLOCK_SIZE)); + + rk = ctx->aes_key.key_enc; + } while (walk.nbytes >= 2 * AES_BLOCK_SIZE); } else { - __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, - num_rounds(&ctx->aes_key)); + __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); - err = skcipher_walk_aead_encrypt(&walk, req, false); - while (walk.nbytes >= AES_BLOCK_SIZE) { int blocks = walk.nbytes / AES_BLOCK_SIZE; u8 *dst = walk.dst.virt.addr; @@ -400,8 +424,7 @@ static int gcm_encrypt(struct aead_request *req) do { __aes_arm64_encrypt(ctx->aes_key.key_enc, - ks, iv, - num_rounds(&ctx->aes_key)); + ks, iv, nrounds); crypto_xor_cpy(dst, src, ks, AES_BLOCK_SIZE); crypto_inc(iv, AES_BLOCK_SIZE); @@ -418,19 +441,28 @@ static int gcm_encrypt(struct aead_request *req) } if (walk.nbytes) __aes_arm64_encrypt(ctx->aes_key.key_enc, ks, iv, - num_rounds(&ctx->aes_key)); + nrounds); } /* handle the tail */ if (walk.nbytes) { u8 buf[GHASH_BLOCK_SIZE]; + unsigned int nbytes = walk.nbytes; + u8 *dst = walk.dst.virt.addr; + u8 *head = NULL; crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, ks, walk.nbytes); - memcpy(buf, walk.dst.virt.addr, walk.nbytes); - memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes); - ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL); + if (walk.nbytes > GHASH_BLOCK_SIZE) { + head = dst; + dst += GHASH_BLOCK_SIZE; + nbytes %= GHASH_BLOCK_SIZE; + } + + memcpy(buf, dst, nbytes); + memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes); + ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head); err = skcipher_walk_done(&walk, 0); } @@ -453,10 +485,11 @@ static int gcm_decrypt(struct aead_request *req) struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); unsigned int authsize = crypto_aead_authsize(aead); struct skcipher_walk walk; - u8 iv[AES_BLOCK_SIZE]; + u8 iv[2 * AES_BLOCK_SIZE]; u8 tag[AES_BLOCK_SIZE]; - u8 buf[GHASH_BLOCK_SIZE]; + u8 buf[2 * GHASH_BLOCK_SIZE]; u64 dg[2] = {}; + int nrounds = num_rounds(&ctx->aes_key); int err; if (req->assoclen) @@ -465,39 +498,53 @@ static int gcm_decrypt(struct aead_request *req) memcpy(iv, req->iv, GCM_IV_SIZE); put_unaligned_be32(1, iv + GCM_IV_SIZE); - if (likely(may_use_simd())) { - kernel_neon_begin(); + err = skcipher_walk_aead_decrypt(&walk, req, false); + + if (likely(may_use_simd() && walk.total >= 2 * AES_BLOCK_SIZE)) { + u32 const *rk = NULL; - pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key)); + kernel_neon_begin(); + pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); - kernel_neon_end(); - err = skcipher_walk_aead_decrypt(&walk, req, false); + do { + int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; + int rem = walk.total - blocks * AES_BLOCK_SIZE; - while (walk.nbytes >= AES_BLOCK_SIZE) { - int blocks = walk.nbytes / AES_BLOCK_SIZE; + if (rk) + kernel_neon_begin(); - kernel_neon_begin(); pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr, walk.src.virt.addr, &ctx->ghash_key, - iv, ctx->aes_key.key_enc, - num_rounds(&ctx->aes_key)); + iv, rk, nrounds); + + /* check if this is the final iteration of the loop */ + if (rem < (2 * AES_BLOCK_SIZE)) { + u8 *iv2 = iv + AES_BLOCK_SIZE; + + if (rem > AES_BLOCK_SIZE) { + memcpy(iv2, iv, AES_BLOCK_SIZE); + crypto_inc(iv2, AES_BLOCK_SIZE); + } + + pmull_gcm_encrypt_block(iv, iv, NULL, nrounds); + + if (rem > AES_BLOCK_SIZE) + pmull_gcm_encrypt_block(iv2, iv2, NULL, + nrounds); + } + kernel_neon_end(); err = skcipher_walk_done(&walk, - walk.nbytes % AES_BLOCK_SIZE); - } - if (walk.nbytes) - pmull_gcm_encrypt_block(iv, iv, NULL, - num_rounds(&ctx->aes_key)); + walk.nbytes % (2 * AES_BLOCK_SIZE)); + + rk = ctx->aes_key.key_enc; + } while (walk.nbytes >= 2 * AES_BLOCK_SIZE); } else { - __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, - num_rounds(&ctx->aes_key)); + __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, nrounds); put_unaligned_be32(2, iv + GCM_IV_SIZE); - err = skcipher_walk_aead_decrypt(&walk, req, false); - while (walk.nbytes >= AES_BLOCK_SIZE) { int blocks = walk.nbytes / AES_BLOCK_SIZE; u8 *dst = walk.dst.virt.addr; @@ -508,8 +555,7 @@ static int gcm_decrypt(struct aead_request *req) do { __aes_arm64_encrypt(ctx->aes_key.key_enc, - buf, iv, - num_rounds(&ctx->aes_key)); + buf, iv, nrounds); crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE); crypto_inc(iv, AES_BLOCK_SIZE); @@ -522,14 +568,24 @@ static int gcm_decrypt(struct aead_request *req) } if (walk.nbytes) __aes_arm64_encrypt(ctx->aes_key.key_enc, iv, iv, - num_rounds(&ctx->aes_key)); + nrounds); } /* handle the tail */ if (walk.nbytes) { - memcpy(buf, walk.src.virt.addr, walk.nbytes); - memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes); - ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL); + const u8 *src = walk.src.virt.addr; + const u8 *head = NULL; + unsigned int nbytes = walk.nbytes; + + if (walk.nbytes > GHASH_BLOCK_SIZE) { + head = src; + src += GHASH_BLOCK_SIZE; + nbytes %= GHASH_BLOCK_SIZE; + } + + memcpy(buf, src, nbytes); + memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes); + ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head); crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, iv, walk.nbytes); @@ -554,7 +610,7 @@ static int gcm_decrypt(struct aead_request *req) static struct aead_alg gcm_aes_alg = { .ivsize = GCM_IV_SIZE, - .chunksize = AES_BLOCK_SIZE, + .chunksize = 2 * AES_BLOCK_SIZE, .maxauthsize = AES_BLOCK_SIZE, .setkey = gcm_setkey, .setauthsize = gcm_setauthsize, diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index efbeb3e0dcfb..17fac2889f56 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -99,7 +99,6 @@ static struct shash_alg alg = { .cra_name = "sha1", .cra_driver_name = "sha1-ce", .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index fd1ff2b13dfa..261f5195cab7 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -114,7 +114,6 @@ static struct shash_alg algs[] = { { .cra_name = "sha224", .cra_driver_name = "sha224-ce", .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -129,7 +128,6 @@ static struct shash_alg algs[] = { { .cra_name = "sha256", .cra_driver_name = "sha256-ce", .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c index e8880ccdc71f..4aedeaefd61f 100644 --- a/arch/arm64/crypto/sha256-glue.c +++ b/arch/arm64/crypto/sha256-glue.c @@ -67,8 +67,7 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha256_state), .base.cra_name = "sha256", .base.cra_driver_name = "sha256-arm64", - .base.cra_priority = 100, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_priority = 125, .base.cra_blocksize = SHA256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -80,8 +79,7 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha256_state), .base.cra_name = "sha224", .base.cra_driver_name = "sha224-arm64", - .base.cra_priority = 100, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_priority = 125, .base.cra_blocksize = SHA224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; @@ -153,7 +151,6 @@ static struct shash_alg neon_algs[] = { { .base.cra_name = "sha256", .base.cra_driver_name = "sha256-arm64-neon", .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -166,7 +163,6 @@ static struct shash_alg neon_algs[] = { { .base.cra_name = "sha224", .base.cra_driver_name = "sha224-arm64-neon", .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c index da8222e528bd..a336feac0f59 100644 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -105,7 +105,6 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha3_state), .base.cra_name = "sha3-224", .base.cra_driver_name = "sha3-224-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA3_224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, @@ -117,7 +116,6 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha3_state), .base.cra_name = "sha3-256", .base.cra_driver_name = "sha3-256-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA3_256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, @@ -129,7 +127,6 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha3_state), .base.cra_name = "sha3-384", .base.cra_driver_name = "sha3-384-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA3_384_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, @@ -141,7 +138,6 @@ static struct shash_alg algs[] = { { .descsize = sizeof(struct sha3_state), .base.cra_name = "sha3-512", .base.cra_driver_name = "sha3-512-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA3_512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c index a77c8632a589..f2c5f28c622a 100644 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -87,7 +87,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha384", .base.cra_driver_name = "sha384-ce", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -100,7 +99,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha512", .base.cra_driver_name = "sha512-ce", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c index 27db4851e380..325b23b43a9b 100644 --- a/arch/arm64/crypto/sha512-glue.c +++ b/arch/arm64/crypto/sha512-glue.c @@ -63,7 +63,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha512", .base.cra_driver_name = "sha512-arm64", .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -76,7 +75,6 @@ static struct shash_alg algs[] = { { .base.cra_name = "sha384", .base.cra_driver_name = "sha384-arm64", .base.cra_priority = 150, - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SHA384_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c index 3b4948f7e26f..88938a20d9b2 100644 --- a/arch/arm64/crypto/sm3-ce-glue.c +++ b/arch/arm64/crypto/sm3-ce-glue.c @@ -72,7 +72,6 @@ static struct shash_alg sm3_alg = { .descsize = sizeof(struct sm3_state), .base.cra_name = "sm3", .base.cra_driver_name = "sm3-ce", - .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, .base.cra_blocksize = SM3_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, |