diff options
Diffstat (limited to 'arch/x86/crypto')
82 files changed, 8401 insertions, 7484 deletions
diff --git a/arch/x86/crypto/.gitignore b/arch/x86/crypto/.gitignore new file mode 100644 index 000000000000..30be0400a439 --- /dev/null +++ b/arch/x86/crypto/.gitignore @@ -0,0 +1 @@ +poly1305-x86_64-cryptogams.S diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 45734e1cf967..b69e00bf20b8 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -14,11 +14,9 @@ sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no) obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o -obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o -obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_DES3_EDE_X86_64) += des3_ede-x86_64.o obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o @@ -38,17 +36,10 @@ obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o -obj-$(CONFIG_CRYPTO_AEGIS128L_AESNI_SSE2) += aegis128l-aesni.o -obj-$(CONFIG_CRYPTO_AEGIS256_AESNI_SSE2) += aegis256-aesni.o - -obj-$(CONFIG_CRYPTO_MORUS640_GLUE) += morus640_glue.o -obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o - -obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o -obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o +obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) @@ -58,21 +49,18 @@ ifeq ($(avx_supported),yes) obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o + obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o endif # These modules require assembler to support AVX2. ifeq ($(avx2_supported),yes) obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o - - obj-$(CONFIG_CRYPTO_MORUS1280_AVX2) += morus1280-avx2.o endif -aes-i586-y := aes-i586-asm_32.o aes_glue.o twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o -aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o @@ -82,13 +70,13 @@ chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o -aegis128l-aesni-y := aegis128l-aesni-asm.o aegis128l-aesni-glue.o -aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o - -morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o -morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o +blake2s-x86_64-y := blake2s-core.o blake2s-glue.o +poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o +ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),) +targets += poly1305-x86_64-cryptogams.S +endif ifeq ($(avx_supported),yes) camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ @@ -106,8 +94,6 @@ ifeq ($(avx2_supported),yes) chacha-x86_64-y += chacha-avx2-x86_64.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o - morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o - nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o endif @@ -119,10 +105,8 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o -poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o ifeq ($(avx2_supported),yes) sha1-ssse3-y += sha1_avx2_x86_64_asm.o -poly1305-x86_64-y += poly1305-avx2-x86_64.o endif ifeq ($(sha1_ni_supported),yes) sha1-ssse3-y += sha1_ni_asm.o @@ -136,3 +120,8 @@ sha256-ssse3-y += sha256_ni_asm.o endif sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $< > $@ +$(obj)/%.S: $(src)/%.pl FORCE + $(call if_changed,perlasm) diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index 4434607e366d..51d46d93efbc 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -71,7 +71,7 @@ * %r8 * %r9 */ -__load_partial: +SYM_FUNC_START_LOCAL(__load_partial) xor %r9d, %r9d pxor MSG, MSG @@ -123,7 +123,7 @@ __load_partial: .Lld_partial_8: ret -ENDPROC(__load_partial) +SYM_FUNC_END(__load_partial) /* * __store_partial: internal ABI @@ -137,7 +137,7 @@ ENDPROC(__load_partial) * %r9 * %r10 */ -__store_partial: +SYM_FUNC_START_LOCAL(__store_partial) mov LEN, %r8 mov DST, %r9 @@ -181,12 +181,12 @@ __store_partial: .Lst_partial_1: ret -ENDPROC(__store_partial) +SYM_FUNC_END(__store_partial) /* * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv); */ -ENTRY(crypto_aegis128_aesni_init) +SYM_FUNC_START(crypto_aegis128_aesni_init) FRAME_BEGIN /* load IV: */ @@ -226,13 +226,13 @@ ENTRY(crypto_aegis128_aesni_init) FRAME_END ret -ENDPROC(crypto_aegis128_aesni_init) +SYM_FUNC_END(crypto_aegis128_aesni_init) /* * void crypto_aegis128_aesni_ad(void *state, unsigned int length, * const void *data); */ -ENTRY(crypto_aegis128_aesni_ad) +SYM_FUNC_START(crypto_aegis128_aesni_ad) FRAME_BEGIN cmp $0x10, LEN @@ -378,7 +378,7 @@ ENTRY(crypto_aegis128_aesni_ad) .Lad_out: FRAME_END ret -ENDPROC(crypto_aegis128_aesni_ad) +SYM_FUNC_END(crypto_aegis128_aesni_ad) .macro encrypt_block a s0 s1 s2 s3 s4 i movdq\a (\i * 0x10)(SRC), MSG @@ -402,7 +402,7 @@ ENDPROC(crypto_aegis128_aesni_ad) * void crypto_aegis128_aesni_enc(void *state, unsigned int length, * const void *src, void *dst); */ -ENTRY(crypto_aegis128_aesni_enc) +SYM_FUNC_START(crypto_aegis128_aesni_enc) FRAME_BEGIN cmp $0x10, LEN @@ -493,13 +493,13 @@ ENTRY(crypto_aegis128_aesni_enc) .Lenc_out: FRAME_END ret -ENDPROC(crypto_aegis128_aesni_enc) +SYM_FUNC_END(crypto_aegis128_aesni_enc) /* * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length, * const void *src, void *dst); */ -ENTRY(crypto_aegis128_aesni_enc_tail) +SYM_FUNC_START(crypto_aegis128_aesni_enc_tail) FRAME_BEGIN /* load the state: */ @@ -533,7 +533,7 @@ ENTRY(crypto_aegis128_aesni_enc_tail) FRAME_END ret -ENDPROC(crypto_aegis128_aesni_enc_tail) +SYM_FUNC_END(crypto_aegis128_aesni_enc_tail) .macro decrypt_block a s0 s1 s2 s3 s4 i movdq\a (\i * 0x10)(SRC), MSG @@ -556,7 +556,7 @@ ENDPROC(crypto_aegis128_aesni_enc_tail) * void crypto_aegis128_aesni_dec(void *state, unsigned int length, * const void *src, void *dst); */ -ENTRY(crypto_aegis128_aesni_dec) +SYM_FUNC_START(crypto_aegis128_aesni_dec) FRAME_BEGIN cmp $0x10, LEN @@ -647,13 +647,13 @@ ENTRY(crypto_aegis128_aesni_dec) .Ldec_out: FRAME_END ret -ENDPROC(crypto_aegis128_aesni_dec) +SYM_FUNC_END(crypto_aegis128_aesni_dec) /* * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length, * const void *src, void *dst); */ -ENTRY(crypto_aegis128_aesni_dec_tail) +SYM_FUNC_START(crypto_aegis128_aesni_dec_tail) FRAME_BEGIN /* load the state: */ @@ -697,13 +697,13 @@ ENTRY(crypto_aegis128_aesni_dec_tail) FRAME_END ret -ENDPROC(crypto_aegis128_aesni_dec_tail) +SYM_FUNC_END(crypto_aegis128_aesni_dec_tail) /* * void crypto_aegis128_aesni_final(void *state, void *tag_xor, * u64 assoclen, u64 cryptlen); */ -ENTRY(crypto_aegis128_aesni_final) +SYM_FUNC_START(crypto_aegis128_aesni_final) FRAME_BEGIN /* load the state: */ @@ -744,4 +744,4 @@ ENTRY(crypto_aegis128_aesni_final) FRAME_END ret -ENDPROC(crypto_aegis128_aesni_final) +SYM_FUNC_END(crypto_aegis128_aesni_final) diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 46d227122643..4623189000d8 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -144,10 +144,8 @@ static int crypto_aegis128_aesni_setkey(struct crypto_aead *aead, const u8 *key, { struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(aead); - if (keylen != AEGIS128_KEY_SIZE) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != AEGIS128_KEY_SIZE) return -EINVAL; - } memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE); diff --git a/arch/x86/crypto/aegis128l-aesni-asm.S b/arch/x86/crypto/aegis128l-aesni-asm.S deleted file mode 100644 index 1461ef00c0e8..000000000000 --- a/arch/x86/crypto/aegis128l-aesni-asm.S +++ /dev/null @@ -1,823 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * AES-NI + SSE2 implementation of AEGIS-128L - * - * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <linux/linkage.h> -#include <asm/frame.h> - -#define STATE0 %xmm0 -#define STATE1 %xmm1 -#define STATE2 %xmm2 -#define STATE3 %xmm3 -#define STATE4 %xmm4 -#define STATE5 %xmm5 -#define STATE6 %xmm6 -#define STATE7 %xmm7 -#define MSG0 %xmm8 -#define MSG1 %xmm9 -#define T0 %xmm10 -#define T1 %xmm11 -#define T2 %xmm12 -#define T3 %xmm13 - -#define STATEP %rdi -#define LEN %rsi -#define SRC %rdx -#define DST %rcx - -.section .rodata.cst16.aegis128l_const, "aM", @progbits, 32 -.align 16 -.Laegis128l_const_0: - .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d - .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 -.Laegis128l_const_1: - .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 - .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd - -.section .rodata.cst16.aegis128l_counter, "aM", @progbits, 16 -.align 16 -.Laegis128l_counter0: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f -.Laegis128l_counter1: - .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 - .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f - -.text - -/* - * __load_partial: internal ABI - * input: - * LEN - bytes - * SRC - src - * output: - * MSG0 - first message block - * MSG1 - second message block - * changed: - * T0 - * %r8 - * %r9 - */ -__load_partial: - xor %r9d, %r9d - pxor MSG0, MSG0 - pxor MSG1, MSG1 - - mov LEN, %r8 - and $0x1, %r8 - jz .Lld_partial_1 - - mov LEN, %r8 - and $0x1E, %r8 - add SRC, %r8 - mov (%r8), %r9b - -.Lld_partial_1: - mov LEN, %r8 - and $0x2, %r8 - jz .Lld_partial_2 - - mov LEN, %r8 - and $0x1C, %r8 - add SRC, %r8 - shl $0x10, %r9 - mov (%r8), %r9w - -.Lld_partial_2: - mov LEN, %r8 - and $0x4, %r8 - jz .Lld_partial_4 - - mov LEN, %r8 - and $0x18, %r8 - add SRC, %r8 - shl $32, %r9 - mov (%r8), %r8d - xor %r8, %r9 - -.Lld_partial_4: - movq %r9, MSG0 - - mov LEN, %r8 - and $0x8, %r8 - jz .Lld_partial_8 - - mov LEN, %r8 - and $0x10, %r8 - add SRC, %r8 - pslldq $8, MSG0 - movq (%r8), T0 - pxor T0, MSG0 - -.Lld_partial_8: - mov LEN, %r8 - and $0x10, %r8 - jz .Lld_partial_16 - - movdqa MSG0, MSG1 - movdqu (SRC), MSG0 - -.Lld_partial_16: - ret -ENDPROC(__load_partial) - -/* - * __store_partial: internal ABI - * input: - * LEN - bytes - * DST - dst - * output: - * T0 - first message block - * T1 - second message block - * changed: - * %r8 - * %r9 - * %r10 - */ -__store_partial: - mov LEN, %r8 - mov DST, %r9 - - cmp $16, %r8 - jl .Lst_partial_16 - - movdqu T0, (%r9) - movdqa T1, T0 - - sub $16, %r8 - add $16, %r9 - -.Lst_partial_16: - movq T0, %r10 - - cmp $8, %r8 - jl .Lst_partial_8 - - mov %r10, (%r9) - psrldq $8, T0 - movq T0, %r10 - - sub $8, %r8 - add $8, %r9 - -.Lst_partial_8: - cmp $4, %r8 - jl .Lst_partial_4 - - mov %r10d, (%r9) - shr $32, %r10 - - sub $4, %r8 - add $4, %r9 - -.Lst_partial_4: - cmp $2, %r8 - jl .Lst_partial_2 - - mov %r10w, (%r9) - shr $0x10, %r10 - - sub $2, %r8 - add $2, %r9 - -.Lst_partial_2: - cmp $1, %r8 - jl .Lst_partial_1 - - mov %r10b, (%r9) - -.Lst_partial_1: - ret -ENDPROC(__store_partial) - -.macro update - movdqa STATE7, T0 - aesenc STATE0, STATE7 - aesenc STATE1, STATE0 - aesenc STATE2, STATE1 - aesenc STATE3, STATE2 - aesenc STATE4, STATE3 - aesenc STATE5, STATE4 - aesenc STATE6, STATE5 - aesenc T0, STATE6 -.endm - -.macro update0 - update - pxor MSG0, STATE7 - pxor MSG1, STATE3 -.endm - -.macro update1 - update - pxor MSG0, STATE6 - pxor MSG1, STATE2 -.endm - -.macro update2 - update - pxor MSG0, STATE5 - pxor MSG1, STATE1 -.endm - -.macro update3 - update - pxor MSG0, STATE4 - pxor MSG1, STATE0 -.endm - -.macro update4 - update - pxor MSG0, STATE3 - pxor MSG1, STATE7 -.endm - -.macro update5 - update - pxor MSG0, STATE2 - pxor MSG1, STATE6 -.endm - -.macro update6 - update - pxor MSG0, STATE1 - pxor MSG1, STATE5 -.endm - -.macro update7 - update - pxor MSG0, STATE0 - pxor MSG1, STATE4 -.endm - -.macro state_load - movdqu 0x00(STATEP), STATE0 - movdqu 0x10(STATEP), STATE1 - movdqu 0x20(STATEP), STATE2 - movdqu 0x30(STATEP), STATE3 - movdqu 0x40(STATEP), STATE4 - movdqu 0x50(STATEP), STATE5 - movdqu 0x60(STATEP), STATE6 - movdqu 0x70(STATEP), STATE7 -.endm - -.macro state_store s0 s1 s2 s3 s4 s5 s6 s7 - movdqu \s7, 0x00(STATEP) - movdqu \s0, 0x10(STATEP) - movdqu \s1, 0x20(STATEP) - movdqu \s2, 0x30(STATEP) - movdqu \s3, 0x40(STATEP) - movdqu \s4, 0x50(STATEP) - movdqu \s5, 0x60(STATEP) - movdqu \s6, 0x70(STATEP) -.endm - -.macro state_store0 - state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 -.endm - -.macro state_store1 - state_store STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 -.endm - -.macro state_store2 - state_store STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 -.endm - -.macro state_store3 - state_store STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 -.endm - -.macro state_store4 - state_store STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 -.endm - -.macro state_store5 - state_store STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 -.endm - -.macro state_store6 - state_store STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 -.endm - -.macro state_store7 - state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 -.endm - -/* - * void crypto_aegis128l_aesni_init(void *state, const void *key, const void *iv); - */ -ENTRY(crypto_aegis128l_aesni_init) - FRAME_BEGIN - - /* load key: */ - movdqa (%rsi), MSG1 - movdqa MSG1, STATE0 - movdqa MSG1, STATE4 - movdqa MSG1, STATE5 - movdqa MSG1, STATE6 - movdqa MSG1, STATE7 - - /* load IV: */ - movdqu (%rdx), MSG0 - pxor MSG0, STATE0 - pxor MSG0, STATE4 - - /* load the constants: */ - movdqa .Laegis128l_const_0, STATE2 - movdqa .Laegis128l_const_1, STATE1 - movdqa STATE1, STATE3 - pxor STATE2, STATE5 - pxor STATE1, STATE6 - pxor STATE2, STATE7 - - /* update 10 times with IV and KEY: */ - update0 - update1 - update2 - update3 - update4 - update5 - update6 - update7 - update0 - update1 - - state_store1 - - FRAME_END - ret -ENDPROC(crypto_aegis128l_aesni_init) - -.macro ad_block a i - movdq\a (\i * 0x20 + 0x00)(SRC), MSG0 - movdq\a (\i * 0x20 + 0x10)(SRC), MSG1 - update\i - sub $0x20, LEN - cmp $0x20, LEN - jl .Lad_out_\i -.endm - -/* - * void crypto_aegis128l_aesni_ad(void *state, unsigned int length, - * const void *data); - */ -ENTRY(crypto_aegis128l_aesni_ad) - FRAME_BEGIN - - cmp $0x20, LEN - jb .Lad_out - - state_load - - mov SRC, %r8 - and $0xf, %r8 - jnz .Lad_u_loop - -.align 8 -.Lad_a_loop: - ad_block a 0 - ad_block a 1 - ad_block a 2 - ad_block a 3 - ad_block a 4 - ad_block a 5 - ad_block a 6 - ad_block a 7 - - add $0x100, SRC - jmp .Lad_a_loop - -.align 8 -.Lad_u_loop: - ad_block u 0 - ad_block u 1 - ad_block u 2 - ad_block u 3 - ad_block u 4 - ad_block u 5 - ad_block u 6 - ad_block u 7 - - add $0x100, SRC - jmp .Lad_u_loop - -.Lad_out_0: - state_store0 - FRAME_END - ret - -.Lad_out_1: - state_store1 - FRAME_END - ret - -.Lad_out_2: - state_store2 - FRAME_END - ret - -.Lad_out_3: - state_store3 - FRAME_END - ret - -.Lad_out_4: - state_store4 - FRAME_END - ret - -.Lad_out_5: - state_store5 - FRAME_END - ret - -.Lad_out_6: - state_store6 - FRAME_END - ret - -.Lad_out_7: - state_store7 - FRAME_END - ret - -.Lad_out: - FRAME_END - ret -ENDPROC(crypto_aegis128l_aesni_ad) - -.macro crypt m0 m1 s0 s1 s2 s3 s4 s5 s6 s7 - pxor \s1, \m0 - pxor \s6, \m0 - movdqa \s2, T3 - pand \s3, T3 - pxor T3, \m0 - - pxor \s2, \m1 - pxor \s5, \m1 - movdqa \s6, T3 - pand \s7, T3 - pxor T3, \m1 -.endm - -.macro crypt0 m0 m1 - crypt \m0 \m1 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 -.endm - -.macro crypt1 m0 m1 - crypt \m0 \m1 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 -.endm - -.macro crypt2 m0 m1 - crypt \m0 \m1 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 -.endm - -.macro crypt3 m0 m1 - crypt \m0 \m1 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 -.endm - -.macro crypt4 m0 m1 - crypt \m0 \m1 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 -.endm - -.macro crypt5 m0 m1 - crypt \m0 \m1 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 -.endm - -.macro crypt6 m0 m1 - crypt \m0 \m1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 -.endm - -.macro crypt7 m0 m1 - crypt \m0 \m1 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 -.endm - -.macro encrypt_block a i - movdq\a (\i * 0x20 + 0x00)(SRC), MSG0 - movdq\a (\i * 0x20 + 0x10)(SRC), MSG1 - movdqa MSG0, T0 - movdqa MSG1, T1 - crypt\i T0, T1 - movdq\a T0, (\i * 0x20 + 0x00)(DST) - movdq\a T1, (\i * 0x20 + 0x10)(DST) - - update\i - - sub $0x20, LEN - cmp $0x20, LEN - jl .Lenc_out_\i -.endm - -.macro decrypt_block a i - movdq\a (\i * 0x20 + 0x00)(SRC), MSG0 - movdq\a (\i * 0x20 + 0x10)(SRC), MSG1 - crypt\i MSG0, MSG1 - movdq\a MSG0, (\i * 0x20 + 0x00)(DST) - movdq\a MSG1, (\i * 0x20 + 0x10)(DST) - - update\i - - sub $0x20, LEN - cmp $0x20, LEN - jl .Ldec_out_\i -.endm - -/* - * void crypto_aegis128l_aesni_enc(void *state, unsigned int length, - * const void *src, void *dst); - */ -ENTRY(crypto_aegis128l_aesni_enc) - FRAME_BEGIN - - cmp $0x20, LEN - jb .Lenc_out - - state_load - - mov SRC, %r8 - or DST, %r8 - and $0xf, %r8 - jnz .Lenc_u_loop - -.align 8 -.Lenc_a_loop: - encrypt_block a 0 - encrypt_block a 1 - encrypt_block a 2 - encrypt_block a 3 - encrypt_block a 4 - encrypt_block a 5 - encrypt_block a 6 - encrypt_block a 7 - - add $0x100, SRC - add $0x100, DST - jmp .Lenc_a_loop - -.align 8 -.Lenc_u_loop: - encrypt_block u 0 - encrypt_block u 1 - encrypt_block u 2 - encrypt_block u 3 - encrypt_block u 4 - encrypt_block u 5 - encrypt_block u 6 - encrypt_block u 7 - - add $0x100, SRC - add $0x100, DST - jmp .Lenc_u_loop - -.Lenc_out_0: - state_store0 - FRAME_END - ret - -.Lenc_out_1: - state_store1 - FRAME_END - ret - -.Lenc_out_2: - state_store2 - FRAME_END - ret - -.Lenc_out_3: - state_store3 - FRAME_END - ret - -.Lenc_out_4: - state_store4 - FRAME_END - ret - -.Lenc_out_5: - state_store5 - FRAME_END - ret - -.Lenc_out_6: - state_store6 - FRAME_END - ret - -.Lenc_out_7: - state_store7 - FRAME_END - ret - -.Lenc_out: - FRAME_END - ret -ENDPROC(crypto_aegis128l_aesni_enc) - -/* - * void crypto_aegis128l_aesni_enc_tail(void *state, unsigned int length, - * const void *src, void *dst); - */ -ENTRY(crypto_aegis128l_aesni_enc_tail) - FRAME_BEGIN - - state_load - - /* encrypt message: */ - call __load_partial - - movdqa MSG0, T0 - movdqa MSG1, T1 - crypt0 T0, T1 - - call __store_partial - - update0 - - state_store0 - - FRAME_END - ret -ENDPROC(crypto_aegis128l_aesni_enc_tail) - -/* - * void crypto_aegis128l_aesni_dec(void *state, unsigned int length, - * const void *src, void *dst); - */ -ENTRY(crypto_aegis128l_aesni_dec) - FRAME_BEGIN - - cmp $0x20, LEN - jb .Ldec_out - - state_load - - mov SRC, %r8 - or DST, %r8 - and $0xF, %r8 - jnz .Ldec_u_loop - -.align 8 -.Ldec_a_loop: - decrypt_block a 0 - decrypt_block a 1 - decrypt_block a 2 - decrypt_block a 3 - decrypt_block a 4 - decrypt_block a 5 - decrypt_block a 6 - decrypt_block a 7 - - add $0x100, SRC - add $0x100, DST - jmp .Ldec_a_loop - -.align 8 -.Ldec_u_loop: - decrypt_block u 0 - decrypt_block u 1 - decrypt_block u 2 - decrypt_block u 3 - decrypt_block u 4 - decrypt_block u 5 - decrypt_block u 6 - decrypt_block u 7 - - add $0x100, SRC - add $0x100, DST - jmp .Ldec_u_loop - -.Ldec_out_0: - state_store0 - FRAME_END - ret - -.Ldec_out_1: - state_store1 - FRAME_END - ret - -.Ldec_out_2: - state_store2 - FRAME_END - ret - -.Ldec_out_3: - state_store3 - FRAME_END - ret - -.Ldec_out_4: - state_store4 - FRAME_END - ret - -.Ldec_out_5: - state_store5 - FRAME_END - ret - -.Ldec_out_6: - state_store6 - FRAME_END - ret - -.Ldec_out_7: - state_store7 - FRAME_END - ret - -.Ldec_out: - FRAME_END - ret -ENDPROC(crypto_aegis128l_aesni_dec) - -/* - * void crypto_aegis128l_aesni_dec_tail(void *state, unsigned int length, - * const void *src, void *dst); - */ -ENTRY(crypto_aegis128l_aesni_dec_tail) - FRAME_BEGIN - - state_load - - /* decrypt message: */ - call __load_partial - - crypt0 MSG0, MSG1 - - movdqa MSG0, T0 - movdqa MSG1, T1 - call __store_partial - - /* mask with byte count: */ - movq LEN, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - movdqa T0, T1 - movdqa .Laegis128l_counter0, T2 - movdqa .Laegis128l_counter1, T3 - pcmpgtb T2, T0 - pcmpgtb T3, T1 - pand T0, MSG0 - pand T1, MSG1 - - update0 - - state_store0 - - FRAME_END - ret -ENDPROC(crypto_aegis128l_aesni_dec_tail) - -/* - * void crypto_aegis128l_aesni_final(void *state, void *tag_xor, - * u64 assoclen, u64 cryptlen); - */ -ENTRY(crypto_aegis128l_aesni_final) - FRAME_BEGIN - - state_load - - /* prepare length block: */ - movq %rdx, MSG0 - movq %rcx, T0 - pslldq $8, T0 - pxor T0, MSG0 - psllq $3, MSG0 /* multiply by 8 (to get bit count) */ - - pxor STATE2, MSG0 - movdqa MSG0, MSG1 - - /* update state: */ - update0 - update1 - update2 - update3 - update4 - update5 - update6 - - /* xor tag: */ - movdqu (%rsi), T0 - - pxor STATE1, T0 - pxor STATE2, T0 - pxor STATE3, T0 - pxor STATE4, T0 - pxor STATE5, T0 - pxor STATE6, T0 - pxor STATE7, T0 - - movdqu T0, (%rsi) - - FRAME_END - ret -ENDPROC(crypto_aegis128l_aesni_final) diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c deleted file mode 100644 index 19eb28b316f0..000000000000 --- a/arch/x86/crypto/aegis128l-aesni-glue.c +++ /dev/null @@ -1,293 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The AEGIS-128L Authenticated-Encryption Algorithm - * Glue for AES-NI + SSE2 implementation - * - * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <crypto/internal/aead.h> -#include <crypto/internal/simd.h> -#include <crypto/internal/skcipher.h> -#include <crypto/scatterwalk.h> -#include <linux/module.h> -#include <asm/fpu/api.h> -#include <asm/cpu_device_id.h> - -#define AEGIS128L_BLOCK_ALIGN 16 -#define AEGIS128L_BLOCK_SIZE 32 -#define AEGIS128L_NONCE_SIZE 16 -#define AEGIS128L_STATE_BLOCKS 8 -#define AEGIS128L_KEY_SIZE 16 -#define AEGIS128L_MIN_AUTH_SIZE 8 -#define AEGIS128L_MAX_AUTH_SIZE 16 - -asmlinkage void crypto_aegis128l_aesni_init(void *state, void *key, void *iv); - -asmlinkage void crypto_aegis128l_aesni_ad( - void *state, unsigned int length, const void *data); - -asmlinkage void crypto_aegis128l_aesni_enc( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128l_aesni_dec( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128l_aesni_enc_tail( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128l_aesni_dec_tail( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128l_aesni_final( - void *state, void *tag_xor, unsigned int cryptlen, - unsigned int assoclen); - -struct aegis_block { - u8 bytes[AEGIS128L_BLOCK_SIZE] __aligned(AEGIS128L_BLOCK_ALIGN); -}; - -struct aegis_state { - struct aegis_block blocks[AEGIS128L_STATE_BLOCKS]; -}; - -struct aegis_ctx { - struct aegis_block key; -}; - -struct aegis_crypt_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); - - void (*crypt_blocks)(void *state, unsigned int length, const void *src, - void *dst); - void (*crypt_tail)(void *state, unsigned int length, const void *src, - void *dst); -}; - -static void crypto_aegis128l_aesni_process_ad( - struct aegis_state *state, struct scatterlist *sg_src, - unsigned int assoclen) -{ - struct scatter_walk walk; - struct aegis_block buf; - unsigned int pos = 0; - - scatterwalk_start(&walk, sg_src); - while (assoclen != 0) { - unsigned int size = scatterwalk_clamp(&walk, assoclen); - unsigned int left = size; - void *mapped = scatterwalk_map(&walk); - const u8 *src = (const u8 *)mapped; - - if (pos + size >= AEGIS128L_BLOCK_SIZE) { - if (pos > 0) { - unsigned int fill = AEGIS128L_BLOCK_SIZE - pos; - memcpy(buf.bytes + pos, src, fill); - crypto_aegis128l_aesni_ad(state, - AEGIS128L_BLOCK_SIZE, - buf.bytes); - pos = 0; - left -= fill; - src += fill; - } - - crypto_aegis128l_aesni_ad(state, left, src); - - src += left & ~(AEGIS128L_BLOCK_SIZE - 1); - left &= AEGIS128L_BLOCK_SIZE - 1; - } - - memcpy(buf.bytes + pos, src, left); - pos += left; - assoclen -= size; - - scatterwalk_unmap(mapped); - scatterwalk_advance(&walk, size); - scatterwalk_done(&walk, 0, assoclen); - } - - if (pos > 0) { - memset(buf.bytes + pos, 0, AEGIS128L_BLOCK_SIZE - pos); - crypto_aegis128l_aesni_ad(state, AEGIS128L_BLOCK_SIZE, buf.bytes); - } -} - -static void crypto_aegis128l_aesni_process_crypt( - struct aegis_state *state, struct skcipher_walk *walk, - const struct aegis_crypt_ops *ops) -{ - while (walk->nbytes >= AEGIS128L_BLOCK_SIZE) { - ops->crypt_blocks(state, round_down(walk->nbytes, - AEGIS128L_BLOCK_SIZE), - walk->src.virt.addr, walk->dst.virt.addr); - skcipher_walk_done(walk, walk->nbytes % AEGIS128L_BLOCK_SIZE); - } - - if (walk->nbytes) { - ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr, - walk->dst.virt.addr); - skcipher_walk_done(walk, 0); - } -} - -static struct aegis_ctx *crypto_aegis128l_aesni_ctx(struct crypto_aead *aead) -{ - u8 *ctx = crypto_aead_ctx(aead); - ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx)); - return (void *)ctx; -} - -static int crypto_aegis128l_aesni_setkey(struct crypto_aead *aead, - const u8 *key, unsigned int keylen) -{ - struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(aead); - - if (keylen != AEGIS128L_KEY_SIZE) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - - memcpy(ctx->key.bytes, key, AEGIS128L_KEY_SIZE); - - return 0; -} - -static int crypto_aegis128l_aesni_setauthsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - if (authsize > AEGIS128L_MAX_AUTH_SIZE) - return -EINVAL; - if (authsize < AEGIS128L_MIN_AUTH_SIZE) - return -EINVAL; - return 0; -} - -static void crypto_aegis128l_aesni_crypt(struct aead_request *req, - struct aegis_block *tag_xor, - unsigned int cryptlen, - const struct aegis_crypt_ops *ops) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(tfm); - struct skcipher_walk walk; - struct aegis_state state; - - ops->skcipher_walk_init(&walk, req, true); - - kernel_fpu_begin(); - - crypto_aegis128l_aesni_init(&state, ctx->key.bytes, req->iv); - crypto_aegis128l_aesni_process_ad(&state, req->src, req->assoclen); - crypto_aegis128l_aesni_process_crypt(&state, &walk, ops); - crypto_aegis128l_aesni_final(&state, tag_xor, req->assoclen, cryptlen); - - kernel_fpu_end(); -} - -static int crypto_aegis128l_aesni_encrypt(struct aead_request *req) -{ - static const struct aegis_crypt_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_blocks = crypto_aegis128l_aesni_enc, - .crypt_tail = crypto_aegis128l_aesni_enc_tail, - }; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aegis_block tag = {}; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen; - - crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &OPS); - - scatterwalk_map_and_copy(tag.bytes, req->dst, - req->assoclen + cryptlen, authsize, 1); - return 0; -} - -static int crypto_aegis128l_aesni_decrypt(struct aead_request *req) -{ - static const struct aegis_block zeros = {}; - - static const struct aegis_crypt_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_blocks = crypto_aegis128l_aesni_dec, - .crypt_tail = crypto_aegis128l_aesni_dec_tail, - }; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aegis_block tag; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen - authsize; - - scatterwalk_map_and_copy(tag.bytes, req->src, - req->assoclen + cryptlen, authsize, 0); - - crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &OPS); - - return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0; -} - -static int crypto_aegis128l_aesni_init_tfm(struct crypto_aead *aead) -{ - return 0; -} - -static void crypto_aegis128l_aesni_exit_tfm(struct crypto_aead *aead) -{ -} - -static struct aead_alg crypto_aegis128l_aesni_alg = { - .setkey = crypto_aegis128l_aesni_setkey, - .setauthsize = crypto_aegis128l_aesni_setauthsize, - .encrypt = crypto_aegis128l_aesni_encrypt, - .decrypt = crypto_aegis128l_aesni_decrypt, - .init = crypto_aegis128l_aesni_init_tfm, - .exit = crypto_aegis128l_aesni_exit_tfm, - - .ivsize = AEGIS128L_NONCE_SIZE, - .maxauthsize = AEGIS128L_MAX_AUTH_SIZE, - .chunksize = AEGIS128L_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aegis_ctx) + - __alignof__(struct aegis_ctx), - .cra_alignmask = 0, - .cra_priority = 400, - - .cra_name = "__aegis128l", - .cra_driver_name = "__aegis128l-aesni", - - .cra_module = THIS_MODULE, - } -}; - -static struct simd_aead_alg *simd_alg; - -static int __init crypto_aegis128l_aesni_module_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_XMM2) || - !boot_cpu_has(X86_FEATURE_AES) || - !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) - return -ENODEV; - - return simd_register_aeads_compat(&crypto_aegis128l_aesni_alg, 1, - &simd_alg); -} - -static void __exit crypto_aegis128l_aesni_module_exit(void) -{ - simd_unregister_aeads(&crypto_aegis128l_aesni_alg, 1, &simd_alg); -} - -module_init(crypto_aegis128l_aesni_module_init); -module_exit(crypto_aegis128l_aesni_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); -MODULE_DESCRIPTION("AEGIS-128L AEAD algorithm -- AESNI+SSE2 implementation"); -MODULE_ALIAS_CRYPTO("aegis128l"); -MODULE_ALIAS_CRYPTO("aegis128l-aesni"); diff --git a/arch/x86/crypto/aegis256-aesni-asm.S b/arch/x86/crypto/aegis256-aesni-asm.S deleted file mode 100644 index 37d9b13dfd85..000000000000 --- a/arch/x86/crypto/aegis256-aesni-asm.S +++ /dev/null @@ -1,700 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * AES-NI + SSE2 implementation of AEGIS-128L - * - * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <linux/linkage.h> -#include <asm/frame.h> - -#define STATE0 %xmm0 -#define STATE1 %xmm1 -#define STATE2 %xmm2 -#define STATE3 %xmm3 -#define STATE4 %xmm4 -#define STATE5 %xmm5 -#define MSG %xmm6 -#define T0 %xmm7 -#define T1 %xmm8 -#define T2 %xmm9 -#define T3 %xmm10 - -#define STATEP %rdi -#define LEN %rsi -#define SRC %rdx -#define DST %rcx - -.section .rodata.cst16.aegis256_const, "aM", @progbits, 32 -.align 16 -.Laegis256_const_0: - .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d - .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 -.Laegis256_const_1: - .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 - .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd - -.section .rodata.cst16.aegis256_counter, "aM", @progbits, 16 -.align 16 -.Laegis256_counter: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f - -.text - -/* - * __load_partial: internal ABI - * input: - * LEN - bytes - * SRC - src - * output: - * MSG - message block - * changed: - * T0 - * %r8 - * %r9 - */ -__load_partial: - xor %r9d, %r9d - pxor MSG, MSG - - mov LEN, %r8 - and $0x1, %r8 - jz .Lld_partial_1 - - mov LEN, %r8 - and $0x1E, %r8 - add SRC, %r8 - mov (%r8), %r9b - -.Lld_partial_1: - mov LEN, %r8 - and $0x2, %r8 - jz .Lld_partial_2 - - mov LEN, %r8 - and $0x1C, %r8 - add SRC, %r8 - shl $0x10, %r9 - mov (%r8), %r9w - -.Lld_partial_2: - mov LEN, %r8 - and $0x4, %r8 - jz .Lld_partial_4 - - mov LEN, %r8 - and $0x18, %r8 - add SRC, %r8 - shl $32, %r9 - mov (%r8), %r8d - xor %r8, %r9 - -.Lld_partial_4: - movq %r9, MSG - - mov LEN, %r8 - and $0x8, %r8 - jz .Lld_partial_8 - - mov LEN, %r8 - and $0x10, %r8 - add SRC, %r8 - pslldq $8, MSG - movq (%r8), T0 - pxor T0, MSG - -.Lld_partial_8: - ret -ENDPROC(__load_partial) - -/* - * __store_partial: internal ABI - * input: - * LEN - bytes - * DST - dst - * output: - * T0 - message block - * changed: - * %r8 - * %r9 - * %r10 - */ -__store_partial: - mov LEN, %r8 - mov DST, %r9 - - movq T0, %r10 - - cmp $8, %r8 - jl .Lst_partial_8 - - mov %r10, (%r9) - psrldq $8, T0 - movq T0, %r10 - - sub $8, %r8 - add $8, %r9 - -.Lst_partial_8: - cmp $4, %r8 - jl .Lst_partial_4 - - mov %r10d, (%r9) - shr $32, %r10 - - sub $4, %r8 - add $4, %r9 - -.Lst_partial_4: - cmp $2, %r8 - jl .Lst_partial_2 - - mov %r10w, (%r9) - shr $0x10, %r10 - - sub $2, %r8 - add $2, %r9 - -.Lst_partial_2: - cmp $1, %r8 - jl .Lst_partial_1 - - mov %r10b, (%r9) - -.Lst_partial_1: - ret -ENDPROC(__store_partial) - -.macro update - movdqa STATE5, T0 - aesenc STATE0, STATE5 - aesenc STATE1, STATE0 - aesenc STATE2, STATE1 - aesenc STATE3, STATE2 - aesenc STATE4, STATE3 - aesenc T0, STATE4 -.endm - -.macro update0 m - update - pxor \m, STATE5 -.endm - -.macro update1 m - update - pxor \m, STATE4 -.endm - -.macro update2 m - update - pxor \m, STATE3 -.endm - -.macro update3 m - update - pxor \m, STATE2 -.endm - -.macro update4 m - update - pxor \m, STATE1 -.endm - -.macro update5 m - update - pxor \m, STATE0 -.endm - -.macro state_load - movdqu 0x00(STATEP), STATE0 - movdqu 0x10(STATEP), STATE1 - movdqu 0x20(STATEP), STATE2 - movdqu 0x30(STATEP), STATE3 - movdqu 0x40(STATEP), STATE4 - movdqu 0x50(STATEP), STATE5 -.endm - -.macro state_store s0 s1 s2 s3 s4 s5 - movdqu \s5, 0x00(STATEP) - movdqu \s0, 0x10(STATEP) - movdqu \s1, 0x20(STATEP) - movdqu \s2, 0x30(STATEP) - movdqu \s3, 0x40(STATEP) - movdqu \s4, 0x50(STATEP) -.endm - -.macro state_store0 - state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 -.endm - -.macro state_store1 - state_store STATE5 STATE0 STATE1 STATE2 STATE3 STATE4 -.endm - -.macro state_store2 - state_store STATE4 STATE5 STATE0 STATE1 STATE2 STATE3 -.endm - -.macro state_store3 - state_store STATE3 STATE4 STATE5 STATE0 STATE1 STATE2 -.endm - -.macro state_store4 - state_store STATE2 STATE3 STATE4 STATE5 STATE0 STATE1 -.endm - -.macro state_store5 - state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE0 -.endm - -/* - * void crypto_aegis256_aesni_init(void *state, const void *key, const void *iv); - */ -ENTRY(crypto_aegis256_aesni_init) - FRAME_BEGIN - - /* load key: */ - movdqa 0x00(%rsi), MSG - movdqa 0x10(%rsi), T1 - movdqa MSG, STATE4 - movdqa T1, STATE5 - - /* load IV: */ - movdqu 0x00(%rdx), T2 - movdqu 0x10(%rdx), T3 - pxor MSG, T2 - pxor T1, T3 - movdqa T2, STATE0 - movdqa T3, STATE1 - - /* load the constants: */ - movdqa .Laegis256_const_0, STATE3 - movdqa .Laegis256_const_1, STATE2 - pxor STATE3, STATE4 - pxor STATE2, STATE5 - - /* update 10 times with IV and KEY: */ - update0 MSG - update1 T1 - update2 T2 - update3 T3 - update4 MSG - update5 T1 - update0 T2 - update1 T3 - update2 MSG - update3 T1 - update4 T2 - update5 T3 - update0 MSG - update1 T1 - update2 T2 - update3 T3 - - state_store3 - - FRAME_END - ret -ENDPROC(crypto_aegis256_aesni_init) - -.macro ad_block a i - movdq\a (\i * 0x10)(SRC), MSG - update\i MSG - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_\i -.endm - -/* - * void crypto_aegis256_aesni_ad(void *state, unsigned int length, - * const void *data); - */ -ENTRY(crypto_aegis256_aesni_ad) - FRAME_BEGIN - - cmp $0x10, LEN - jb .Lad_out - - state_load - - mov SRC, %r8 - and $0xf, %r8 - jnz .Lad_u_loop - -.align 8 -.Lad_a_loop: - ad_block a 0 - ad_block a 1 - ad_block a 2 - ad_block a 3 - ad_block a 4 - ad_block a 5 - - add $0x60, SRC - jmp .Lad_a_loop - -.align 8 -.Lad_u_loop: - ad_block u 0 - ad_block u 1 - ad_block u 2 - ad_block u 3 - ad_block u 4 - ad_block u 5 - - add $0x60, SRC - jmp .Lad_u_loop - -.Lad_out_0: - state_store0 - FRAME_END - ret - -.Lad_out_1: - state_store1 - FRAME_END - ret - -.Lad_out_2: - state_store2 - FRAME_END - ret - -.Lad_out_3: - state_store3 - FRAME_END - ret - -.Lad_out_4: - state_store4 - FRAME_END - ret - -.Lad_out_5: - state_store5 - FRAME_END - ret - -.Lad_out: - FRAME_END - ret -ENDPROC(crypto_aegis256_aesni_ad) - -.macro crypt m s0 s1 s2 s3 s4 s5 - pxor \s1, \m - pxor \s4, \m - pxor \s5, \m - movdqa \s2, T3 - pand \s3, T3 - pxor T3, \m -.endm - -.macro crypt0 m - crypt \m STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 -.endm - -.macro crypt1 m - crypt \m STATE5 STATE0 STATE1 STATE2 STATE3 STATE4 -.endm - -.macro crypt2 m - crypt \m STATE4 STATE5 STATE0 STATE1 STATE2 STATE3 -.endm - -.macro crypt3 m - crypt \m STATE3 STATE4 STATE5 STATE0 STATE1 STATE2 -.endm - -.macro crypt4 m - crypt \m STATE2 STATE3 STATE4 STATE5 STATE0 STATE1 -.endm - -.macro crypt5 m - crypt \m STATE1 STATE2 STATE3 STATE4 STATE5 STATE0 -.endm - -.macro encrypt_block a i - movdq\a (\i * 0x10)(SRC), MSG - movdqa MSG, T0 - crypt\i T0 - movdq\a T0, (\i * 0x10)(DST) - - update\i MSG - - sub $0x10, LEN - cmp $0x10, LEN - jl .Lenc_out_\i -.endm - -.macro decrypt_block a i - movdq\a (\i * 0x10)(SRC), MSG - crypt\i MSG - movdq\a MSG, (\i * 0x10)(DST) - - update\i MSG - - sub $0x10, LEN - cmp $0x10, LEN - jl .Ldec_out_\i -.endm - -/* - * void crypto_aegis256_aesni_enc(void *state, unsigned int length, - * const void *src, void *dst); - */ -ENTRY(crypto_aegis256_aesni_enc) - FRAME_BEGIN - - cmp $0x10, LEN - jb .Lenc_out - - state_load - - mov SRC, %r8 - or DST, %r8 - and $0xf, %r8 - jnz .Lenc_u_loop - -.align 8 -.Lenc_a_loop: - encrypt_block a 0 - encrypt_block a 1 - encrypt_block a 2 - encrypt_block a 3 - encrypt_block a 4 - encrypt_block a 5 - - add $0x60, SRC - add $0x60, DST - jmp .Lenc_a_loop - -.align 8 -.Lenc_u_loop: - encrypt_block u 0 - encrypt_block u 1 - encrypt_block u 2 - encrypt_block u 3 - encrypt_block u 4 - encrypt_block u 5 - - add $0x60, SRC - add $0x60, DST - jmp .Lenc_u_loop - -.Lenc_out_0: - state_store0 - FRAME_END - ret - -.Lenc_out_1: - state_store1 - FRAME_END - ret - -.Lenc_out_2: - state_store2 - FRAME_END - ret - -.Lenc_out_3: - state_store3 - FRAME_END - ret - -.Lenc_out_4: - state_store4 - FRAME_END - ret - -.Lenc_out_5: - state_store5 - FRAME_END - ret - -.Lenc_out: - FRAME_END - ret -ENDPROC(crypto_aegis256_aesni_enc) - -/* - * void crypto_aegis256_aesni_enc_tail(void *state, unsigned int length, - * const void *src, void *dst); - */ -ENTRY(crypto_aegis256_aesni_enc_tail) - FRAME_BEGIN - - state_load - - /* encrypt message: */ - call __load_partial - - movdqa MSG, T0 - crypt0 T0 - - call __store_partial - - update0 MSG - - state_store0 - - FRAME_END - ret -ENDPROC(crypto_aegis256_aesni_enc_tail) - -/* - * void crypto_aegis256_aesni_dec(void *state, unsigned int length, - * const void *src, void *dst); - */ -ENTRY(crypto_aegis256_aesni_dec) - FRAME_BEGIN - - cmp $0x10, LEN - jb .Ldec_out - - state_load - - mov SRC, %r8 - or DST, %r8 - and $0xF, %r8 - jnz .Ldec_u_loop - -.align 8 -.Ldec_a_loop: - decrypt_block a 0 - decrypt_block a 1 - decrypt_block a 2 - decrypt_block a 3 - decrypt_block a 4 - decrypt_block a 5 - - add $0x60, SRC - add $0x60, DST - jmp .Ldec_a_loop - -.align 8 -.Ldec_u_loop: - decrypt_block u 0 - decrypt_block u 1 - decrypt_block u 2 - decrypt_block u 3 - decrypt_block u 4 - decrypt_block u 5 - - add $0x60, SRC - add $0x60, DST - jmp .Ldec_u_loop - -.Ldec_out_0: - state_store0 - FRAME_END - ret - -.Ldec_out_1: - state_store1 - FRAME_END - ret - -.Ldec_out_2: - state_store2 - FRAME_END - ret - -.Ldec_out_3: - state_store3 - FRAME_END - ret - -.Ldec_out_4: - state_store4 - FRAME_END - ret - -.Ldec_out_5: - state_store5 - FRAME_END - ret - -.Ldec_out: - FRAME_END - ret -ENDPROC(crypto_aegis256_aesni_dec) - -/* - * void crypto_aegis256_aesni_dec_tail(void *state, unsigned int length, - * const void *src, void *dst); - */ -ENTRY(crypto_aegis256_aesni_dec_tail) - FRAME_BEGIN - - state_load - - /* decrypt message: */ - call __load_partial - - crypt0 MSG - - movdqa MSG, T0 - call __store_partial - - /* mask with byte count: */ - movq LEN, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - movdqa .Laegis256_counter, T1 - pcmpgtb T1, T0 - pand T0, MSG - - update0 MSG - - state_store0 - - FRAME_END - ret -ENDPROC(crypto_aegis256_aesni_dec_tail) - -/* - * void crypto_aegis256_aesni_final(void *state, void *tag_xor, - * u64 assoclen, u64 cryptlen); - */ -ENTRY(crypto_aegis256_aesni_final) - FRAME_BEGIN - - state_load - - /* prepare length block: */ - movq %rdx, MSG - movq %rcx, T0 - pslldq $8, T0 - pxor T0, MSG - psllq $3, MSG /* multiply by 8 (to get bit count) */ - - pxor STATE3, MSG - - /* update state: */ - update0 MSG - update1 MSG - update2 MSG - update3 MSG - update4 MSG - update5 MSG - update0 MSG - - /* xor tag: */ - movdqu (%rsi), MSG - - pxor STATE0, MSG - pxor STATE1, MSG - pxor STATE2, MSG - pxor STATE3, MSG - pxor STATE4, MSG - pxor STATE5, MSG - - movdqu MSG, (%rsi) - - FRAME_END - ret -ENDPROC(crypto_aegis256_aesni_final) diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c deleted file mode 100644 index f84da27171d3..000000000000 --- a/arch/x86/crypto/aegis256-aesni-glue.c +++ /dev/null @@ -1,293 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The AEGIS-256 Authenticated-Encryption Algorithm - * Glue for AES-NI + SSE2 implementation - * - * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <crypto/internal/aead.h> -#include <crypto/internal/simd.h> -#include <crypto/internal/skcipher.h> -#include <crypto/scatterwalk.h> -#include <linux/module.h> -#include <asm/fpu/api.h> -#include <asm/cpu_device_id.h> - -#define AEGIS256_BLOCK_ALIGN 16 -#define AEGIS256_BLOCK_SIZE 16 -#define AEGIS256_NONCE_SIZE 32 -#define AEGIS256_STATE_BLOCKS 6 -#define AEGIS256_KEY_SIZE 32 -#define AEGIS256_MIN_AUTH_SIZE 8 -#define AEGIS256_MAX_AUTH_SIZE 16 - -asmlinkage void crypto_aegis256_aesni_init(void *state, void *key, void *iv); - -asmlinkage void crypto_aegis256_aesni_ad( - void *state, unsigned int length, const void *data); - -asmlinkage void crypto_aegis256_aesni_enc( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis256_aesni_dec( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis256_aesni_enc_tail( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis256_aesni_dec_tail( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis256_aesni_final( - void *state, void *tag_xor, unsigned int cryptlen, - unsigned int assoclen); - -struct aegis_block { - u8 bytes[AEGIS256_BLOCK_SIZE] __aligned(AEGIS256_BLOCK_ALIGN); -}; - -struct aegis_state { - struct aegis_block blocks[AEGIS256_STATE_BLOCKS]; -}; - -struct aegis_ctx { - struct aegis_block key[AEGIS256_KEY_SIZE / AEGIS256_BLOCK_SIZE]; -}; - -struct aegis_crypt_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); - - void (*crypt_blocks)(void *state, unsigned int length, const void *src, - void *dst); - void (*crypt_tail)(void *state, unsigned int length, const void *src, - void *dst); -}; - -static void crypto_aegis256_aesni_process_ad( - struct aegis_state *state, struct scatterlist *sg_src, - unsigned int assoclen) -{ - struct scatter_walk walk; - struct aegis_block buf; - unsigned int pos = 0; - - scatterwalk_start(&walk, sg_src); - while (assoclen != 0) { - unsigned int size = scatterwalk_clamp(&walk, assoclen); - unsigned int left = size; - void *mapped = scatterwalk_map(&walk); - const u8 *src = (const u8 *)mapped; - - if (pos + size >= AEGIS256_BLOCK_SIZE) { - if (pos > 0) { - unsigned int fill = AEGIS256_BLOCK_SIZE - pos; - memcpy(buf.bytes + pos, src, fill); - crypto_aegis256_aesni_ad(state, - AEGIS256_BLOCK_SIZE, - buf.bytes); - pos = 0; - left -= fill; - src += fill; - } - - crypto_aegis256_aesni_ad(state, left, src); - - src += left & ~(AEGIS256_BLOCK_SIZE - 1); - left &= AEGIS256_BLOCK_SIZE - 1; - } - - memcpy(buf.bytes + pos, src, left); - pos += left; - assoclen -= size; - - scatterwalk_unmap(mapped); - scatterwalk_advance(&walk, size); - scatterwalk_done(&walk, 0, assoclen); - } - - if (pos > 0) { - memset(buf.bytes + pos, 0, AEGIS256_BLOCK_SIZE - pos); - crypto_aegis256_aesni_ad(state, AEGIS256_BLOCK_SIZE, buf.bytes); - } -} - -static void crypto_aegis256_aesni_process_crypt( - struct aegis_state *state, struct skcipher_walk *walk, - const struct aegis_crypt_ops *ops) -{ - while (walk->nbytes >= AEGIS256_BLOCK_SIZE) { - ops->crypt_blocks(state, - round_down(walk->nbytes, AEGIS256_BLOCK_SIZE), - walk->src.virt.addr, walk->dst.virt.addr); - skcipher_walk_done(walk, walk->nbytes % AEGIS256_BLOCK_SIZE); - } - - if (walk->nbytes) { - ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr, - walk->dst.virt.addr); - skcipher_walk_done(walk, 0); - } -} - -static struct aegis_ctx *crypto_aegis256_aesni_ctx(struct crypto_aead *aead) -{ - u8 *ctx = crypto_aead_ctx(aead); - ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx)); - return (void *)ctx; -} - -static int crypto_aegis256_aesni_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(aead); - - if (keylen != AEGIS256_KEY_SIZE) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - - memcpy(ctx->key, key, AEGIS256_KEY_SIZE); - - return 0; -} - -static int crypto_aegis256_aesni_setauthsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - if (authsize > AEGIS256_MAX_AUTH_SIZE) - return -EINVAL; - if (authsize < AEGIS256_MIN_AUTH_SIZE) - return -EINVAL; - return 0; -} - -static void crypto_aegis256_aesni_crypt(struct aead_request *req, - struct aegis_block *tag_xor, - unsigned int cryptlen, - const struct aegis_crypt_ops *ops) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(tfm); - struct skcipher_walk walk; - struct aegis_state state; - - ops->skcipher_walk_init(&walk, req, true); - - kernel_fpu_begin(); - - crypto_aegis256_aesni_init(&state, ctx->key, req->iv); - crypto_aegis256_aesni_process_ad(&state, req->src, req->assoclen); - crypto_aegis256_aesni_process_crypt(&state, &walk, ops); - crypto_aegis256_aesni_final(&state, tag_xor, req->assoclen, cryptlen); - - kernel_fpu_end(); -} - -static int crypto_aegis256_aesni_encrypt(struct aead_request *req) -{ - static const struct aegis_crypt_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_blocks = crypto_aegis256_aesni_enc, - .crypt_tail = crypto_aegis256_aesni_enc_tail, - }; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aegis_block tag = {}; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen; - - crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS); - - scatterwalk_map_and_copy(tag.bytes, req->dst, - req->assoclen + cryptlen, authsize, 1); - return 0; -} - -static int crypto_aegis256_aesni_decrypt(struct aead_request *req) -{ - static const struct aegis_block zeros = {}; - - static const struct aegis_crypt_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_blocks = crypto_aegis256_aesni_dec, - .crypt_tail = crypto_aegis256_aesni_dec_tail, - }; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aegis_block tag; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen - authsize; - - scatterwalk_map_and_copy(tag.bytes, req->src, - req->assoclen + cryptlen, authsize, 0); - - crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS); - - return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0; -} - -static int crypto_aegis256_aesni_init_tfm(struct crypto_aead *aead) -{ - return 0; -} - -static void crypto_aegis256_aesni_exit_tfm(struct crypto_aead *aead) -{ -} - -static struct aead_alg crypto_aegis256_aesni_alg = { - .setkey = crypto_aegis256_aesni_setkey, - .setauthsize = crypto_aegis256_aesni_setauthsize, - .encrypt = crypto_aegis256_aesni_encrypt, - .decrypt = crypto_aegis256_aesni_decrypt, - .init = crypto_aegis256_aesni_init_tfm, - .exit = crypto_aegis256_aesni_exit_tfm, - - .ivsize = AEGIS256_NONCE_SIZE, - .maxauthsize = AEGIS256_MAX_AUTH_SIZE, - .chunksize = AEGIS256_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aegis_ctx) + - __alignof__(struct aegis_ctx), - .cra_alignmask = 0, - .cra_priority = 400, - - .cra_name = "__aegis256", - .cra_driver_name = "__aegis256-aesni", - - .cra_module = THIS_MODULE, - } -}; - -static struct simd_aead_alg *simd_alg; - -static int __init crypto_aegis256_aesni_module_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_XMM2) || - !boot_cpu_has(X86_FEATURE_AES) || - !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) - return -ENODEV; - - return simd_register_aeads_compat(&crypto_aegis256_aesni_alg, 1, - &simd_alg); -} - -static void __exit crypto_aegis256_aesni_module_exit(void) -{ - simd_unregister_aeads(&crypto_aegis256_aesni_alg, 1, &simd_alg); -} - -module_init(crypto_aegis256_aesni_module_init); -module_exit(crypto_aegis256_aesni_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); -MODULE_DESCRIPTION("AEGIS-256 AEAD algorithm -- AESNI+SSE2 implementation"); -MODULE_ALIAS_CRYPTO("aegis256"); -MODULE_ALIAS_CRYPTO("aegis256-aesni"); diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S deleted file mode 100644 index 2849dbc59e11..000000000000 --- a/arch/x86/crypto/aes-i586-asm_32.S +++ /dev/null @@ -1,362 +0,0 @@ -// ------------------------------------------------------------------------- -// Copyright (c) 2001, Dr Brian Gladman < >, Worcester, UK. -// All rights reserved. -// -// LICENSE TERMS -// -// The free distribution and use of this software in both source and binary -// form is allowed (with or without changes) provided that: -// -// 1. distributions of this source code include the above copyright -// notice, this list of conditions and the following disclaimer// -// -// 2. distributions in binary form include the above copyright -// notice, this list of conditions and the following disclaimer -// in the documentation and/or other associated materials// -// -// 3. the copyright holder's name is not used to endorse products -// built using this software without specific written permission. -// -// -// ALTERNATIVELY, provided that this notice is retained in full, this product -// may be distributed under the terms of the GNU General Public License (GPL), -// in which case the provisions of the GPL apply INSTEAD OF those given above. -// -// Copyright (c) 2004 Linus Torvalds <torvalds@osdl.org> -// Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> - -// DISCLAIMER -// -// This software is provided 'as is' with no explicit or implied warranties -// in respect of its properties including, but not limited to, correctness -// and fitness for purpose. -// ------------------------------------------------------------------------- -// Issue Date: 29/07/2002 - -.file "aes-i586-asm.S" -.text - -#include <linux/linkage.h> -#include <asm/asm-offsets.h> - -#define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words) - -/* offsets to parameters with one register pushed onto stack */ -#define ctx 8 -#define out_blk 12 -#define in_blk 16 - -/* offsets in crypto_aes_ctx structure */ -#define klen (480) -#define ekey (0) -#define dkey (240) - -// register mapping for encrypt and decrypt subroutines - -#define r0 eax -#define r1 ebx -#define r2 ecx -#define r3 edx -#define r4 esi -#define r5 edi - -#define eaxl al -#define eaxh ah -#define ebxl bl -#define ebxh bh -#define ecxl cl -#define ecxh ch -#define edxl dl -#define edxh dh - -#define _h(reg) reg##h -#define h(reg) _h(reg) - -#define _l(reg) reg##l -#define l(reg) _l(reg) - -// This macro takes a 32-bit word representing a column and uses -// each of its four bytes to index into four tables of 256 32-bit -// words to obtain values that are then xored into the appropriate -// output registers r0, r1, r4 or r5. - -// Parameters: -// table table base address -// %1 out_state[0] -// %2 out_state[1] -// %3 out_state[2] -// %4 out_state[3] -// idx input register for the round (destroyed) -// tmp scratch register for the round -// sched key schedule - -#define do_col(table, a1,a2,a3,a4, idx, tmp) \ - movzx %l(idx),%tmp; \ - xor table(,%tmp,4),%a1; \ - movzx %h(idx),%tmp; \ - shr $16,%idx; \ - xor table+tlen(,%tmp,4),%a2; \ - movzx %l(idx),%tmp; \ - movzx %h(idx),%idx; \ - xor table+2*tlen(,%tmp,4),%a3; \ - xor table+3*tlen(,%idx,4),%a4; - -// initialise output registers from the key schedule -// NB1: original value of a3 is in idx on exit -// NB2: original values of a1,a2,a4 aren't used -#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \ - mov 0 sched,%a1; \ - movzx %l(idx),%tmp; \ - mov 12 sched,%a2; \ - xor table(,%tmp,4),%a1; \ - mov 4 sched,%a4; \ - movzx %h(idx),%tmp; \ - shr $16,%idx; \ - xor table+tlen(,%tmp,4),%a2; \ - movzx %l(idx),%tmp; \ - movzx %h(idx),%idx; \ - xor table+3*tlen(,%idx,4),%a4; \ - mov %a3,%idx; \ - mov 8 sched,%a3; \ - xor table+2*tlen(,%tmp,4),%a3; - -// initialise output registers from the key schedule -// NB1: original value of a3 is in idx on exit -// NB2: original values of a1,a2,a4 aren't used -#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \ - mov 0 sched,%a1; \ - movzx %l(idx),%tmp; \ - mov 4 sched,%a2; \ - xor table(,%tmp,4),%a1; \ - mov 12 sched,%a4; \ - movzx %h(idx),%tmp; \ - shr $16,%idx; \ - xor table+tlen(,%tmp,4),%a2; \ - movzx %l(idx),%tmp; \ - movzx %h(idx),%idx; \ - xor table+3*tlen(,%idx,4),%a4; \ - mov %a3,%idx; \ - mov 8 sched,%a3; \ - xor table+2*tlen(,%tmp,4),%a3; - - -// original Gladman had conditional saves to MMX regs. -#define save(a1, a2) \ - mov %a2,4*a1(%esp) - -#define restore(a1, a2) \ - mov 4*a2(%esp),%a1 - -// These macros perform a forward encryption cycle. They are entered with -// the first previous round column values in r0,r1,r4,r5 and -// exit with the final values in the same registers, using stack -// for temporary storage. - -// round column values -// on entry: r0,r1,r4,r5 -// on exit: r2,r1,r4,r5 -#define fwd_rnd1(arg, table) \ - save (0,r1); \ - save (1,r5); \ - \ - /* compute new column values */ \ - do_fcol(table, r2,r5,r4,r1, r0,r3, arg); /* idx=r0 */ \ - do_col (table, r4,r1,r2,r5, r0,r3); /* idx=r4 */ \ - restore(r0,0); \ - do_col (table, r1,r2,r5,r4, r0,r3); /* idx=r1 */ \ - restore(r0,1); \ - do_col (table, r5,r4,r1,r2, r0,r3); /* idx=r5 */ - -// round column values -// on entry: r2,r1,r4,r5 -// on exit: r0,r1,r4,r5 -#define fwd_rnd2(arg, table) \ - save (0,r1); \ - save (1,r5); \ - \ - /* compute new column values */ \ - do_fcol(table, r0,r5,r4,r1, r2,r3, arg); /* idx=r2 */ \ - do_col (table, r4,r1,r0,r5, r2,r3); /* idx=r4 */ \ - restore(r2,0); \ - do_col (table, r1,r0,r5,r4, r2,r3); /* idx=r1 */ \ - restore(r2,1); \ - do_col (table, r5,r4,r1,r0, r2,r3); /* idx=r5 */ - -// These macros performs an inverse encryption cycle. They are entered with -// the first previous round column values in r0,r1,r4,r5 and -// exit with the final values in the same registers, using stack -// for temporary storage - -// round column values -// on entry: r0,r1,r4,r5 -// on exit: r2,r1,r4,r5 -#define inv_rnd1(arg, table) \ - save (0,r1); \ - save (1,r5); \ - \ - /* compute new column values */ \ - do_icol(table, r2,r1,r4,r5, r0,r3, arg); /* idx=r0 */ \ - do_col (table, r4,r5,r2,r1, r0,r3); /* idx=r4 */ \ - restore(r0,0); \ - do_col (table, r1,r4,r5,r2, r0,r3); /* idx=r1 */ \ - restore(r0,1); \ - do_col (table, r5,r2,r1,r4, r0,r3); /* idx=r5 */ - -// round column values -// on entry: r2,r1,r4,r5 -// on exit: r0,r1,r4,r5 -#define inv_rnd2(arg, table) \ - save (0,r1); \ - save (1,r5); \ - \ - /* compute new column values */ \ - do_icol(table, r0,r1,r4,r5, r2,r3, arg); /* idx=r2 */ \ - do_col (table, r4,r5,r0,r1, r2,r3); /* idx=r4 */ \ - restore(r2,0); \ - do_col (table, r1,r4,r5,r0, r2,r3); /* idx=r1 */ \ - restore(r2,1); \ - do_col (table, r5,r0,r1,r4, r2,r3); /* idx=r5 */ - -// AES (Rijndael) Encryption Subroutine -/* void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */ - -.extern crypto_ft_tab -.extern crypto_fl_tab - -ENTRY(aes_enc_blk) - push %ebp - mov ctx(%esp),%ebp - -// CAUTION: the order and the values used in these assigns -// rely on the register mappings - -1: push %ebx - mov in_blk+4(%esp),%r2 - push %esi - mov klen(%ebp),%r3 // key size - push %edi -#if ekey != 0 - lea ekey(%ebp),%ebp // key pointer -#endif - -// input four columns and xor in first round key - - mov (%r2),%r0 - mov 4(%r2),%r1 - mov 8(%r2),%r4 - mov 12(%r2),%r5 - xor (%ebp),%r0 - xor 4(%ebp),%r1 - xor 8(%ebp),%r4 - xor 12(%ebp),%r5 - - sub $8,%esp // space for register saves on stack - add $16,%ebp // increment to next round key - cmp $24,%r3 - jb 4f // 10 rounds for 128-bit key - lea 32(%ebp),%ebp - je 3f // 12 rounds for 192-bit key - lea 32(%ebp),%ebp - -2: fwd_rnd1( -64(%ebp), crypto_ft_tab) // 14 rounds for 256-bit key - fwd_rnd2( -48(%ebp), crypto_ft_tab) -3: fwd_rnd1( -32(%ebp), crypto_ft_tab) // 12 rounds for 192-bit key - fwd_rnd2( -16(%ebp), crypto_ft_tab) -4: fwd_rnd1( (%ebp), crypto_ft_tab) // 10 rounds for 128-bit key - fwd_rnd2( +16(%ebp), crypto_ft_tab) - fwd_rnd1( +32(%ebp), crypto_ft_tab) - fwd_rnd2( +48(%ebp), crypto_ft_tab) - fwd_rnd1( +64(%ebp), crypto_ft_tab) - fwd_rnd2( +80(%ebp), crypto_ft_tab) - fwd_rnd1( +96(%ebp), crypto_ft_tab) - fwd_rnd2(+112(%ebp), crypto_ft_tab) - fwd_rnd1(+128(%ebp), crypto_ft_tab) - fwd_rnd2(+144(%ebp), crypto_fl_tab) // last round uses a different table - -// move final values to the output array. CAUTION: the -// order of these assigns rely on the register mappings - - add $8,%esp - mov out_blk+12(%esp),%ebp - mov %r5,12(%ebp) - pop %edi - mov %r4,8(%ebp) - pop %esi - mov %r1,4(%ebp) - pop %ebx - mov %r0,(%ebp) - pop %ebp - ret -ENDPROC(aes_enc_blk) - -// AES (Rijndael) Decryption Subroutine -/* void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */ - -.extern crypto_it_tab -.extern crypto_il_tab - -ENTRY(aes_dec_blk) - push %ebp - mov ctx(%esp),%ebp - -// CAUTION: the order and the values used in these assigns -// rely on the register mappings - -1: push %ebx - mov in_blk+4(%esp),%r2 - push %esi - mov klen(%ebp),%r3 // key size - push %edi -#if dkey != 0 - lea dkey(%ebp),%ebp // key pointer -#endif - -// input four columns and xor in first round key - - mov (%r2),%r0 - mov 4(%r2),%r1 - mov 8(%r2),%r4 - mov 12(%r2),%r5 - xor (%ebp),%r0 - xor 4(%ebp),%r1 - xor 8(%ebp),%r4 - xor 12(%ebp),%r5 - - sub $8,%esp // space for register saves on stack - add $16,%ebp // increment to next round key - cmp $24,%r3 - jb 4f // 10 rounds for 128-bit key - lea 32(%ebp),%ebp - je 3f // 12 rounds for 192-bit key - lea 32(%ebp),%ebp - -2: inv_rnd1( -64(%ebp), crypto_it_tab) // 14 rounds for 256-bit key - inv_rnd2( -48(%ebp), crypto_it_tab) -3: inv_rnd1( -32(%ebp), crypto_it_tab) // 12 rounds for 192-bit key - inv_rnd2( -16(%ebp), crypto_it_tab) -4: inv_rnd1( (%ebp), crypto_it_tab) // 10 rounds for 128-bit key - inv_rnd2( +16(%ebp), crypto_it_tab) - inv_rnd1( +32(%ebp), crypto_it_tab) - inv_rnd2( +48(%ebp), crypto_it_tab) - inv_rnd1( +64(%ebp), crypto_it_tab) - inv_rnd2( +80(%ebp), crypto_it_tab) - inv_rnd1( +96(%ebp), crypto_it_tab) - inv_rnd2(+112(%ebp), crypto_it_tab) - inv_rnd1(+128(%ebp), crypto_it_tab) - inv_rnd2(+144(%ebp), crypto_il_tab) // last round uses a different table - -// move final values to the output array. CAUTION: the -// order of these assigns rely on the register mappings - - add $8,%esp - mov out_blk+12(%esp),%ebp - mov %r5,12(%ebp) - pop %edi - mov %r4,8(%ebp) - pop %esi - mov %r1,4(%ebp) - pop %ebx - mov %r0,(%ebp) - pop %ebp - ret -ENDPROC(aes_dec_blk) diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S deleted file mode 100644 index 8739cf7795de..000000000000 --- a/arch/x86/crypto/aes-x86_64-asm_64.S +++ /dev/null @@ -1,185 +0,0 @@ -/* AES (Rijndael) implementation (FIPS PUB 197) for x86_64 - * - * Copyright (C) 2005 Andreas Steinmetz, <ast@domdv.de> - * - * License: - * This code can be distributed under the terms of the GNU General Public - * License (GPL) Version 2 provided that the above header down to and - * including this sentence is retained in full. - */ - -.extern crypto_ft_tab -.extern crypto_it_tab -.extern crypto_fl_tab -.extern crypto_il_tab - -.text - -#include <linux/linkage.h> -#include <asm/asm-offsets.h> - -#define R1 %rax -#define R1E %eax -#define R1X %ax -#define R1H %ah -#define R1L %al -#define R2 %rbx -#define R2E %ebx -#define R2X %bx -#define R2H %bh -#define R2L %bl -#define R3 %rcx -#define R3E %ecx -#define R3X %cx -#define R3H %ch -#define R3L %cl -#define R4 %rdx -#define R4E %edx -#define R4X %dx -#define R4H %dh -#define R4L %dl -#define R5 %rsi -#define R5E %esi -#define R6 %rdi -#define R6E %edi -#define R7 %r9 /* don't use %rbp; it breaks stack traces */ -#define R7E %r9d -#define R8 %r8 -#define R10 %r10 -#define R11 %r11 - -#define prologue(FUNC,KEY,B128,B192,r1,r2,r5,r6,r7,r8,r9,r10,r11) \ - ENTRY(FUNC); \ - movq r1,r2; \ - leaq KEY+48(r8),r9; \ - movq r10,r11; \ - movl (r7),r5 ## E; \ - movl 4(r7),r1 ## E; \ - movl 8(r7),r6 ## E; \ - movl 12(r7),r7 ## E; \ - movl 480(r8),r10 ## E; \ - xorl -48(r9),r5 ## E; \ - xorl -44(r9),r1 ## E; \ - xorl -40(r9),r6 ## E; \ - xorl -36(r9),r7 ## E; \ - cmpl $24,r10 ## E; \ - jb B128; \ - leaq 32(r9),r9; \ - je B192; \ - leaq 32(r9),r9; - -#define epilogue(FUNC,r1,r2,r5,r6,r7,r8,r9) \ - movq r1,r2; \ - movl r5 ## E,(r9); \ - movl r6 ## E,4(r9); \ - movl r7 ## E,8(r9); \ - movl r8 ## E,12(r9); \ - ret; \ - ENDPROC(FUNC); - -#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \ - movzbl r2 ## H,r5 ## E; \ - movzbl r2 ## L,r6 ## E; \ - movl TAB+1024(,r5,4),r5 ## E;\ - movw r4 ## X,r2 ## X; \ - movl TAB(,r6,4),r6 ## E; \ - roll $16,r2 ## E; \ - shrl $16,r4 ## E; \ - movzbl r4 ## L,r7 ## E; \ - movzbl r4 ## H,r4 ## E; \ - xorl OFFSET(r8),ra ## E; \ - xorl OFFSET+4(r8),rb ## E; \ - xorl TAB+3072(,r4,4),r5 ## E;\ - xorl TAB+2048(,r7,4),r6 ## E;\ - movzbl r1 ## L,r7 ## E; \ - movzbl r1 ## H,r4 ## E; \ - movl TAB+1024(,r4,4),r4 ## E;\ - movw r3 ## X,r1 ## X; \ - roll $16,r1 ## E; \ - shrl $16,r3 ## E; \ - xorl TAB(,r7,4),r5 ## E; \ - movzbl r3 ## L,r7 ## E; \ - movzbl r3 ## H,r3 ## E; \ - xorl TAB+3072(,r3,4),r4 ## E;\ - xorl TAB+2048(,r7,4),r5 ## E;\ - movzbl r1 ## L,r7 ## E; \ - movzbl r1 ## H,r3 ## E; \ - shrl $16,r1 ## E; \ - xorl TAB+3072(,r3,4),r6 ## E;\ - movl TAB+2048(,r7,4),r3 ## E;\ - movzbl r1 ## L,r7 ## E; \ - movzbl r1 ## H,r1 ## E; \ - xorl TAB+1024(,r1,4),r6 ## E;\ - xorl TAB(,r7,4),r3 ## E; \ - movzbl r2 ## H,r1 ## E; \ - movzbl r2 ## L,r7 ## E; \ - shrl $16,r2 ## E; \ - xorl TAB+3072(,r1,4),r3 ## E;\ - xorl TAB+2048(,r7,4),r4 ## E;\ - movzbl r2 ## H,r1 ## E; \ - movzbl r2 ## L,r2 ## E; \ - xorl OFFSET+8(r8),rc ## E; \ - xorl OFFSET+12(r8),rd ## E; \ - xorl TAB+1024(,r1,4),r3 ## E;\ - xorl TAB(,r2,4),r4 ## E; - -#define move_regs(r1,r2,r3,r4) \ - movl r3 ## E,r1 ## E; \ - movl r4 ## E,r2 ## E; - -#define entry(FUNC,KEY,B128,B192) \ - prologue(FUNC,KEY,B128,B192,R2,R8,R1,R3,R4,R6,R10,R5,R11) - -#define return(FUNC) epilogue(FUNC,R8,R2,R5,R6,R3,R4,R11) - -#define encrypt_round(TAB,OFFSET) \ - round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \ - move_regs(R1,R2,R5,R6) - -#define encrypt_final(TAB,OFFSET) \ - round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) - -#define decrypt_round(TAB,OFFSET) \ - round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \ - move_regs(R1,R2,R5,R6) - -#define decrypt_final(TAB,OFFSET) \ - round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) - -/* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */ - - entry(aes_enc_blk,0,.Le128,.Le192) - encrypt_round(crypto_ft_tab,-96) - encrypt_round(crypto_ft_tab,-80) -.Le192: encrypt_round(crypto_ft_tab,-64) - encrypt_round(crypto_ft_tab,-48) -.Le128: encrypt_round(crypto_ft_tab,-32) - encrypt_round(crypto_ft_tab,-16) - encrypt_round(crypto_ft_tab, 0) - encrypt_round(crypto_ft_tab, 16) - encrypt_round(crypto_ft_tab, 32) - encrypt_round(crypto_ft_tab, 48) - encrypt_round(crypto_ft_tab, 64) - encrypt_round(crypto_ft_tab, 80) - encrypt_round(crypto_ft_tab, 96) - encrypt_final(crypto_fl_tab,112) - return(aes_enc_blk) - -/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */ - - entry(aes_dec_blk,240,.Ld128,.Ld192) - decrypt_round(crypto_it_tab,-96) - decrypt_round(crypto_it_tab,-80) -.Ld192: decrypt_round(crypto_it_tab,-64) - decrypt_round(crypto_it_tab,-48) -.Ld128: decrypt_round(crypto_it_tab,-32) - decrypt_round(crypto_it_tab,-16) - decrypt_round(crypto_it_tab, 0) - decrypt_round(crypto_it_tab, 16) - decrypt_round(crypto_it_tab, 32) - decrypt_round(crypto_it_tab, 48) - decrypt_round(crypto_it_tab, 64) - decrypt_round(crypto_it_tab, 80) - decrypt_round(crypto_it_tab, 96) - decrypt_final(crypto_il_tab,112) - return(aes_dec_blk) diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S index 5f6a5af9c489..ec437db1fa54 100644 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -544,11 +544,11 @@ ddq_add_8: * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out, * unsigned int num_bytes) */ -ENTRY(aes_ctr_enc_128_avx_by8) +SYM_FUNC_START(aes_ctr_enc_128_avx_by8) /* call the aes main loop */ do_aes_ctrmain KEY_128 -ENDPROC(aes_ctr_enc_128_avx_by8) +SYM_FUNC_END(aes_ctr_enc_128_avx_by8) /* * routine to do AES192 CTR enc/decrypt "by8" @@ -557,11 +557,11 @@ ENDPROC(aes_ctr_enc_128_avx_by8) * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out, * unsigned int num_bytes) */ -ENTRY(aes_ctr_enc_192_avx_by8) +SYM_FUNC_START(aes_ctr_enc_192_avx_by8) /* call the aes main loop */ do_aes_ctrmain KEY_192 -ENDPROC(aes_ctr_enc_192_avx_by8) +SYM_FUNC_END(aes_ctr_enc_192_avx_by8) /* * routine to do AES256 CTR enc/decrypt "by8" @@ -570,8 +570,8 @@ ENDPROC(aes_ctr_enc_192_avx_by8) * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out, * unsigned int num_bytes) */ -ENTRY(aes_ctr_enc_256_avx_by8) +SYM_FUNC_START(aes_ctr_enc_256_avx_by8) /* call the aes main loop */ do_aes_ctrmain KEY_256 -ENDPROC(aes_ctr_enc_256_avx_by8) +SYM_FUNC_END(aes_ctr_enc_256_avx_by8) diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c index 9e9d819e8bc3..7b7dc05fa1a4 100644 --- a/arch/x86/crypto/aes_glue.c +++ b/arch/x86/crypto/aes_glue.c @@ -1,71 +1 @@ // SPDX-License-Identifier: GPL-2.0-only -/* - * Glue Code for the asm optimized version of the AES Cipher Algorithm - * - */ - -#include <linux/module.h> -#include <crypto/aes.h> -#include <asm/crypto/aes.h> - -asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); -asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); - -void crypto_aes_encrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) -{ - aes_enc_blk(ctx, dst, src); -} -EXPORT_SYMBOL_GPL(crypto_aes_encrypt_x86); - -void crypto_aes_decrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) -{ - aes_dec_blk(ctx, dst, src); -} -EXPORT_SYMBOL_GPL(crypto_aes_decrypt_x86); - -static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - aes_enc_blk(crypto_tfm_ctx(tfm), dst, src); -} - -static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - aes_dec_blk(crypto_tfm_ctx(tfm), dst, src); -} - -static struct crypto_alg aes_alg = { - .cra_name = "aes", - .cra_driver_name = "aes-asm", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx), - .cra_module = THIS_MODULE, - .cra_u = { - .cipher = { - .cia_min_keysize = AES_MIN_KEY_SIZE, - .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = crypto_aes_set_key, - .cia_encrypt = aes_encrypt, - .cia_decrypt = aes_decrypt - } - } -}; - -static int __init aes_init(void) -{ - return crypto_register_alg(&aes_alg); -} - -static void __exit aes_fini(void) -{ - crypto_unregister_alg(&aes_alg); -} - -module_init(aes_init); -module_exit(aes_fini); - -MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, asm optimized"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_CRYPTO("aes"); -MODULE_ALIAS_CRYPTO("aes-asm"); diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index e40bdf024ba7..cad6e1bfa7d5 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -1592,7 +1592,7 @@ _esb_loop_\@: * poly = x^128 + x^127 + x^126 + x^121 + 1 * *****************************************************************************/ -ENTRY(aesni_gcm_dec) +SYM_FUNC_START(aesni_gcm_dec) FUNC_SAVE GCM_INIT %arg6, arg7, arg8, arg9 @@ -1600,7 +1600,7 @@ ENTRY(aesni_gcm_dec) GCM_COMPLETE arg10, arg11 FUNC_RESTORE ret -ENDPROC(aesni_gcm_dec) +SYM_FUNC_END(aesni_gcm_dec) /***************************************************************************** @@ -1680,7 +1680,7 @@ ENDPROC(aesni_gcm_dec) * * poly = x^128 + x^127 + x^126 + x^121 + 1 ***************************************************************************/ -ENTRY(aesni_gcm_enc) +SYM_FUNC_START(aesni_gcm_enc) FUNC_SAVE GCM_INIT %arg6, arg7, arg8, arg9 @@ -1689,7 +1689,7 @@ ENTRY(aesni_gcm_enc) GCM_COMPLETE arg10, arg11 FUNC_RESTORE ret -ENDPROC(aesni_gcm_enc) +SYM_FUNC_END(aesni_gcm_enc) /***************************************************************************** * void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. @@ -1702,12 +1702,12 @@ ENDPROC(aesni_gcm_enc) * const u8 *aad, // Additional Authentication Data (AAD) * u64 aad_len) // Length of AAD in bytes. */ -ENTRY(aesni_gcm_init) +SYM_FUNC_START(aesni_gcm_init) FUNC_SAVE GCM_INIT %arg3, %arg4,%arg5, %arg6 FUNC_RESTORE ret -ENDPROC(aesni_gcm_init) +SYM_FUNC_END(aesni_gcm_init) /***************************************************************************** * void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. @@ -1717,12 +1717,12 @@ ENDPROC(aesni_gcm_init) * const u8 *in, // Plaintext input * u64 plaintext_len, // Length of data in bytes for encryption. */ -ENTRY(aesni_gcm_enc_update) +SYM_FUNC_START(aesni_gcm_enc_update) FUNC_SAVE GCM_ENC_DEC enc FUNC_RESTORE ret -ENDPROC(aesni_gcm_enc_update) +SYM_FUNC_END(aesni_gcm_enc_update) /***************************************************************************** * void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. @@ -1732,12 +1732,12 @@ ENDPROC(aesni_gcm_enc_update) * const u8 *in, // Plaintext input * u64 plaintext_len, // Length of data in bytes for encryption. */ -ENTRY(aesni_gcm_dec_update) +SYM_FUNC_START(aesni_gcm_dec_update) FUNC_SAVE GCM_ENC_DEC dec FUNC_RESTORE ret -ENDPROC(aesni_gcm_dec_update) +SYM_FUNC_END(aesni_gcm_dec_update) /***************************************************************************** * void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. @@ -1747,19 +1747,18 @@ ENDPROC(aesni_gcm_dec_update) * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), * // 12 or 8. */ -ENTRY(aesni_gcm_finalize) +SYM_FUNC_START(aesni_gcm_finalize) FUNC_SAVE GCM_COMPLETE %arg3 %arg4 FUNC_RESTORE ret -ENDPROC(aesni_gcm_finalize) +SYM_FUNC_END(aesni_gcm_finalize) #endif -.align 4 -_key_expansion_128: -_key_expansion_256a: +SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128) +SYM_FUNC_START_LOCAL(_key_expansion_256a) pshufd $0b11111111, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 @@ -1769,11 +1768,10 @@ _key_expansion_256a: movaps %xmm0, (TKEYP) add $0x10, TKEYP ret -ENDPROC(_key_expansion_128) -ENDPROC(_key_expansion_256a) +SYM_FUNC_END(_key_expansion_256a) +SYM_FUNC_END_ALIAS(_key_expansion_128) -.align 4 -_key_expansion_192a: +SYM_FUNC_START_LOCAL(_key_expansion_192a) pshufd $0b01010101, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 @@ -1795,10 +1793,9 @@ _key_expansion_192a: movaps %xmm1, 0x10(TKEYP) add $0x20, TKEYP ret -ENDPROC(_key_expansion_192a) +SYM_FUNC_END(_key_expansion_192a) -.align 4 -_key_expansion_192b: +SYM_FUNC_START_LOCAL(_key_expansion_192b) pshufd $0b01010101, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 @@ -1815,10 +1812,9 @@ _key_expansion_192b: movaps %xmm0, (TKEYP) add $0x10, TKEYP ret -ENDPROC(_key_expansion_192b) +SYM_FUNC_END(_key_expansion_192b) -.align 4 -_key_expansion_256b: +SYM_FUNC_START_LOCAL(_key_expansion_256b) pshufd $0b10101010, %xmm1, %xmm1 shufps $0b00010000, %xmm2, %xmm4 pxor %xmm4, %xmm2 @@ -1828,13 +1824,13 @@ _key_expansion_256b: movaps %xmm2, (TKEYP) add $0x10, TKEYP ret -ENDPROC(_key_expansion_256b) +SYM_FUNC_END(_key_expansion_256b) /* * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, * unsigned int key_len) */ -ENTRY(aesni_set_key) +SYM_FUNC_START(aesni_set_key) FRAME_BEGIN #ifndef __x86_64__ pushl KEYP @@ -1943,12 +1939,12 @@ ENTRY(aesni_set_key) #endif FRAME_END ret -ENDPROC(aesni_set_key) +SYM_FUNC_END(aesni_set_key) /* - * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) + * void aesni_enc(const void *ctx, u8 *dst, const u8 *src) */ -ENTRY(aesni_enc) +SYM_FUNC_START(aesni_enc) FRAME_BEGIN #ifndef __x86_64__ pushl KEYP @@ -1967,7 +1963,7 @@ ENTRY(aesni_enc) #endif FRAME_END ret -ENDPROC(aesni_enc) +SYM_FUNC_END(aesni_enc) /* * _aesni_enc1: internal ABI @@ -1981,8 +1977,7 @@ ENDPROC(aesni_enc) * KEY * TKEYP (T1) */ -.align 4 -_aesni_enc1: +SYM_FUNC_START_LOCAL(_aesni_enc1) movaps (KEYP), KEY # key mov KEYP, TKEYP pxor KEY, STATE # round 0 @@ -2025,7 +2020,7 @@ _aesni_enc1: movaps 0x70(TKEYP), KEY AESENCLAST KEY STATE ret -ENDPROC(_aesni_enc1) +SYM_FUNC_END(_aesni_enc1) /* * _aesni_enc4: internal ABI @@ -2045,8 +2040,7 @@ ENDPROC(_aesni_enc1) * KEY * TKEYP (T1) */ -.align 4 -_aesni_enc4: +SYM_FUNC_START_LOCAL(_aesni_enc4) movaps (KEYP), KEY # key mov KEYP, TKEYP pxor KEY, STATE1 # round 0 @@ -2134,12 +2128,12 @@ _aesni_enc4: AESENCLAST KEY STATE3 AESENCLAST KEY STATE4 ret -ENDPROC(_aesni_enc4) +SYM_FUNC_END(_aesni_enc4) /* - * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) + * void aesni_dec (const void *ctx, u8 *dst, const u8 *src) */ -ENTRY(aesni_dec) +SYM_FUNC_START(aesni_dec) FRAME_BEGIN #ifndef __x86_64__ pushl KEYP @@ -2159,7 +2153,7 @@ ENTRY(aesni_dec) #endif FRAME_END ret -ENDPROC(aesni_dec) +SYM_FUNC_END(aesni_dec) /* * _aesni_dec1: internal ABI @@ -2173,8 +2167,7 @@ ENDPROC(aesni_dec) * KEY * TKEYP (T1) */ -.align 4 -_aesni_dec1: +SYM_FUNC_START_LOCAL(_aesni_dec1) movaps (KEYP), KEY # key mov KEYP, TKEYP pxor KEY, STATE # round 0 @@ -2217,7 +2210,7 @@ _aesni_dec1: movaps 0x70(TKEYP), KEY AESDECLAST KEY STATE ret -ENDPROC(_aesni_dec1) +SYM_FUNC_END(_aesni_dec1) /* * _aesni_dec4: internal ABI @@ -2237,8 +2230,7 @@ ENDPROC(_aesni_dec1) * KEY * TKEYP (T1) */ -.align 4 -_aesni_dec4: +SYM_FUNC_START_LOCAL(_aesni_dec4) movaps (KEYP), KEY # key mov KEYP, TKEYP pxor KEY, STATE1 # round 0 @@ -2326,13 +2318,13 @@ _aesni_dec4: AESDECLAST KEY STATE3 AESDECLAST KEY STATE4 ret -ENDPROC(_aesni_dec4) +SYM_FUNC_END(_aesni_dec4) /* * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len) */ -ENTRY(aesni_ecb_enc) +SYM_FUNC_START(aesni_ecb_enc) FRAME_BEGIN #ifndef __x86_64__ pushl LEN @@ -2386,13 +2378,13 @@ ENTRY(aesni_ecb_enc) #endif FRAME_END ret -ENDPROC(aesni_ecb_enc) +SYM_FUNC_END(aesni_ecb_enc) /* * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len); */ -ENTRY(aesni_ecb_dec) +SYM_FUNC_START(aesni_ecb_dec) FRAME_BEGIN #ifndef __x86_64__ pushl LEN @@ -2447,13 +2439,13 @@ ENTRY(aesni_ecb_dec) #endif FRAME_END ret -ENDPROC(aesni_ecb_dec) +SYM_FUNC_END(aesni_ecb_dec) /* * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len, u8 *iv) */ -ENTRY(aesni_cbc_enc) +SYM_FUNC_START(aesni_cbc_enc) FRAME_BEGIN #ifndef __x86_64__ pushl IVP @@ -2491,13 +2483,13 @@ ENTRY(aesni_cbc_enc) #endif FRAME_END ret -ENDPROC(aesni_cbc_enc) +SYM_FUNC_END(aesni_cbc_enc) /* * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len, u8 *iv) */ -ENTRY(aesni_cbc_dec) +SYM_FUNC_START(aesni_cbc_dec) FRAME_BEGIN #ifndef __x86_64__ pushl IVP @@ -2584,7 +2576,7 @@ ENTRY(aesni_cbc_dec) #endif FRAME_END ret -ENDPROC(aesni_cbc_dec) +SYM_FUNC_END(aesni_cbc_dec) #ifdef __x86_64__ .pushsection .rodata @@ -2604,8 +2596,7 @@ ENDPROC(aesni_cbc_dec) * INC: == 1, in little endian * BSWAP_MASK == endian swapping mask */ -.align 4 -_aesni_inc_init: +SYM_FUNC_START_LOCAL(_aesni_inc_init) movaps .Lbswap_mask, BSWAP_MASK movaps IV, CTR PSHUFB_XMM BSWAP_MASK CTR @@ -2613,7 +2604,7 @@ _aesni_inc_init: MOVQ_R64_XMM TCTR_LOW INC MOVQ_R64_XMM CTR TCTR_LOW ret -ENDPROC(_aesni_inc_init) +SYM_FUNC_END(_aesni_inc_init) /* * _aesni_inc: internal ABI @@ -2630,8 +2621,7 @@ ENDPROC(_aesni_inc_init) * CTR: == output IV, in little endian * TCTR_LOW: == lower qword of CTR */ -.align 4 -_aesni_inc: +SYM_FUNC_START_LOCAL(_aesni_inc) paddq INC, CTR add $1, TCTR_LOW jnc .Linc_low @@ -2642,13 +2632,13 @@ _aesni_inc: movaps CTR, IV PSHUFB_XMM BSWAP_MASK IV ret -ENDPROC(_aesni_inc) +SYM_FUNC_END(_aesni_inc) /* * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len, u8 *iv) */ -ENTRY(aesni_ctr_enc) +SYM_FUNC_START(aesni_ctr_enc) FRAME_BEGIN cmp $16, LEN jb .Lctr_enc_just_ret @@ -2705,7 +2695,7 @@ ENTRY(aesni_ctr_enc) .Lctr_enc_just_ret: FRAME_END ret -ENDPROC(aesni_ctr_enc) +SYM_FUNC_END(aesni_ctr_enc) /* * _aesni_gf128mul_x_ble: internal ABI @@ -2726,10 +2716,10 @@ ENDPROC(aesni_ctr_enc) pxor CTR, IV; /* - * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, - * bool enc, u8 *iv) + * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst, + * const u8 *src, bool enc, le128 *iv) */ -ENTRY(aesni_xts_crypt8) +SYM_FUNC_START(aesni_xts_crypt8) FRAME_BEGIN cmpb $0, %cl movl $0, %ecx @@ -2833,6 +2823,6 @@ ENTRY(aesni_xts_crypt8) FRAME_END ret -ENDPROC(aesni_xts_crypt8) +SYM_FUNC_END(aesni_xts_crypt8) #endif diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 91c039ab5699..bfa1c0b3e5b4 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -1775,12 +1775,12 @@ _initial_blocks_done\@: # const u8 *aad, /* Additional Authentication Data (AAD)*/ # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ ############################################################# -ENTRY(aesni_gcm_init_avx_gen2) +SYM_FUNC_START(aesni_gcm_init_avx_gen2) FUNC_SAVE INIT GHASH_MUL_AVX, PRECOMPUTE_AVX FUNC_RESTORE ret -ENDPROC(aesni_gcm_init_avx_gen2) +SYM_FUNC_END(aesni_gcm_init_avx_gen2) ############################################################################### #void aesni_gcm_enc_update_avx_gen2( @@ -1790,7 +1790,7 @@ ENDPROC(aesni_gcm_init_avx_gen2) # const u8 *in, /* Plaintext input */ # u64 plaintext_len) /* Length of data in Bytes for encryption. */ ############################################################################### -ENTRY(aesni_gcm_enc_update_avx_gen2) +SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2) FUNC_SAVE mov keysize, %eax cmp $32, %eax @@ -1809,7 +1809,7 @@ key_256_enc_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 FUNC_RESTORE ret -ENDPROC(aesni_gcm_enc_update_avx_gen2) +SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2) ############################################################################### #void aesni_gcm_dec_update_avx_gen2( @@ -1819,7 +1819,7 @@ ENDPROC(aesni_gcm_enc_update_avx_gen2) # const u8 *in, /* Ciphertext input */ # u64 plaintext_len) /* Length of data in Bytes for encryption. */ ############################################################################### -ENTRY(aesni_gcm_dec_update_avx_gen2) +SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2) FUNC_SAVE mov keysize,%eax cmp $32, %eax @@ -1838,7 +1838,7 @@ key_256_dec_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 FUNC_RESTORE ret -ENDPROC(aesni_gcm_dec_update_avx_gen2) +SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2) ############################################################################### #void aesni_gcm_finalize_avx_gen2( @@ -1848,7 +1848,7 @@ ENDPROC(aesni_gcm_dec_update_avx_gen2) # u64 auth_tag_len)# /* Authenticated Tag Length in bytes. # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### -ENTRY(aesni_gcm_finalize_avx_gen2) +SYM_FUNC_START(aesni_gcm_finalize_avx_gen2) FUNC_SAVE mov keysize,%eax cmp $32, %eax @@ -1867,7 +1867,7 @@ key_256_finalize: GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4 FUNC_RESTORE ret -ENDPROC(aesni_gcm_finalize_avx_gen2) +SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) #endif /* CONFIG_AS_AVX */ @@ -2746,12 +2746,12 @@ _initial_blocks_done\@: # const u8 *aad, /* Additional Authentication Data (AAD)*/ # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ ############################################################# -ENTRY(aesni_gcm_init_avx_gen4) +SYM_FUNC_START(aesni_gcm_init_avx_gen4) FUNC_SAVE INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 FUNC_RESTORE ret -ENDPROC(aesni_gcm_init_avx_gen4) +SYM_FUNC_END(aesni_gcm_init_avx_gen4) ############################################################################### #void aesni_gcm_enc_avx_gen4( @@ -2761,7 +2761,7 @@ ENDPROC(aesni_gcm_init_avx_gen4) # const u8 *in, /* Plaintext input */ # u64 plaintext_len) /* Length of data in Bytes for encryption. */ ############################################################################### -ENTRY(aesni_gcm_enc_update_avx_gen4) +SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4) FUNC_SAVE mov keysize,%eax cmp $32, %eax @@ -2780,7 +2780,7 @@ key_256_enc_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 FUNC_RESTORE ret -ENDPROC(aesni_gcm_enc_update_avx_gen4) +SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4) ############################################################################### #void aesni_gcm_dec_update_avx_gen4( @@ -2790,7 +2790,7 @@ ENDPROC(aesni_gcm_enc_update_avx_gen4) # const u8 *in, /* Ciphertext input */ # u64 plaintext_len) /* Length of data in Bytes for encryption. */ ############################################################################### -ENTRY(aesni_gcm_dec_update_avx_gen4) +SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4) FUNC_SAVE mov keysize,%eax cmp $32, %eax @@ -2809,7 +2809,7 @@ key_256_dec_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 FUNC_RESTORE ret -ENDPROC(aesni_gcm_dec_update_avx_gen4) +SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4) ############################################################################### #void aesni_gcm_finalize_avx_gen4( @@ -2819,7 +2819,7 @@ ENDPROC(aesni_gcm_dec_update_avx_gen4) # u64 auth_tag_len)# /* Authenticated Tag Length in bytes. # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### -ENTRY(aesni_gcm_finalize_avx_gen4) +SYM_FUNC_START(aesni_gcm_finalize_avx_gen4) FUNC_SAVE mov keysize,%eax cmp $32, %eax @@ -2838,6 +2838,6 @@ key_256_finalize4: GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4 FUNC_RESTORE ret -ENDPROC(aesni_gcm_finalize_avx_gen4) +SYM_FUNC_END(aesni_gcm_finalize_avx_gen4) #endif /* CONFIG_AS_AVX2 */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 73c0ccb009a0..bbbebbd35b5d 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -26,7 +26,6 @@ #include <crypto/gcm.h> #include <crypto/xts.h> #include <asm/cpu_device_id.h> -#include <asm/crypto/aes.h> #include <asm/simd.h> #include <crypto/scatterwalk.h> #include <crypto/internal/aead.h> @@ -84,10 +83,8 @@ struct gcm_context_data { asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, unsigned int key_len); -asmlinkage void aesni_enc(struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in); -asmlinkage void aesni_dec(struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in); +asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in); +asmlinkage void aesni_dec(const void *ctx, u8 *out, const u8 *in); asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len); asmlinkage void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *out, @@ -107,8 +104,8 @@ static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out, asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); -asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, bool enc, u8 *iv); +asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, bool enc, le128 *iv); /* asmlinkage void aesni_gcm_enc() * void *ctx, AES Key schedule. Starts on a 16 byte boundary. @@ -319,17 +316,14 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx, const u8 *in_key, unsigned int key_len) { struct crypto_aes_ctx *ctx = aes_ctx(raw_ctx); - u32 *flags = &tfm->crt_flags; int err; if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + key_len != AES_KEYSIZE_256) return -EINVAL; - } if (!crypto_simd_usable()) - err = crypto_aes_expand_key(ctx, in_key, key_len); + err = aes_expandkey(ctx, in_key, key_len); else { kernel_fpu_begin(); err = aesni_set_key(ctx, in_key, key_len); @@ -345,26 +339,26 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, return aes_set_key_common(tfm, crypto_tfm_ctx(tfm), in_key, key_len); } -static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +static void aesni_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - if (!crypto_simd_usable()) - crypto_aes_encrypt_x86(ctx, dst, src); - else { + if (!crypto_simd_usable()) { + aes_encrypt(ctx, dst, src); + } else { kernel_fpu_begin(); aesni_enc(ctx, dst, src); kernel_fpu_end(); } } -static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +static void aesni_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - if (!crypto_simd_usable()) - crypto_aes_decrypt_x86(ctx, dst, src); - else { + if (!crypto_simd_usable()) { + aes_decrypt(ctx, dst, src); + } else { kernel_fpu_begin(); aesni_dec(ctx, dst, src); kernel_fpu_end(); @@ -551,29 +545,24 @@ static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, } -static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in) +static void aesni_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - aesni_enc(ctx, out, in); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_enc); } -static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec); } -static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec)); + aesni_xts_crypt8(ctx, dst, src, true, iv); } -static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv); -} - -static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv) -{ - aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv); + aesni_xts_crypt8(ctx, dst, src, false, iv); } static const struct common_glue_ctx aesni_enc_xts = { @@ -582,10 +571,10 @@ static const struct common_glue_ctx aesni_enc_xts = { .funcs = { { .num_blocks = 8, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) } + .fn_u = { .xts = aesni_xts_enc8 } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) } + .fn_u = { .xts = aesni_xts_enc } } } }; @@ -595,10 +584,10 @@ static const struct common_glue_ctx aesni_dec_xts = { .funcs = { { .num_blocks = 8, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) } + .fn_u = { .xts = aesni_xts_dec8 } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) } + .fn_u = { .xts = aesni_xts_dec } } } }; @@ -607,10 +596,10 @@ static int xts_encrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&aesni_enc_xts, req, - XTS_TWEAK_CAST(aesni_xts_tweak), + return glue_xts_req_128bit(&aesni_enc_xts, req, aesni_enc, aes_ctx(ctx->raw_tweak_ctx), - aes_ctx(ctx->raw_crypt_ctx)); + aes_ctx(ctx->raw_crypt_ctx), + false); } static int xts_decrypt(struct skcipher_request *req) @@ -618,35 +607,30 @@ static int xts_decrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&aesni_dec_xts, req, - XTS_TWEAK_CAST(aesni_xts_tweak), + return glue_xts_req_128bit(&aesni_dec_xts, req, aesni_enc, aes_ctx(ctx->raw_tweak_ctx), - aes_ctx(ctx->raw_crypt_ctx)); + aes_ctx(ctx->raw_crypt_ctx), + true); } static int rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) { - struct crypto_cipher *tfm; + struct crypto_aes_ctx ctx; int ret; - tfm = crypto_alloc_cipher("aes", 0, 0); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - ret = crypto_cipher_setkey(tfm, key, key_len); + ret = aes_expandkey(&ctx, key, key_len); if (ret) - goto out_free_cipher; + return ret; /* Clear the data in the hash sub key container to zero.*/ /* We want to cipher all zeros to create the hash sub key. */ memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE); - crypto_cipher_encrypt_one(tfm, hash_subkey, hash_subkey); + aes_encrypt(&ctx, hash_subkey, hash_subkey); -out_free_cipher: - crypto_free_cipher(tfm); - return ret; + memzero_explicit(&ctx, sizeof(ctx)); + return 0; } static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, @@ -654,10 +638,9 @@ static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, { struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead); - if (key_len < 4) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (key_len < 4) return -EINVAL; - } + /*Account for 4 byte nonce at the end.*/ key_len -= 4; @@ -919,8 +902,8 @@ static struct crypto_alg aesni_cipher_alg = { .cia_min_keysize = AES_MIN_KEY_SIZE, .cia_max_keysize = AES_MAX_KEY_SIZE, .cia_setkey = aes_set_key, - .cia_encrypt = aes_encrypt, - .cia_decrypt = aes_decrypt + .cia_encrypt = aesni_encrypt, + .cia_decrypt = aesni_decrypt } } }; diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S new file mode 100644 index 000000000000..24910b766bdd --- /dev/null +++ b/arch/x86/crypto/blake2s-core.S @@ -0,0 +1,258 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. + */ + +#include <linux/linkage.h> + +.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32 +.align 32 +IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 + .octa 0x5BE0CD191F83D9AB9B05688C510E527F +.section .rodata.cst16.ROT16, "aM", @progbits, 16 +.align 16 +ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 +.section .rodata.cst16.ROR328, "aM", @progbits, 16 +.align 16 +ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 +.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160 +.align 64 +SIGMA: +.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 +.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7 +.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1 +.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0 +.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8 +.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14 +.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2 +.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 +.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 +.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 +#ifdef CONFIG_AS_AVX512 +.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640 +.align 64 +SIGMA2: +.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 +.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 +.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 +.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 +.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 +.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 +.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 +.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 +.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 +.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 +#endif /* CONFIG_AS_AVX512 */ + +.text +#ifdef CONFIG_AS_SSSE3 +SYM_FUNC_START(blake2s_compress_ssse3) + testq %rdx,%rdx + je .Lendofloop + movdqu (%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqa ROT16(%rip),%xmm12 + movdqa ROR328(%rip),%xmm13 + movdqu 0x20(%rdi),%xmm14 + movq %rcx,%xmm15 + leaq SIGMA+0xa0(%rip),%r8 + jmp .Lbeginofloop + .align 32 +.Lbeginofloop: + movdqa %xmm0,%xmm10 + movdqa %xmm1,%xmm11 + paddq %xmm15,%xmm14 + movdqa IV(%rip),%xmm2 + movdqa %xmm14,%xmm3 + pxor IV+0x10(%rip),%xmm3 + leaq SIGMA(%rip),%rcx +.Lroundloop: + movzbl (%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0x1(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0x2(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x3(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + punpckldq %xmm5,%xmm4 + punpckldq %xmm7,%xmm6 + punpcklqdq %xmm6,%xmm4 + paddd %xmm4,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movzbl 0x4(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0x5(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x6(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0x7(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + punpckldq %xmm6,%xmm5 + punpckldq %xmm4,%xmm7 + punpcklqdq %xmm7,%xmm5 + paddd %xmm5,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x93,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + movzbl 0x8(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x9(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0xa(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0xb(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + punpckldq %xmm7,%xmm6 + punpckldq %xmm5,%xmm4 + punpcklqdq %xmm4,%xmm6 + paddd %xmm6,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movzbl 0xc(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0xd(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0xe(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0xf(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + punpckldq %xmm4,%xmm7 + punpckldq %xmm6,%xmm5 + punpcklqdq %xmm5,%xmm7 + paddd %xmm7,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x93,%xmm2,%xmm2 + addq $0x10,%rcx + cmpq %r8,%rcx + jnz .Lroundloop + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm1 + pxor %xmm10,%xmm0 + pxor %xmm11,%xmm1 + addq $0x40,%rsi + decq %rdx + jnz .Lbeginofloop + movdqu %xmm0,(%rdi) + movdqu %xmm1,0x10(%rdi) + movdqu %xmm14,0x20(%rdi) +.Lendofloop: + ret +SYM_FUNC_END(blake2s_compress_ssse3) +#endif /* CONFIG_AS_SSSE3 */ + +#ifdef CONFIG_AS_AVX512 +SYM_FUNC_START(blake2s_compress_avx512) + vmovdqu (%rdi),%xmm0 + vmovdqu 0x10(%rdi),%xmm1 + vmovdqu 0x20(%rdi),%xmm4 + vmovq %rcx,%xmm5 + vmovdqa IV(%rip),%xmm14 + vmovdqa IV+16(%rip),%xmm15 + jmp .Lblake2s_compress_avx512_mainloop +.align 32 +.Lblake2s_compress_avx512_mainloop: + vmovdqa %xmm0,%xmm10 + vmovdqa %xmm1,%xmm11 + vpaddq %xmm5,%xmm4,%xmm4 + vmovdqa %xmm14,%xmm2 + vpxor %xmm15,%xmm4,%xmm3 + vmovdqu (%rsi),%ymm6 + vmovdqu 0x20(%rsi),%ymm7 + addq $0x40,%rsi + leaq SIGMA2(%rip),%rax + movb $0xa,%cl +.Lblake2s_compress_avx512_roundloop: + addq $0x40,%rax + vmovdqa -0x40(%rax),%ymm8 + vmovdqa -0x20(%rax),%ymm9 + vpermi2d %ymm7,%ymm6,%ymm8 + vpermi2d %ymm7,%ymm6,%ymm9 + vmovdqa %ymm8,%ymm6 + vmovdqa %ymm9,%ymm7 + vpaddd %xmm8,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x10,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0xc,%xmm1,%xmm1 + vextracti128 $0x1,%ymm8,%xmm8 + vpaddd %xmm8,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x8,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0x7,%xmm1,%xmm1 + vpshufd $0x93,%xmm0,%xmm0 + vpshufd $0x4e,%xmm3,%xmm3 + vpshufd $0x39,%xmm2,%xmm2 + vpaddd %xmm9,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x10,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0xc,%xmm1,%xmm1 + vextracti128 $0x1,%ymm9,%xmm9 + vpaddd %xmm9,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x8,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0x7,%xmm1,%xmm1 + vpshufd $0x39,%xmm0,%xmm0 + vpshufd $0x4e,%xmm3,%xmm3 + vpshufd $0x93,%xmm2,%xmm2 + decb %cl + jne .Lblake2s_compress_avx512_roundloop + vpxor %xmm10,%xmm0,%xmm0 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm2,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + decq %rdx + jne .Lblake2s_compress_avx512_mainloop + vmovdqu %xmm0,(%rdi) + vmovdqu %xmm1,0x10(%rdi) + vmovdqu %xmm4,0x20(%rdi) + vzeroupper + retq +SYM_FUNC_END(blake2s_compress_avx512) +#endif /* CONFIG_AS_AVX512 */ diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c new file mode 100644 index 000000000000..06ef2d4a4701 --- /dev/null +++ b/arch/x86/crypto/blake2s-glue.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <crypto/internal/blake2s.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/hash.h> + +#include <linux/types.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/cpufeature.h> +#include <asm/fpu/api.h> +#include <asm/processor.h> +#include <asm/simd.h> + +asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, + const u8 *block, const size_t nblocks, + const u32 inc); +asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, + const u8 *block, const size_t nblocks, + const u32 inc); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); + +void blake2s_compress_arch(struct blake2s_state *state, + const u8 *block, size_t nblocks, + const u32 inc) +{ + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8); + + if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) { + blake2s_compress_generic(state, block, nblocks, inc); + return; + } + + for (;;) { + const size_t blocks = min_t(size_t, nblocks, + PAGE_SIZE / BLAKE2S_BLOCK_SIZE); + + kernel_fpu_begin(); + if (IS_ENABLED(CONFIG_AS_AVX512) && + static_branch_likely(&blake2s_use_avx512)) + blake2s_compress_avx512(state, block, blocks, inc); + else + blake2s_compress_ssse3(state, block, blocks, inc); + kernel_fpu_end(); + + nblocks -= blocks; + if (!nblocks) + break; + block += blocks * BLAKE2S_BLOCK_SIZE; + } +} +EXPORT_SYMBOL(blake2s_compress_arch); + +static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key, + unsigned int keylen) +{ + struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm); + + if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) + return -EINVAL; + + memcpy(tctx->key, key, keylen); + tctx->keylen = keylen; + + return 0; +} + +static int crypto_blake2s_init(struct shash_desc *desc) +{ + struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + struct blake2s_state *state = shash_desc_ctx(desc); + const int outlen = crypto_shash_digestsize(desc->tfm); + + if (tctx->keylen) + blake2s_init_key(state, outlen, tctx->key, tctx->keylen); + else + blake2s_init(state, outlen); + + return 0; +} + +static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in, + unsigned int inlen) +{ + struct blake2s_state *state = shash_desc_ctx(desc); + const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; + + if (unlikely(!inlen)) + return 0; + if (inlen > fill) { + memcpy(state->buf + state->buflen, in, fill); + blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); + state->buflen = 0; + in += fill; + inlen -= fill; + } + if (inlen > BLAKE2S_BLOCK_SIZE) { + const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); + /* Hash one less (full) block than strictly possible */ + blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); + in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); + inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); + } + memcpy(state->buf + state->buflen, in, inlen); + state->buflen += inlen; + + return 0; +} + +static int crypto_blake2s_final(struct shash_desc *desc, u8 *out) +{ + struct blake2s_state *state = shash_desc_ctx(desc); + + blake2s_set_lastblock(state); + memset(state->buf + state->buflen, 0, + BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ + blake2s_compress_arch(state, state->buf, 1, state->buflen); + cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); + memcpy(out, state->h, state->outlen); + memzero_explicit(state, sizeof(*state)); + + return 0; +} + +static struct shash_alg blake2s_algs[] = {{ + .base.cra_name = "blake2s-128", + .base.cra_driver_name = "blake2s-128-x86", + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), + .base.cra_priority = 200, + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = BLAKE2S_128_HASH_SIZE, + .setkey = crypto_blake2s_setkey, + .init = crypto_blake2s_init, + .update = crypto_blake2s_update, + .final = crypto_blake2s_final, + .descsize = sizeof(struct blake2s_state), +}, { + .base.cra_name = "blake2s-160", + .base.cra_driver_name = "blake2s-160-x86", + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), + .base.cra_priority = 200, + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = BLAKE2S_160_HASH_SIZE, + .setkey = crypto_blake2s_setkey, + .init = crypto_blake2s_init, + .update = crypto_blake2s_update, + .final = crypto_blake2s_final, + .descsize = sizeof(struct blake2s_state), +}, { + .base.cra_name = "blake2s-224", + .base.cra_driver_name = "blake2s-224-x86", + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), + .base.cra_priority = 200, + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = BLAKE2S_224_HASH_SIZE, + .setkey = crypto_blake2s_setkey, + .init = crypto_blake2s_init, + .update = crypto_blake2s_update, + .final = crypto_blake2s_final, + .descsize = sizeof(struct blake2s_state), +}, { + .base.cra_name = "blake2s-256", + .base.cra_driver_name = "blake2s-256-x86", + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), + .base.cra_priority = 200, + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = BLAKE2S_256_HASH_SIZE, + .setkey = crypto_blake2s_setkey, + .init = crypto_blake2s_init, + .update = crypto_blake2s_update, + .final = crypto_blake2s_final, + .descsize = sizeof(struct blake2s_state), +}}; + +static int __init blake2s_mod_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_SSSE3)) + return 0; + + static_branch_enable(&blake2s_use_ssse3); + + if (IS_ENABLED(CONFIG_AS_AVX512) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | + XFEATURE_MASK_AVX512, NULL)) + static_branch_enable(&blake2s_use_avx512); + + return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? + crypto_register_shashes(blake2s_algs, + ARRAY_SIZE(blake2s_algs)) : 0; +} + +static void __exit blake2s_mod_exit(void) +{ + if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); +} + +module_init(blake2s_mod_init); +module_exit(blake2s_mod_exit); + +MODULE_ALIAS_CRYPTO("blake2s-128"); +MODULE_ALIAS_CRYPTO("blake2s-128-x86"); +MODULE_ALIAS_CRYPTO("blake2s-160"); +MODULE_ALIAS_CRYPTO("blake2s-160-x86"); +MODULE_ALIAS_CRYPTO("blake2s-224"); +MODULE_ALIAS_CRYPTO("blake2s-224-x86"); +MODULE_ALIAS_CRYPTO("blake2s-256"); +MODULE_ALIAS_CRYPTO("blake2s-256-x86"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S index 330db7a48af8..4222ac6d6584 100644 --- a/arch/x86/crypto/blowfish-x86_64-asm_64.S +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S @@ -103,7 +103,7 @@ bswapq RX0; \ xorq RX0, (RIO); -ENTRY(__blowfish_enc_blk) +SYM_FUNC_START(__blowfish_enc_blk) /* input: * %rdi: ctx * %rsi: dst @@ -139,9 +139,9 @@ ENTRY(__blowfish_enc_blk) .L__enc_xor: xor_block(); ret; -ENDPROC(__blowfish_enc_blk) +SYM_FUNC_END(__blowfish_enc_blk) -ENTRY(blowfish_dec_blk) +SYM_FUNC_START(blowfish_dec_blk) /* input: * %rdi: ctx * %rsi: dst @@ -171,7 +171,7 @@ ENTRY(blowfish_dec_blk) movq %r11, %r12; ret; -ENDPROC(blowfish_dec_blk) +SYM_FUNC_END(blowfish_dec_blk) /********************************************************************** 4-way blowfish, four blocks parallel @@ -283,7 +283,7 @@ ENDPROC(blowfish_dec_blk) bswapq RX3; \ xorq RX3, 24(RIO); -ENTRY(__blowfish_enc_blk_4way) +SYM_FUNC_START(__blowfish_enc_blk_4way) /* input: * %rdi: ctx * %rsi: dst @@ -330,9 +330,9 @@ ENTRY(__blowfish_enc_blk_4way) popq %rbx; popq %r12; ret; -ENDPROC(__blowfish_enc_blk_4way) +SYM_FUNC_END(__blowfish_enc_blk_4way) -ENTRY(blowfish_dec_blk_4way) +SYM_FUNC_START(blowfish_dec_blk_4way) /* input: * %rdi: ctx * %rsi: dst @@ -365,4 +365,4 @@ ENTRY(blowfish_dec_blk_4way) popq %r12; ret; -ENDPROC(blowfish_dec_blk_4way) +SYM_FUNC_END(blowfish_dec_blk_4way) diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index a14af6eb09cb..d01ddd73de65 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -189,20 +189,20 @@ * larger and would only be 0.5% faster (on sandy-bridge). */ .align 8 -roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd: +SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rcx, (%r9)); ret; -ENDPROC(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) +SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) .align 8 -roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: +SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11, %rax, (%r9)); ret; -ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) +SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) /* * IN/OUT: @@ -722,7 +722,7 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .text .align 8 -__camellia_enc_blk16: +SYM_FUNC_START_LOCAL(__camellia_enc_blk16) /* input: * %rdi: ctx, CTX * %rax: temporary storage, 256 bytes @@ -806,10 +806,10 @@ __camellia_enc_blk16: %xmm15, %rax, %rcx, 24); jmp .Lenc_done; -ENDPROC(__camellia_enc_blk16) +SYM_FUNC_END(__camellia_enc_blk16) .align 8 -__camellia_dec_blk16: +SYM_FUNC_START_LOCAL(__camellia_dec_blk16) /* input: * %rdi: ctx, CTX * %rax: temporary storage, 256 bytes @@ -891,9 +891,9 @@ __camellia_dec_blk16: ((key_table + (24) * 8) + 4)(CTX)); jmp .Ldec_max24; -ENDPROC(__camellia_dec_blk16) +SYM_FUNC_END(__camellia_dec_blk16) -ENTRY(camellia_ecb_enc_16way) +SYM_FUNC_START(camellia_ecb_enc_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -916,9 +916,9 @@ ENTRY(camellia_ecb_enc_16way) FRAME_END ret; -ENDPROC(camellia_ecb_enc_16way) +SYM_FUNC_END(camellia_ecb_enc_16way) -ENTRY(camellia_ecb_dec_16way) +SYM_FUNC_START(camellia_ecb_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -946,9 +946,9 @@ ENTRY(camellia_ecb_dec_16way) FRAME_END ret; -ENDPROC(camellia_ecb_dec_16way) +SYM_FUNC_END(camellia_ecb_dec_16way) -ENTRY(camellia_cbc_dec_16way) +SYM_FUNC_START(camellia_cbc_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -997,7 +997,7 @@ ENTRY(camellia_cbc_dec_16way) FRAME_END ret; -ENDPROC(camellia_cbc_dec_16way) +SYM_FUNC_END(camellia_cbc_dec_16way) #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ @@ -1005,7 +1005,7 @@ ENDPROC(camellia_cbc_dec_16way) vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; -ENTRY(camellia_ctr_16way) +SYM_FUNC_START(camellia_ctr_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -1110,7 +1110,7 @@ ENTRY(camellia_ctr_16way) FRAME_END ret; -ENDPROC(camellia_ctr_16way) +SYM_FUNC_END(camellia_ctr_16way) #define gf128mul_x_ble(iv, mask, tmp) \ vpsrad $31, iv, tmp; \ @@ -1120,7 +1120,7 @@ ENDPROC(camellia_ctr_16way) vpxor tmp, iv, iv; .align 8 -camellia_xts_crypt_16way: +SYM_FUNC_START_LOCAL(camellia_xts_crypt_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -1254,9 +1254,9 @@ camellia_xts_crypt_16way: FRAME_END ret; -ENDPROC(camellia_xts_crypt_16way) +SYM_FUNC_END(camellia_xts_crypt_16way) -ENTRY(camellia_xts_enc_16way) +SYM_FUNC_START(camellia_xts_enc_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -1268,9 +1268,9 @@ ENTRY(camellia_xts_enc_16way) leaq __camellia_enc_blk16, %r9; jmp camellia_xts_crypt_16way; -ENDPROC(camellia_xts_enc_16way) +SYM_FUNC_END(camellia_xts_enc_16way) -ENTRY(camellia_xts_dec_16way) +SYM_FUNC_START(camellia_xts_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -1286,4 +1286,4 @@ ENTRY(camellia_xts_dec_16way) leaq __camellia_dec_blk16, %r9; jmp camellia_xts_crypt_16way; -ENDPROC(camellia_xts_dec_16way) +SYM_FUNC_END(camellia_xts_dec_16way) diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 4be4c7c3ba27..563ef6e83cdd 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -223,20 +223,20 @@ * larger and would only marginally faster. */ .align 8 -roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd: +SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rcx, (%r9)); ret; -ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) +SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) .align 8 -roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: +SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3, %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11, %rax, (%r9)); ret; -ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) +SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) /* * IN/OUT: @@ -760,7 +760,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .text .align 8 -__camellia_enc_blk32: +SYM_FUNC_START_LOCAL(__camellia_enc_blk32) /* input: * %rdi: ctx, CTX * %rax: temporary storage, 512 bytes @@ -844,10 +844,10 @@ __camellia_enc_blk32: %ymm15, %rax, %rcx, 24); jmp .Lenc_done; -ENDPROC(__camellia_enc_blk32) +SYM_FUNC_END(__camellia_enc_blk32) .align 8 -__camellia_dec_blk32: +SYM_FUNC_START_LOCAL(__camellia_dec_blk32) /* input: * %rdi: ctx, CTX * %rax: temporary storage, 512 bytes @@ -929,9 +929,9 @@ __camellia_dec_blk32: ((key_table + (24) * 8) + 4)(CTX)); jmp .Ldec_max24; -ENDPROC(__camellia_dec_blk32) +SYM_FUNC_END(__camellia_dec_blk32) -ENTRY(camellia_ecb_enc_32way) +SYM_FUNC_START(camellia_ecb_enc_32way) /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) @@ -958,9 +958,9 @@ ENTRY(camellia_ecb_enc_32way) FRAME_END ret; -ENDPROC(camellia_ecb_enc_32way) +SYM_FUNC_END(camellia_ecb_enc_32way) -ENTRY(camellia_ecb_dec_32way) +SYM_FUNC_START(camellia_ecb_dec_32way) /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) @@ -992,9 +992,9 @@ ENTRY(camellia_ecb_dec_32way) FRAME_END ret; -ENDPROC(camellia_ecb_dec_32way) +SYM_FUNC_END(camellia_ecb_dec_32way) -ENTRY(camellia_cbc_dec_32way) +SYM_FUNC_START(camellia_cbc_dec_32way) /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) @@ -1060,7 +1060,7 @@ ENTRY(camellia_cbc_dec_32way) FRAME_END ret; -ENDPROC(camellia_cbc_dec_32way) +SYM_FUNC_END(camellia_cbc_dec_32way) #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ @@ -1076,7 +1076,7 @@ ENDPROC(camellia_cbc_dec_32way) vpslldq $8, tmp1, tmp1; \ vpsubq tmp1, x, x; -ENTRY(camellia_ctr_32way) +SYM_FUNC_START(camellia_ctr_32way) /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) @@ -1200,7 +1200,7 @@ ENTRY(camellia_ctr_32way) FRAME_END ret; -ENDPROC(camellia_ctr_32way) +SYM_FUNC_END(camellia_ctr_32way) #define gf128mul_x_ble(iv, mask, tmp) \ vpsrad $31, iv, tmp; \ @@ -1222,7 +1222,7 @@ ENDPROC(camellia_ctr_32way) vpxor tmp1, iv, iv; .align 8 -camellia_xts_crypt_32way: +SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way) /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) @@ -1367,9 +1367,9 @@ camellia_xts_crypt_32way: FRAME_END ret; -ENDPROC(camellia_xts_crypt_32way) +SYM_FUNC_END(camellia_xts_crypt_32way) -ENTRY(camellia_xts_enc_32way) +SYM_FUNC_START(camellia_xts_enc_32way) /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) @@ -1382,9 +1382,9 @@ ENTRY(camellia_xts_enc_32way) leaq __camellia_enc_blk32, %r9; jmp camellia_xts_crypt_32way; -ENDPROC(camellia_xts_enc_32way) +SYM_FUNC_END(camellia_xts_enc_32way) -ENTRY(camellia_xts_dec_32way) +SYM_FUNC_START(camellia_xts_dec_32way) /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) @@ -1400,4 +1400,4 @@ ENTRY(camellia_xts_dec_32way) leaq __camellia_dec_blk32, %r9; jmp camellia_xts_crypt_32way; -ENDPROC(camellia_xts_dec_32way) +SYM_FUNC_END(camellia_xts_dec_32way) diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S index 23528bc18fc6..1372e6408850 100644 --- a/arch/x86/crypto/camellia-x86_64-asm_64.S +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -175,7 +175,7 @@ bswapq RAB0; \ movq RAB0, 4*2(RIO); -ENTRY(__camellia_enc_blk) +SYM_FUNC_START(__camellia_enc_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -220,9 +220,9 @@ ENTRY(__camellia_enc_blk) movq RR12, %r12; ret; -ENDPROC(__camellia_enc_blk) +SYM_FUNC_END(__camellia_enc_blk) -ENTRY(camellia_dec_blk) +SYM_FUNC_START(camellia_dec_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -258,7 +258,7 @@ ENTRY(camellia_dec_blk) movq RR12, %r12; ret; -ENDPROC(camellia_dec_blk) +SYM_FUNC_END(camellia_dec_blk) /********************************************************************** 2-way camellia @@ -409,7 +409,7 @@ ENDPROC(camellia_dec_blk) bswapq RAB1; \ movq RAB1, 12*2(RIO); -ENTRY(__camellia_enc_blk_2way) +SYM_FUNC_START(__camellia_enc_blk_2way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -456,9 +456,9 @@ ENTRY(__camellia_enc_blk_2way) movq RR12, %r12; popq %rbx; ret; -ENDPROC(__camellia_enc_blk_2way) +SYM_FUNC_END(__camellia_enc_blk_2way) -ENTRY(camellia_dec_blk_2way) +SYM_FUNC_START(camellia_dec_blk_2way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -496,4 +496,4 @@ ENTRY(camellia_dec_blk_2way) movq RR12, %r12; movq RXOR, %rbx; ret; -ENDPROC(camellia_dec_blk_2way) +SYM_FUNC_END(camellia_dec_blk_2way) diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index abf298c272dc..ccda647422d6 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -19,20 +19,17 @@ #define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32 /* 32-way AVX2/AES-NI parallel cipher functions */ -asmlinkage void camellia_ecb_enc_32way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void camellia_ecb_dec_32way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void camellia_ecb_enc_32way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void camellia_ecb_dec_32way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void camellia_cbc_dec_32way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void camellia_ctr_32way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void camellia_cbc_dec_32way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void camellia_ctr_32way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); -asmlinkage void camellia_xts_enc_32way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void camellia_xts_dec_32way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void camellia_xts_enc_32way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); +asmlinkage void camellia_xts_dec_32way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); static const struct common_glue_ctx camellia_enc = { .num_funcs = 4, @@ -40,16 +37,16 @@ static const struct common_glue_ctx camellia_enc = { .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_32way) } + .fn_u = { .ecb = camellia_ecb_enc_32way } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) } + .fn_u = { .ecb = camellia_ecb_enc_16way } }, { .num_blocks = 2, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) } + .fn_u = { .ecb = camellia_enc_blk_2way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) } + .fn_u = { .ecb = camellia_enc_blk } } } }; @@ -59,16 +56,16 @@ static const struct common_glue_ctx camellia_ctr = { .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_32way) } + .fn_u = { .ctr = camellia_ctr_32way } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) } + .fn_u = { .ctr = camellia_ctr_16way } }, { .num_blocks = 2, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) } + .fn_u = { .ctr = camellia_crypt_ctr_2way } }, { .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) } + .fn_u = { .ctr = camellia_crypt_ctr } } } }; @@ -78,13 +75,13 @@ static const struct common_glue_ctx camellia_enc_xts = { .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_32way) } + .fn_u = { .xts = camellia_xts_enc_32way } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) } + .fn_u = { .xts = camellia_xts_enc_16way } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) } + .fn_u = { .xts = camellia_xts_enc } } } }; @@ -94,16 +91,16 @@ static const struct common_glue_ctx camellia_dec = { .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_32way) } + .fn_u = { .ecb = camellia_ecb_dec_32way } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) } + .fn_u = { .ecb = camellia_ecb_dec_16way } }, { .num_blocks = 2, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) } + .fn_u = { .ecb = camellia_dec_blk_2way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) } + .fn_u = { .ecb = camellia_dec_blk } } } }; @@ -113,16 +110,16 @@ static const struct common_glue_ctx camellia_dec_cbc = { .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_32way) } + .fn_u = { .cbc = camellia_cbc_dec_32way } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) } + .fn_u = { .cbc = camellia_cbc_dec_16way } }, { .num_blocks = 2, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) } + .fn_u = { .cbc = camellia_decrypt_cbc_2way } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } + .fn_u = { .cbc = camellia_dec_blk } } } }; @@ -132,21 +129,20 @@ static const struct common_glue_ctx camellia_dec_xts = { .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_32way) } + .fn_u = { .xts = camellia_xts_dec_32way } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) } + .fn_u = { .xts = camellia_xts_dec_16way } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) } + .fn_u = { .xts = camellia_xts_dec } } } }; static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { - return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen, - &tfm->base.crt_flags); + return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen); } static int ecb_encrypt(struct skcipher_request *req) @@ -161,8 +157,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk), - req); + return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req); } static int cbc_decrypt(struct skcipher_request *req) @@ -180,9 +175,8 @@ static int xts_encrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&camellia_enc_xts, req, - XTS_TWEAK_CAST(camellia_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); + return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk, + &ctx->tweak_ctx, &ctx->crypt_ctx, false); } static int xts_decrypt(struct skcipher_request *req) @@ -190,9 +184,8 @@ static int xts_decrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&camellia_dec_xts, req, - XTS_TWEAK_CAST(camellia_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); + return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk, + &ctx->tweak_ctx, &ctx->crypt_ctx, true); } static struct skcipher_alg camellia_algs[] = { diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 0c22d84750a3..4e5de6ef206e 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -18,41 +18,36 @@ #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 /* 16-way parallel cipher functions (avx/aes-ni) */ -asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void camellia_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_ecb_enc_16way); -asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void camellia_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way); -asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way); -asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void camellia_ctr_16way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); EXPORT_SYMBOL_GPL(camellia_ctr_16way); -asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void camellia_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); EXPORT_SYMBOL_GPL(camellia_xts_enc_16way); -asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void camellia_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); EXPORT_SYMBOL_GPL(camellia_xts_dec_16way); -void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +void camellia_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, - GLUE_FUNC_CAST(camellia_enc_blk)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_enc_blk); } EXPORT_SYMBOL_GPL(camellia_xts_enc); -void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +void camellia_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, - GLUE_FUNC_CAST(camellia_dec_blk)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_dec_blk); } EXPORT_SYMBOL_GPL(camellia_xts_dec); @@ -62,13 +57,13 @@ static const struct common_glue_ctx camellia_enc = { .funcs = { { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) } + .fn_u = { .ecb = camellia_ecb_enc_16way } }, { .num_blocks = 2, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) } + .fn_u = { .ecb = camellia_enc_blk_2way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) } + .fn_u = { .ecb = camellia_enc_blk } } } }; @@ -78,13 +73,13 @@ static const struct common_glue_ctx camellia_ctr = { .funcs = { { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) } + .fn_u = { .ctr = camellia_ctr_16way } }, { .num_blocks = 2, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) } + .fn_u = { .ctr = camellia_crypt_ctr_2way } }, { .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) } + .fn_u = { .ctr = camellia_crypt_ctr } } } }; @@ -94,10 +89,10 @@ static const struct common_glue_ctx camellia_enc_xts = { .funcs = { { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) } + .fn_u = { .xts = camellia_xts_enc_16way } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) } + .fn_u = { .xts = camellia_xts_enc } } } }; @@ -107,13 +102,13 @@ static const struct common_glue_ctx camellia_dec = { .funcs = { { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) } + .fn_u = { .ecb = camellia_ecb_dec_16way } }, { .num_blocks = 2, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) } + .fn_u = { .ecb = camellia_dec_blk_2way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) } + .fn_u = { .ecb = camellia_dec_blk } } } }; @@ -123,13 +118,13 @@ static const struct common_glue_ctx camellia_dec_cbc = { .funcs = { { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) } + .fn_u = { .cbc = camellia_cbc_dec_16way } }, { .num_blocks = 2, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) } + .fn_u = { .cbc = camellia_decrypt_cbc_2way } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } + .fn_u = { .cbc = camellia_dec_blk } } } }; @@ -139,18 +134,17 @@ static const struct common_glue_ctx camellia_dec_xts = { .funcs = { { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) } + .fn_u = { .xts = camellia_xts_dec_16way } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) } + .fn_u = { .xts = camellia_xts_dec } } } }; static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { - return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen, - &tfm->base.crt_flags); + return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen); } static int ecb_encrypt(struct skcipher_request *req) @@ -165,8 +159,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk), - req); + return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req); } static int cbc_decrypt(struct skcipher_request *req) @@ -183,7 +176,6 @@ int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 *flags = &tfm->base.crt_flags; int err; err = xts_verify_key(tfm, key, keylen); @@ -191,13 +183,12 @@ int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, return err; /* first half of xts-key is for crypt */ - err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); + err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2); if (err) return err; /* second half of xts-key is for tweak */ - return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, - flags); + return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); } EXPORT_SYMBOL_GPL(xts_camellia_setkey); @@ -206,9 +197,8 @@ static int xts_encrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&camellia_enc_xts, req, - XTS_TWEAK_CAST(camellia_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); + return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk, + &ctx->tweak_ctx, &ctx->crypt_ctx, false); } static int xts_decrypt(struct skcipher_request *req) @@ -216,9 +206,8 @@ static int xts_decrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&camellia_dec_xts, req, - XTS_TWEAK_CAST(camellia_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); + return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk, + &ctx->tweak_ctx, &ctx->crypt_ctx, true); } static struct skcipher_alg camellia_algs[] = { diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 7c62db56ffe1..242c056e5fa8 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -18,19 +18,17 @@ #include <asm/crypto/glue_helper.h> /* regular block cipher functions */ -asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, bool xor); +asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src, + bool xor); EXPORT_SYMBOL_GPL(__camellia_enc_blk); -asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void camellia_dec_blk(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_dec_blk); /* 2-way parallel cipher functions */ -asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src, bool xor); +asmlinkage void __camellia_enc_blk_2way(const void *ctx, u8 *dst, const u8 *src, + bool xor); EXPORT_SYMBOL_GPL(__camellia_enc_blk_2way); -asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void camellia_dec_blk_2way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_dec_blk_2way); static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) @@ -1229,12 +1227,10 @@ static void camellia_setup192(const unsigned char *key, u64 *subkey) } int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key, - unsigned int key_len, u32 *flags) + unsigned int key_len) { - if (key_len != 16 && key_len != 24 && key_len != 32) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + if (key_len != 16 && key_len != 24 && key_len != 32) return -EINVAL; - } cctx->key_length = key_len; @@ -1257,8 +1253,7 @@ EXPORT_SYMBOL_GPL(__camellia_setkey); static int camellia_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int key_len) { - return __camellia_setkey(crypto_tfm_ctx(tfm), key, key_len, - &tfm->crt_flags); + return __camellia_setkey(crypto_tfm_ctx(tfm), key, key_len); } static int camellia_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, @@ -1267,8 +1262,10 @@ static int camellia_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, return camellia_setkey(&tfm->base, key, key_len); } -void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) +void camellia_decrypt_cbc_2way(const void *ctx, u8 *d, const u8 *s) { + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; u128 iv = *src; camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src); @@ -1277,9 +1274,11 @@ void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) } EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way); -void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) +void camellia_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblk; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; if (dst != src) *dst = *src; @@ -1291,9 +1290,11 @@ void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) } EXPORT_SYMBOL_GPL(camellia_crypt_ctr); -void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, le128 *iv) +void camellia_crypt_ctr_2way(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblks[2]; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; if (dst != src) { dst[0] = src[0]; @@ -1315,10 +1316,10 @@ static const struct common_glue_ctx camellia_enc = { .funcs = { { .num_blocks = 2, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) } + .fn_u = { .ecb = camellia_enc_blk_2way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) } + .fn_u = { .ecb = camellia_enc_blk } } } }; @@ -1328,10 +1329,10 @@ static const struct common_glue_ctx camellia_ctr = { .funcs = { { .num_blocks = 2, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) } + .fn_u = { .ctr = camellia_crypt_ctr_2way } }, { .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) } + .fn_u = { .ctr = camellia_crypt_ctr } } } }; @@ -1341,10 +1342,10 @@ static const struct common_glue_ctx camellia_dec = { .funcs = { { .num_blocks = 2, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) } + .fn_u = { .ecb = camellia_dec_blk_2way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) } + .fn_u = { .ecb = camellia_dec_blk } } } }; @@ -1354,10 +1355,10 @@ static const struct common_glue_ctx camellia_dec_cbc = { .funcs = { { .num_blocks = 2, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) } + .fn_u = { .cbc = camellia_decrypt_cbc_2way } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } + .fn_u = { .cbc = camellia_dec_blk } } } }; @@ -1373,8 +1374,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk), - req); + return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req); } static int cbc_decrypt(struct skcipher_request *req) diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index dc55c3332fcc..8a6181b08b59 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -209,7 +209,7 @@ .text .align 16 -__cast5_enc_blk16: +SYM_FUNC_START_LOCAL(__cast5_enc_blk16) /* input: * %rdi: ctx * RL1: blocks 1 and 2 @@ -280,10 +280,10 @@ __cast5_enc_blk16: outunpack_blocks(RR4, RL4, RTMP, RX, RKM); ret; -ENDPROC(__cast5_enc_blk16) +SYM_FUNC_END(__cast5_enc_blk16) .align 16 -__cast5_dec_blk16: +SYM_FUNC_START_LOCAL(__cast5_dec_blk16) /* input: * %rdi: ctx * RL1: encrypted blocks 1 and 2 @@ -357,9 +357,9 @@ __cast5_dec_blk16: .L__skip_dec: vpsrldq $4, RKR, RKR; jmp .L__dec_tail; -ENDPROC(__cast5_dec_blk16) +SYM_FUNC_END(__cast5_dec_blk16) -ENTRY(cast5_ecb_enc_16way) +SYM_FUNC_START(cast5_ecb_enc_16way) /* input: * %rdi: ctx * %rsi: dst @@ -394,9 +394,9 @@ ENTRY(cast5_ecb_enc_16way) popq %r15; FRAME_END ret; -ENDPROC(cast5_ecb_enc_16way) +SYM_FUNC_END(cast5_ecb_enc_16way) -ENTRY(cast5_ecb_dec_16way) +SYM_FUNC_START(cast5_ecb_dec_16way) /* input: * %rdi: ctx * %rsi: dst @@ -432,9 +432,9 @@ ENTRY(cast5_ecb_dec_16way) popq %r15; FRAME_END ret; -ENDPROC(cast5_ecb_dec_16way) +SYM_FUNC_END(cast5_ecb_dec_16way) -ENTRY(cast5_cbc_dec_16way) +SYM_FUNC_START(cast5_cbc_dec_16way) /* input: * %rdi: ctx * %rsi: dst @@ -484,9 +484,9 @@ ENTRY(cast5_cbc_dec_16way) popq %r12; FRAME_END ret; -ENDPROC(cast5_cbc_dec_16way) +SYM_FUNC_END(cast5_cbc_dec_16way) -ENTRY(cast5_ctr_16way) +SYM_FUNC_START(cast5_ctr_16way) /* input: * %rdi: ctx * %rsi: dst @@ -560,4 +560,4 @@ ENTRY(cast5_ctr_16way) popq %r12; FRAME_END ret; -ENDPROC(cast5_ctr_16way) +SYM_FUNC_END(cast5_ctr_16way) diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index 4f0a7cdb94d9..932a3ce32a88 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -247,7 +247,7 @@ .text .align 8 -__cast6_enc_blk8: +SYM_FUNC_START_LOCAL(__cast6_enc_blk8) /* input: * %rdi: ctx * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks @@ -292,10 +292,10 @@ __cast6_enc_blk8: outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); ret; -ENDPROC(__cast6_enc_blk8) +SYM_FUNC_END(__cast6_enc_blk8) .align 8 -__cast6_dec_blk8: +SYM_FUNC_START_LOCAL(__cast6_dec_blk8) /* input: * %rdi: ctx * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks @@ -339,9 +339,9 @@ __cast6_dec_blk8: outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); ret; -ENDPROC(__cast6_dec_blk8) +SYM_FUNC_END(__cast6_dec_blk8) -ENTRY(cast6_ecb_enc_8way) +SYM_FUNC_START(cast6_ecb_enc_8way) /* input: * %rdi: ctx * %rsi: dst @@ -362,9 +362,9 @@ ENTRY(cast6_ecb_enc_8way) popq %r15; FRAME_END ret; -ENDPROC(cast6_ecb_enc_8way) +SYM_FUNC_END(cast6_ecb_enc_8way) -ENTRY(cast6_ecb_dec_8way) +SYM_FUNC_START(cast6_ecb_dec_8way) /* input: * %rdi: ctx * %rsi: dst @@ -385,9 +385,9 @@ ENTRY(cast6_ecb_dec_8way) popq %r15; FRAME_END ret; -ENDPROC(cast6_ecb_dec_8way) +SYM_FUNC_END(cast6_ecb_dec_8way) -ENTRY(cast6_cbc_dec_8way) +SYM_FUNC_START(cast6_cbc_dec_8way) /* input: * %rdi: ctx * %rsi: dst @@ -411,9 +411,9 @@ ENTRY(cast6_cbc_dec_8way) popq %r12; FRAME_END ret; -ENDPROC(cast6_cbc_dec_8way) +SYM_FUNC_END(cast6_cbc_dec_8way) -ENTRY(cast6_ctr_8way) +SYM_FUNC_START(cast6_ctr_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -439,9 +439,9 @@ ENTRY(cast6_ctr_8way) popq %r12; FRAME_END ret; -ENDPROC(cast6_ctr_8way) +SYM_FUNC_END(cast6_ctr_8way) -ENTRY(cast6_xts_enc_8way) +SYM_FUNC_START(cast6_xts_enc_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -466,9 +466,9 @@ ENTRY(cast6_xts_enc_8way) popq %r15; FRAME_END ret; -ENDPROC(cast6_xts_enc_8way) +SYM_FUNC_END(cast6_xts_enc_8way) -ENTRY(cast6_xts_dec_8way) +SYM_FUNC_START(cast6_xts_dec_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -493,4 +493,4 @@ ENTRY(cast6_xts_dec_8way) popq %r15; FRAME_END ret; -ENDPROC(cast6_xts_dec_8way) +SYM_FUNC_END(cast6_xts_dec_8way) diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 645f8f16815c..48e0f37796fa 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -20,20 +20,17 @@ #define CAST6_PARALLEL_BLOCKS 8 -asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst, - const u8 *src); - -asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src, +asmlinkage void cast6_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void cast6_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src); + +asmlinkage void cast6_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void cast6_ctr_8way(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -asmlinkage void cast6_xts_enc_8way(struct cast6_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void cast6_xts_dec_8way(struct cast6_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void cast6_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); +asmlinkage void cast6_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); static int cast6_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -41,21 +38,21 @@ static int cast6_setkey_skcipher(struct crypto_skcipher *tfm, return cast6_setkey(&tfm->base, key, keylen); } -static void cast6_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void cast6_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, - GLUE_FUNC_CAST(__cast6_encrypt)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_encrypt); } -static void cast6_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void cast6_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, - GLUE_FUNC_CAST(__cast6_decrypt)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_decrypt); } -static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void cast6_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblk; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; le128_to_be128(&ctrblk, iv); le128_inc(iv); @@ -70,10 +67,10 @@ static const struct common_glue_ctx cast6_enc = { .funcs = { { .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) } + .fn_u = { .ecb = cast6_ecb_enc_8way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) } + .fn_u = { .ecb = __cast6_encrypt } } } }; @@ -83,10 +80,10 @@ static const struct common_glue_ctx cast6_ctr = { .funcs = { { .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) } + .fn_u = { .ctr = cast6_ctr_8way } }, { .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) } + .fn_u = { .ctr = cast6_crypt_ctr } } } }; @@ -96,10 +93,10 @@ static const struct common_glue_ctx cast6_enc_xts = { .funcs = { { .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc_8way) } + .fn_u = { .xts = cast6_xts_enc_8way } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc) } + .fn_u = { .xts = cast6_xts_enc } } } }; @@ -109,10 +106,10 @@ static const struct common_glue_ctx cast6_dec = { .funcs = { { .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) } + .fn_u = { .ecb = cast6_ecb_dec_8way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) } + .fn_u = { .ecb = __cast6_decrypt } } } }; @@ -122,10 +119,10 @@ static const struct common_glue_ctx cast6_dec_cbc = { .funcs = { { .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) } + .fn_u = { .cbc = cast6_cbc_dec_8way } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) } + .fn_u = { .cbc = __cast6_decrypt } } } }; @@ -135,10 +132,10 @@ static const struct common_glue_ctx cast6_dec_xts = { .funcs = { { .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec_8way) } + .fn_u = { .xts = cast6_xts_dec_8way } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec) } + .fn_u = { .xts = cast6_xts_dec } } } }; @@ -154,8 +151,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__cast6_encrypt), - req); + return glue_cbc_encrypt_req_128bit(__cast6_encrypt, req); } static int cbc_decrypt(struct skcipher_request *req) @@ -177,7 +173,6 @@ static int xts_cast6_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 *flags = &tfm->base.crt_flags; int err; err = xts_verify_key(tfm, key, keylen); @@ -185,13 +180,12 @@ static int xts_cast6_setkey(struct crypto_skcipher *tfm, const u8 *key, return err; /* first half of xts-key is for crypt */ - err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); + err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2); if (err) return err; /* second half of xts-key is for tweak */ - return __cast6_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, - flags); + return __cast6_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); } static int xts_encrypt(struct skcipher_request *req) @@ -199,9 +193,8 @@ static int xts_encrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&cast6_enc_xts, req, - XTS_TWEAK_CAST(__cast6_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx); + return glue_xts_req_128bit(&cast6_enc_xts, req, __cast6_encrypt, + &ctx->tweak_ctx, &ctx->crypt_ctx, false); } static int xts_decrypt(struct skcipher_request *req) @@ -209,9 +202,8 @@ static int xts_decrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&cast6_dec_xts, req, - XTS_TWEAK_CAST(__cast6_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx); + return glue_xts_req_128bit(&cast6_dec_xts, req, __cast6_encrypt, + &ctx->tweak_ctx, &ctx->crypt_ctx, true); } static struct skcipher_alg cast6_algs[] = { diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S index 831e4434fc20..ee9a40ab4109 100644 --- a/arch/x86/crypto/chacha-avx2-x86_64.S +++ b/arch/x86/crypto/chacha-avx2-x86_64.S @@ -34,7 +34,7 @@ CTR4BL: .octa 0x00000000000000000000000000000002 .text -ENTRY(chacha_2block_xor_avx2) +SYM_FUNC_START(chacha_2block_xor_avx2) # %rdi: Input state matrix, s # %rsi: up to 2 data blocks output, o # %rdx: up to 2 data blocks input, i @@ -224,9 +224,9 @@ ENTRY(chacha_2block_xor_avx2) lea -8(%r10),%rsp jmp .Ldone2 -ENDPROC(chacha_2block_xor_avx2) +SYM_FUNC_END(chacha_2block_xor_avx2) -ENTRY(chacha_4block_xor_avx2) +SYM_FUNC_START(chacha_4block_xor_avx2) # %rdi: Input state matrix, s # %rsi: up to 4 data blocks output, o # %rdx: up to 4 data blocks input, i @@ -529,9 +529,9 @@ ENTRY(chacha_4block_xor_avx2) lea -8(%r10),%rsp jmp .Ldone4 -ENDPROC(chacha_4block_xor_avx2) +SYM_FUNC_END(chacha_4block_xor_avx2) -ENTRY(chacha_8block_xor_avx2) +SYM_FUNC_START(chacha_8block_xor_avx2) # %rdi: Input state matrix, s # %rsi: up to 8 data blocks output, o # %rdx: up to 8 data blocks input, i @@ -1018,4 +1018,4 @@ ENTRY(chacha_8block_xor_avx2) jmp .Ldone8 -ENDPROC(chacha_8block_xor_avx2) +SYM_FUNC_END(chacha_8block_xor_avx2) diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S index 848f9c75fd4f..bb193fde123a 100644 --- a/arch/x86/crypto/chacha-avx512vl-x86_64.S +++ b/arch/x86/crypto/chacha-avx512vl-x86_64.S @@ -24,7 +24,7 @@ CTR8BL: .octa 0x00000003000000020000000100000000 .text -ENTRY(chacha_2block_xor_avx512vl) +SYM_FUNC_START(chacha_2block_xor_avx512vl) # %rdi: Input state matrix, s # %rsi: up to 2 data blocks output, o # %rdx: up to 2 data blocks input, i @@ -187,9 +187,9 @@ ENTRY(chacha_2block_xor_avx512vl) jmp .Ldone2 -ENDPROC(chacha_2block_xor_avx512vl) +SYM_FUNC_END(chacha_2block_xor_avx512vl) -ENTRY(chacha_4block_xor_avx512vl) +SYM_FUNC_START(chacha_4block_xor_avx512vl) # %rdi: Input state matrix, s # %rsi: up to 4 data blocks output, o # %rdx: up to 4 data blocks input, i @@ -453,9 +453,9 @@ ENTRY(chacha_4block_xor_avx512vl) jmp .Ldone4 -ENDPROC(chacha_4block_xor_avx512vl) +SYM_FUNC_END(chacha_4block_xor_avx512vl) -ENTRY(chacha_8block_xor_avx512vl) +SYM_FUNC_START(chacha_8block_xor_avx512vl) # %rdi: Input state matrix, s # %rsi: up to 8 data blocks output, o # %rdx: up to 8 data blocks input, i @@ -833,4 +833,4 @@ ENTRY(chacha_8block_xor_avx512vl) jmp .Ldone8 -ENDPROC(chacha_8block_xor_avx512vl) +SYM_FUNC_END(chacha_8block_xor_avx512vl) diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S index 2d86c7d6dc88..a38ab2512a6f 100644 --- a/arch/x86/crypto/chacha-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha-ssse3-x86_64.S @@ -33,7 +33,7 @@ CTRINC: .octa 0x00000003000000020000000100000000 * * Clobbers: %r8d, %xmm4-%xmm7 */ -chacha_permute: +SYM_FUNC_START_LOCAL(chacha_permute) movdqa ROT8(%rip),%xmm4 movdqa ROT16(%rip),%xmm5 @@ -109,9 +109,9 @@ chacha_permute: jnz .Ldoubleround ret -ENDPROC(chacha_permute) +SYM_FUNC_END(chacha_permute) -ENTRY(chacha_block_xor_ssse3) +SYM_FUNC_START(chacha_block_xor_ssse3) # %rdi: Input state matrix, s # %rsi: up to 1 data block output, o # %rdx: up to 1 data block input, i @@ -197,9 +197,9 @@ ENTRY(chacha_block_xor_ssse3) lea -8(%r10),%rsp jmp .Ldone -ENDPROC(chacha_block_xor_ssse3) +SYM_FUNC_END(chacha_block_xor_ssse3) -ENTRY(hchacha_block_ssse3) +SYM_FUNC_START(hchacha_block_ssse3) # %rdi: Input state matrix, s # %rsi: output (8 32-bit words) # %edx: nrounds @@ -218,9 +218,9 @@ ENTRY(hchacha_block_ssse3) FRAME_END ret -ENDPROC(hchacha_block_ssse3) +SYM_FUNC_END(hchacha_block_ssse3) -ENTRY(chacha_4block_xor_ssse3) +SYM_FUNC_START(chacha_4block_xor_ssse3) # %rdi: Input state matrix, s # %rsi: up to 4 data blocks output, o # %rdx: up to 4 data blocks input, i @@ -788,4 +788,4 @@ ENTRY(chacha_4block_xor_ssse3) jmp .Ldone4 -ENDPROC(chacha_4block_xor_ssse3) +SYM_FUNC_END(chacha_4block_xor_ssse3) diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 388f95a4ec24..68a74953efaf 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -7,7 +7,7 @@ */ #include <crypto/algapi.h> -#include <crypto/chacha.h> +#include <crypto/internal/chacha.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <linux/kernel.h> @@ -21,24 +21,24 @@ asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds); -#ifdef CONFIG_AS_AVX2 + asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); -static bool chacha_use_avx2; -#ifdef CONFIG_AS_AVX512 + asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); -static bool chacha_use_avx512vl; -#endif -#endif + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) { @@ -49,9 +49,8 @@ static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { -#ifdef CONFIG_AS_AVX2 -#ifdef CONFIG_AS_AVX512 - if (chacha_use_avx512vl) { + if (IS_ENABLED(CONFIG_AS_AVX512) && + static_branch_likely(&chacha_use_avx512vl)) { while (bytes >= CHACHA_BLOCK_SIZE * 8) { chacha_8block_xor_avx512vl(state, dst, src, bytes, nrounds); @@ -79,8 +78,9 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, return; } } -#endif - if (chacha_use_avx2) { + + if (IS_ENABLED(CONFIG_AS_AVX2) && + static_branch_likely(&chacha_use_avx2)) { while (bytes >= CHACHA_BLOCK_SIZE * 8) { chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); bytes -= CHACHA_BLOCK_SIZE * 8; @@ -104,7 +104,7 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, return; } } -#endif + while (bytes >= CHACHA_BLOCK_SIZE * 4) { chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); bytes -= CHACHA_BLOCK_SIZE * 4; @@ -123,37 +123,76 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, } } -static int chacha_simd_stream_xor(struct skcipher_walk *walk, +void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) +{ + state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); + + if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) { + hchacha_block_generic(state, stream, nrounds); + } else { + kernel_fpu_begin(); + hchacha_block_ssse3(state, stream, nrounds); + kernel_fpu_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) +{ + state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); + + chacha_init_generic(state, key, iv); +} +EXPORT_SYMBOL(chacha_init_arch); + +void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, + int nrounds) +{ + state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); + + if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() || + bytes <= CHACHA_BLOCK_SIZE) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + kernel_fpu_begin(); + chacha_dosimd(state, dst, src, bytes, nrounds); + kernel_fpu_end(); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +static int chacha_simd_stream_xor(struct skcipher_request *req, const struct chacha_ctx *ctx, const u8 *iv) { u32 *state, state_buf[16 + 2] __aligned(8); - int next_yield = 4096; /* bytes until next FPU yield */ - int err = 0; + struct skcipher_walk walk; + int err; + + err = skcipher_walk_virt(&walk, req, false); BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); - crypto_chacha_init(state, ctx, iv); - - while (walk->nbytes > 0) { - unsigned int nbytes = walk->nbytes; + chacha_init_generic(state, ctx->key, iv); - if (nbytes < walk->total) { - nbytes = round_down(nbytes, walk->stride); - next_yield -= nbytes; - } + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; - chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr, - nbytes, ctx->nrounds); + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); - if (next_yield <= 0) { - /* temporarily allow preemption */ - kernel_fpu_end(); + if (!static_branch_likely(&chacha_use_simd) || + !crypto_simd_usable()) { + chacha_crypt_generic(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, + ctx->nrounds); + } else { kernel_fpu_begin(); - next_yield = 4096; + chacha_dosimd(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, + ctx->nrounds); + kernel_fpu_end(); } - - err = skcipher_walk_done(walk, walk->nbytes - nbytes); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; @@ -163,55 +202,34 @@ static int chacha_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - int err; - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) - return crypto_chacha_crypt(req); - err = skcipher_walk_virt(&walk, req, true); - if (err) - return err; - - kernel_fpu_begin(); - err = chacha_simd_stream_xor(&walk, ctx, req->iv); - kernel_fpu_end(); - return err; + return chacha_simd_stream_xor(req, ctx, req->iv); } static int xchacha_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - struct chacha_ctx subctx; u32 *state, state_buf[16 + 2] __aligned(8); + struct chacha_ctx subctx; u8 real_iv[16]; - int err; - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) - return crypto_xchacha_crypt(req); - - err = skcipher_walk_virt(&walk, req, true); - if (err) - return err; BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); - crypto_chacha_init(state, ctx, req->iv); - - kernel_fpu_begin(); - - hchacha_block_ssse3(state, subctx.key, ctx->nrounds); + chacha_init_generic(state, ctx->key, req->iv); + + if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) { + kernel_fpu_begin(); + hchacha_block_ssse3(state, subctx.key, ctx->nrounds); + kernel_fpu_end(); + } else { + hchacha_block_generic(state, subctx.key, ctx->nrounds); + } subctx.nrounds = ctx->nrounds; memcpy(&real_iv[0], req->iv + 24, 8); memcpy(&real_iv[8], req->iv + 16, 8); - err = chacha_simd_stream_xor(&walk, &subctx, real_iv); - - kernel_fpu_end(); - - return err; + return chacha_simd_stream_xor(req, &subctx, real_iv); } static struct skcipher_alg algs[] = { @@ -227,7 +245,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = CHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, + .setkey = chacha20_setkey, .encrypt = chacha_simd, .decrypt = chacha_simd, }, { @@ -242,7 +260,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, + .setkey = chacha20_setkey, .encrypt = xchacha_simd, .decrypt = xchacha_simd, }, { @@ -257,7 +275,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha12_setkey, + .setkey = chacha12_setkey, .encrypt = xchacha_simd, .decrypt = xchacha_simd, }, @@ -266,24 +284,29 @@ static struct skcipher_alg algs[] = { static int __init chacha_simd_mod_init(void) { if (!boot_cpu_has(X86_FEATURE_SSSE3)) - return -ENODEV; - -#ifdef CONFIG_AS_AVX2 - chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); -#ifdef CONFIG_AS_AVX512 - chacha_use_avx512vl = chacha_use_avx2 && - boot_cpu_has(X86_FEATURE_AVX512VL) && - boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */ -#endif -#endif - return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); + return 0; + + static_branch_enable(&chacha_use_simd); + + if (IS_ENABLED(CONFIG_AS_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { + static_branch_enable(&chacha_use_avx2); + + if (IS_ENABLED(CONFIG_AS_AVX512) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ + static_branch_enable(&chacha_use_avx512vl); + } + return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ? + crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0; } static void __exit chacha_simd_mod_fini(void) { - crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); + if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) && boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } module_init(chacha_simd_mod_init); diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S index 1c099dc08cc3..9fd28ff65bc2 100644 --- a/arch/x86/crypto/crc32-pclmul_asm.S +++ b/arch/x86/crypto/crc32-pclmul_asm.S @@ -103,7 +103,7 @@ * size_t len, uint crc32) */ -ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ +SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ movdqa (BUF), %xmm1 movdqa 0x10(BUF), %xmm2 movdqa 0x20(BUF), %xmm3 @@ -238,4 +238,4 @@ fold_64: PEXTRD 0x01, %xmm1, %eax ret -ENDPROC(crc32_pclmul_le_16) +SYM_FUNC_END(crc32_pclmul_le_16) diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index cb4ab6645106..418bd88acac8 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -94,10 +94,8 @@ static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key, { u32 *mctx = crypto_shash_ctx(hash); - if (keylen != sizeof(u32)) { - crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != sizeof(u32)) return -EINVAL; - } *mctx = le32_to_cpup((__le32 *)key); return 0; } diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index eefa0862f309..c20d1b8a82c3 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -91,10 +91,8 @@ static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key, { u32 *mctx = crypto_shash_ctx(hash); - if (keylen != sizeof(u32)) { - crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != sizeof(u32)) return -EINVAL; - } *mctx = le32_to_cpup((__le32 *)key); return 0; } diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index d9b734d0c8cc..0e6690e3618c 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -74,7 +74,7 @@ # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); .text -ENTRY(crc_pcl) +SYM_FUNC_START(crc_pcl) #define bufp %rdi #define bufp_dw %edi #define bufp_w %di @@ -311,7 +311,7 @@ do_return: popq %rdi popq %rbx ret -ENDPROC(crc_pcl) +SYM_FUNC_END(crc_pcl) .section .rodata, "a", @progbits ################################################################ diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S index 3d873e67749d..b2533d63030e 100644 --- a/arch/x86/crypto/crct10dif-pcl-asm_64.S +++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S @@ -95,7 +95,7 @@ # Assumes len >= 16. # .align 16 -ENTRY(crc_t10dif_pcl) +SYM_FUNC_START(crc_t10dif_pcl) movdqa .Lbswap_mask(%rip), BSWAP_MASK @@ -280,7 +280,7 @@ ENTRY(crc_t10dif_pcl) jge .Lfold_16_bytes_loop # 32 <= len <= 255 add $16, len jmp .Lhandle_partial_segment # 17 <= len <= 31 -ENDPROC(crc_t10dif_pcl) +SYM_FUNC_END(crc_t10dif_pcl) .section .rodata, "a", @progbits .align 16 diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c new file mode 100644 index 000000000000..eec7d2d24239 --- /dev/null +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -0,0 +1,2476 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* + * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved. + * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. + */ + +#include <crypto/curve25519.h> +#include <crypto/internal/kpp.h> + +#include <linux/types.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/cpufeature.h> +#include <asm/processor.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_adx); + +enum { NUM_WORDS_ELTFP25519 = 4 }; +typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519]; +typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519]; + +#define mul_eltfp25519_1w_adx(c, a, b) do { \ + mul_256x256_integer_adx(m.buffer, a, b); \ + red_eltfp25519_1w_adx(c, m.buffer); \ +} while (0) + +#define mul_eltfp25519_1w_bmi2(c, a, b) do { \ + mul_256x256_integer_bmi2(m.buffer, a, b); \ + red_eltfp25519_1w_bmi2(c, m.buffer); \ +} while (0) + +#define sqr_eltfp25519_1w_adx(a) do { \ + sqr_256x256_integer_adx(m.buffer, a); \ + red_eltfp25519_1w_adx(a, m.buffer); \ +} while (0) + +#define sqr_eltfp25519_1w_bmi2(a) do { \ + sqr_256x256_integer_bmi2(m.buffer, a); \ + red_eltfp25519_1w_bmi2(a, m.buffer); \ +} while (0) + +#define mul_eltfp25519_2w_adx(c, a, b) do { \ + mul2_256x256_integer_adx(m.buffer, a, b); \ + red_eltfp25519_2w_adx(c, m.buffer); \ +} while (0) + +#define mul_eltfp25519_2w_bmi2(c, a, b) do { \ + mul2_256x256_integer_bmi2(m.buffer, a, b); \ + red_eltfp25519_2w_bmi2(c, m.buffer); \ +} while (0) + +#define sqr_eltfp25519_2w_adx(a) do { \ + sqr2_256x256_integer_adx(m.buffer, a); \ + red_eltfp25519_2w_adx(a, m.buffer); \ +} while (0) + +#define sqr_eltfp25519_2w_bmi2(a) do { \ + sqr2_256x256_integer_bmi2(m.buffer, a); \ + red_eltfp25519_2w_bmi2(a, m.buffer); \ +} while (0) + +#define sqrn_eltfp25519_1w_adx(a, times) do { \ + int ____counter = (times); \ + while (____counter-- > 0) \ + sqr_eltfp25519_1w_adx(a); \ +} while (0) + +#define sqrn_eltfp25519_1w_bmi2(a, times) do { \ + int ____counter = (times); \ + while (____counter-- > 0) \ + sqr_eltfp25519_1w_bmi2(a); \ +} while (0) + +#define copy_eltfp25519_1w(C, A) do { \ + (C)[0] = (A)[0]; \ + (C)[1] = (A)[1]; \ + (C)[2] = (A)[2]; \ + (C)[3] = (A)[3]; \ +} while (0) + +#define setzero_eltfp25519_1w(C) do { \ + (C)[0] = 0; \ + (C)[1] = 0; \ + (C)[2] = 0; \ + (C)[3] = 0; \ +} while (0) + +__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = { + /* 1 */ 0xfffffffffffffff3UL, 0xffffffffffffffffUL, + 0xffffffffffffffffUL, 0x5fffffffffffffffUL, + /* 2 */ 0x6b8220f416aafe96UL, 0x82ebeb2b4f566a34UL, + 0xd5a9a5b075a5950fUL, 0x5142b2cf4b2488f4UL, + /* 3 */ 0x6aaebc750069680cUL, 0x89cf7820a0f99c41UL, + 0x2a58d9183b56d0f4UL, 0x4b5aca80e36011a4UL, + /* 4 */ 0x329132348c29745dUL, 0xf4a2e616e1642fd7UL, + 0x1e45bb03ff67bc34UL, 0x306912d0f42a9b4aUL, + /* 5 */ 0xff886507e6af7154UL, 0x04f50e13dfeec82fUL, + 0xaa512fe82abab5ceUL, 0x174e251a68d5f222UL, + /* 6 */ 0xcf96700d82028898UL, 0x1743e3370a2c02c5UL, + 0x379eec98b4e86eaaUL, 0x0c59888a51e0482eUL, + /* 7 */ 0xfbcbf1d699b5d189UL, 0xacaef0d58e9fdc84UL, + 0xc1c20d06231f7614UL, 0x2938218da274f972UL, + /* 8 */ 0xf6af49beff1d7f18UL, 0xcc541c22387ac9c2UL, + 0x96fcc9ef4015c56bUL, 0x69c1627c690913a9UL, + /* 9 */ 0x7a86fd2f4733db0eUL, 0xfdb8c4f29e087de9UL, + 0x095e4b1a8ea2a229UL, 0x1ad7a7c829b37a79UL, + /* 10 */ 0x342d89cad17ea0c0UL, 0x67bedda6cced2051UL, + 0x19ca31bf2bb42f74UL, 0x3df7b4c84980acbbUL, + /* 11 */ 0xa8c6444dc80ad883UL, 0xb91e440366e3ab85UL, + 0xc215cda00164f6d8UL, 0x3d867c6ef247e668UL, + /* 12 */ 0xc7dd582bcc3e658cUL, 0xfd2c4748ee0e5528UL, + 0xa0fd9b95cc9f4f71UL, 0x7529d871b0675ddfUL, + /* 13 */ 0xb8f568b42d3cbd78UL, 0x1233011b91f3da82UL, + 0x2dce6ccd4a7c3b62UL, 0x75e7fc8e9e498603UL, + /* 14 */ 0x2f4f13f1fcd0b6ecUL, 0xf1a8ca1f29ff7a45UL, + 0xc249c1a72981e29bUL, 0x6ebe0dbb8c83b56aUL, + /* 15 */ 0x7114fa8d170bb222UL, 0x65a2dcd5bf93935fUL, + 0xbdc41f68b59c979aUL, 0x2f0eef79a2ce9289UL, + /* 16 */ 0x42ecbf0c083c37ceUL, 0x2930bc09ec496322UL, + 0xf294b0c19cfeac0dUL, 0x3780aa4bedfabb80UL, + /* 17 */ 0x56c17d3e7cead929UL, 0xe7cb4beb2e5722c5UL, + 0x0ce931732dbfe15aUL, 0x41b883c7621052f8UL, + /* 18 */ 0xdbf75ca0c3d25350UL, 0x2936be086eb1e351UL, + 0xc936e03cb4a9b212UL, 0x1d45bf82322225aaUL, + /* 19 */ 0xe81ab1036a024cc5UL, 0xe212201c304c9a72UL, + 0xc5d73fba6832b1fcUL, 0x20ffdb5a4d839581UL, + /* 20 */ 0xa283d367be5d0fadUL, 0x6c2b25ca8b164475UL, + 0x9d4935467caaf22eUL, 0x5166408eee85ff49UL, + /* 21 */ 0x3c67baa2fab4e361UL, 0xb3e433c67ef35cefUL, + 0x5259729241159b1cUL, 0x6a621892d5b0ab33UL, + /* 22 */ 0x20b74a387555cdcbUL, 0x532aa10e1208923fUL, + 0xeaa17b7762281dd1UL, 0x61ab3443f05c44bfUL, + /* 23 */ 0x257a6c422324def8UL, 0x131c6c1017e3cf7fUL, + 0x23758739f630a257UL, 0x295a407a01a78580UL, + /* 24 */ 0xf8c443246d5da8d9UL, 0x19d775450c52fa5dUL, + 0x2afcfc92731bf83dUL, 0x7d10c8e81b2b4700UL, + /* 25 */ 0xc8e0271f70baa20bUL, 0x993748867ca63957UL, + 0x5412efb3cb7ed4bbUL, 0x3196d36173e62975UL, + /* 26 */ 0xde5bcad141c7dffcUL, 0x47cc8cd2b395c848UL, + 0xa34cd942e11af3cbUL, 0x0256dbf2d04ecec2UL, + /* 27 */ 0x875ab7e94b0e667fUL, 0xcad4dd83c0850d10UL, + 0x47f12e8f4e72c79fUL, 0x5f1a87bb8c85b19bUL, + /* 28 */ 0x7ae9d0b6437f51b8UL, 0x12c7ce5518879065UL, + 0x2ade09fe5cf77aeeUL, 0x23a05a2f7d2c5627UL, + /* 29 */ 0x5908e128f17c169aUL, 0xf77498dd8ad0852dUL, + 0x74b4c4ceab102f64UL, 0x183abadd10139845UL, + /* 30 */ 0xb165ba8daa92aaacUL, 0xd5c5ef9599386705UL, + 0xbe2f8f0cf8fc40d1UL, 0x2701e635ee204514UL, + /* 31 */ 0x629fa80020156514UL, 0xf223868764a8c1ceUL, + 0x5b894fff0b3f060eUL, 0x60d9944cf708a3faUL, + /* 32 */ 0xaeea001a1c7a201fUL, 0xebf16a633ee2ce63UL, + 0x6f7709594c7a07e1UL, 0x79b958150d0208cbUL, + /* 33 */ 0x24b55e5301d410e7UL, 0xe3a34edff3fdc84dUL, + 0xd88768e4904032d8UL, 0x131384427b3aaeecUL, + /* 34 */ 0x8405e51286234f14UL, 0x14dc4739adb4c529UL, + 0xb8a2b5b250634ffdUL, 0x2fe2a94ad8a7ff93UL, + /* 35 */ 0xec5c57efe843faddUL, 0x2843ce40f0bb9918UL, + 0xa4b561d6cf3d6305UL, 0x743629bde8fb777eUL, + /* 36 */ 0x343edd46bbaf738fUL, 0xed981828b101a651UL, + 0xa401760b882c797aUL, 0x1fc223e28dc88730UL, + /* 37 */ 0x48604e91fc0fba0eUL, 0xb637f78f052c6fa4UL, + 0x91ccac3d09e9239cUL, 0x23f7eed4437a687cUL, + /* 38 */ 0x5173b1118d9bd800UL, 0x29d641b63189d4a7UL, + 0xfdbf177988bbc586UL, 0x2959894fcad81df5UL, + /* 39 */ 0xaebc8ef3b4bbc899UL, 0x4148995ab26992b9UL, + 0x24e20b0134f92cfbUL, 0x40d158894a05dee8UL, + /* 40 */ 0x46b00b1185af76f6UL, 0x26bac77873187a79UL, + 0x3dc0bf95ab8fff5fUL, 0x2a608bd8945524d7UL, + /* 41 */ 0x26449588bd446302UL, 0x7c4bc21c0388439cUL, + 0x8e98a4f383bd11b2UL, 0x26218d7bc9d876b9UL, + /* 42 */ 0xe3081542997c178aUL, 0x3c2d29a86fb6606fUL, + 0x5c217736fa279374UL, 0x7dde05734afeb1faUL, + /* 43 */ 0x3bf10e3906d42babUL, 0xe4f7803e1980649cUL, + 0xe6053bf89595bf7aUL, 0x394faf38da245530UL, + /* 44 */ 0x7a8efb58896928f4UL, 0xfbc778e9cc6a113cUL, + 0x72670ce330af596fUL, 0x48f222a81d3d6cf7UL, + /* 45 */ 0xf01fce410d72caa7UL, 0x5a20ecc7213b5595UL, + 0x7bc21165c1fa1483UL, 0x07f89ae31da8a741UL, + /* 46 */ 0x05d2c2b4c6830ff9UL, 0xd43e330fc6316293UL, + 0xa5a5590a96d3a904UL, 0x705edb91a65333b6UL, + /* 47 */ 0x048ee15e0bb9a5f7UL, 0x3240cfca9e0aaf5dUL, + 0x8f4b71ceedc4a40bUL, 0x621c0da3de544a6dUL, + /* 48 */ 0x92872836a08c4091UL, 0xce8375b010c91445UL, + 0x8a72eb524f276394UL, 0x2667fcfa7ec83635UL, + /* 49 */ 0x7f4c173345e8752aUL, 0x061b47feee7079a5UL, + 0x25dd9afa9f86ff34UL, 0x3780cef5425dc89cUL, + /* 50 */ 0x1a46035a513bb4e9UL, 0x3e1ef379ac575adaUL, + 0xc78c5f1c5fa24b50UL, 0x321a967634fd9f22UL, + /* 51 */ 0x946707b8826e27faUL, 0x3dca84d64c506fd0UL, + 0xc189218075e91436UL, 0x6d9284169b3b8484UL, + /* 52 */ 0x3a67e840383f2ddfUL, 0x33eec9a30c4f9b75UL, + 0x3ec7c86fa783ef47UL, 0x26ec449fbac9fbc4UL, + /* 53 */ 0x5c0f38cba09b9e7dUL, 0x81168cc762a3478cUL, + 0x3e23b0d306fc121cUL, 0x5a238aa0a5efdcddUL, + /* 54 */ 0x1ba26121c4ea43ffUL, 0x36f8c77f7c8832b5UL, + 0x88fbea0b0adcf99aUL, 0x5ca9938ec25bebf9UL, + /* 55 */ 0xd5436a5e51fccda0UL, 0x1dbc4797c2cd893bUL, + 0x19346a65d3224a08UL, 0x0f5034e49b9af466UL, + /* 56 */ 0xf23c3967a1e0b96eUL, 0xe58b08fa867a4d88UL, + 0xfb2fabc6a7341679UL, 0x2a75381eb6026946UL, + /* 57 */ 0xc80a3be4c19420acUL, 0x66b1f6c681f2b6dcUL, + 0x7cf7036761e93388UL, 0x25abbbd8a660a4c4UL, + /* 58 */ 0x91ea12ba14fd5198UL, 0x684950fc4a3cffa9UL, + 0xf826842130f5ad28UL, 0x3ea988f75301a441UL, + /* 59 */ 0xc978109a695f8c6fUL, 0x1746eb4a0530c3f3UL, + 0x444d6d77b4459995UL, 0x75952b8c054e5cc7UL, + /* 60 */ 0xa3703f7915f4d6aaUL, 0x66c346202f2647d8UL, + 0xd01469df811d644bUL, 0x77fea47d81a5d71fUL, + /* 61 */ 0xc5e9529ef57ca381UL, 0x6eeeb4b9ce2f881aUL, + 0xb6e91a28e8009bd6UL, 0x4b80be3e9afc3fecUL, + /* 62 */ 0x7e3773c526aed2c5UL, 0x1b4afcb453c9a49dUL, + 0xa920bdd7baffb24dUL, 0x7c54699f122d400eUL, + /* 63 */ 0xef46c8e14fa94bc8UL, 0xe0b074ce2952ed5eUL, + 0xbea450e1dbd885d5UL, 0x61b68649320f712cUL, + /* 64 */ 0x8a485f7309ccbdd1UL, 0xbd06320d7d4d1a2dUL, + 0x25232973322dbef4UL, 0x445dc4758c17f770UL, + /* 65 */ 0xdb0434177cc8933cUL, 0xed6fe82175ea059fUL, + 0x1efebefdc053db34UL, 0x4adbe867c65daf99UL, + /* 66 */ 0x3acd71a2a90609dfUL, 0xe5e991856dd04050UL, + 0x1ec69b688157c23cUL, 0x697427f6885cfe4dUL, + /* 67 */ 0xd7be7b9b65e1a851UL, 0xa03d28d522c536ddUL, + 0x28399d658fd2b645UL, 0x49e5b7e17c2641e1UL, + /* 68 */ 0x6f8c3a98700457a4UL, 0x5078f0a25ebb6778UL, + 0xd13c3ccbc382960fUL, 0x2e003258a7df84b1UL, + /* 69 */ 0x8ad1f39be6296a1cUL, 0xc1eeaa652a5fbfb2UL, + 0x33ee0673fd26f3cbUL, 0x59256173a69d2cccUL, + /* 70 */ 0x41ea07aa4e18fc41UL, 0xd9fc19527c87a51eUL, + 0xbdaacb805831ca6fUL, 0x445b652dc916694fUL, + /* 71 */ 0xce92a3a7f2172315UL, 0x1edc282de11b9964UL, + 0xa1823aafe04c314aUL, 0x790a2d94437cf586UL, + /* 72 */ 0x71c447fb93f6e009UL, 0x8922a56722845276UL, + 0xbf70903b204f5169UL, 0x2f7a89891ba319feUL, + /* 73 */ 0x02a08eb577e2140cUL, 0xed9a4ed4427bdcf4UL, + 0x5253ec44e4323cd1UL, 0x3e88363c14e9355bUL, + /* 74 */ 0xaa66c14277110b8cUL, 0x1ae0391610a23390UL, + 0x2030bd12c93fc2a2UL, 0x3ee141579555c7abUL, + /* 75 */ 0x9214de3a6d6e7d41UL, 0x3ccdd88607f17efeUL, + 0x674f1288f8e11217UL, 0x5682250f329f93d0UL, + /* 76 */ 0x6cf00b136d2e396eUL, 0x6e4cf86f1014debfUL, + 0x5930b1b5bfcc4e83UL, 0x047069b48aba16b6UL, + /* 77 */ 0x0d4ce4ab69b20793UL, 0xb24db91a97d0fb9eUL, + 0xcdfa50f54e00d01dUL, 0x221b1085368bddb5UL, + /* 78 */ 0xe7e59468b1e3d8d2UL, 0x53c56563bd122f93UL, + 0xeee8a903e0663f09UL, 0x61efa662cbbe3d42UL, + /* 79 */ 0x2cf8ddddde6eab2aUL, 0x9bf80ad51435f231UL, + 0x5deadacec9f04973UL, 0x29275b5d41d29b27UL, + /* 80 */ 0xcfde0f0895ebf14fUL, 0xb9aab96b054905a7UL, + 0xcae80dd9a1c420fdUL, 0x0a63bf2f1673bbc7UL, + /* 81 */ 0x092f6e11958fbc8cUL, 0x672a81e804822fadUL, + 0xcac8351560d52517UL, 0x6f3f7722c8f192f8UL, + /* 82 */ 0xf8ba90ccc2e894b7UL, 0x2c7557a438ff9f0dUL, + 0x894d1d855ae52359UL, 0x68e122157b743d69UL, + /* 83 */ 0xd87e5570cfb919f3UL, 0x3f2cdecd95798db9UL, + 0x2121154710c0a2ceUL, 0x3c66a115246dc5b2UL, + /* 84 */ 0xcbedc562294ecb72UL, 0xba7143c36a280b16UL, + 0x9610c2efd4078b67UL, 0x6144735d946a4b1eUL, + /* 85 */ 0x536f111ed75b3350UL, 0x0211db8c2041d81bUL, + 0xf93cb1000e10413cUL, 0x149dfd3c039e8876UL, + /* 86 */ 0xd479dde46b63155bUL, 0xb66e15e93c837976UL, + 0xdafde43b1f13e038UL, 0x5fafda1a2e4b0b35UL, + /* 87 */ 0x3600bbdf17197581UL, 0x3972050bbe3cd2c2UL, + 0x5938906dbdd5be86UL, 0x34fce5e43f9b860fUL, + /* 88 */ 0x75a8a4cd42d14d02UL, 0x828dabc53441df65UL, + 0x33dcabedd2e131d3UL, 0x3ebad76fb814d25fUL, + /* 89 */ 0xd4906f566f70e10fUL, 0x5d12f7aa51690f5aUL, + 0x45adb16e76cefcf2UL, 0x01f768aead232999UL, + /* 90 */ 0x2b6cc77b6248febdUL, 0x3cd30628ec3aaffdUL, + 0xce1c0b80d4ef486aUL, 0x4c3bff2ea6f66c23UL, + /* 91 */ 0x3f2ec4094aeaeb5fUL, 0x61b19b286e372ca7UL, + 0x5eefa966de2a701dUL, 0x23b20565de55e3efUL, + /* 92 */ 0xe301ca5279d58557UL, 0x07b2d4ce27c2874fUL, + 0xa532cd8a9dcf1d67UL, 0x2a52fee23f2bff56UL, + /* 93 */ 0x8624efb37cd8663dUL, 0xbbc7ac20ffbd7594UL, + 0x57b85e9c82d37445UL, 0x7b3052cb86a6ec66UL, + /* 94 */ 0x3482f0ad2525e91eUL, 0x2cb68043d28edca0UL, + 0xaf4f6d052e1b003aUL, 0x185f8c2529781b0aUL, + /* 95 */ 0xaa41de5bd80ce0d6UL, 0x9407b2416853e9d6UL, + 0x563ec36e357f4c3aUL, 0x4cc4b8dd0e297bceUL, + /* 96 */ 0xa2fc1a52ffb8730eUL, 0x1811f16e67058e37UL, + 0x10f9a366cddf4ee1UL, 0x72f4a0c4a0b9f099UL, + /* 97 */ 0x8c16c06f663f4ea7UL, 0x693b3af74e970fbaUL, + 0x2102e7f1d69ec345UL, 0x0ba53cbc968a8089UL, + /* 98 */ 0xca3d9dc7fea15537UL, 0x4c6824bb51536493UL, + 0xb9886314844006b1UL, 0x40d2a72ab454cc60UL, + /* 99 */ 0x5936a1b712570975UL, 0x91b9d648debda657UL, + 0x3344094bb64330eaUL, 0x006ba10d12ee51d0UL, + /* 100 */ 0x19228468f5de5d58UL, 0x0eb12f4c38cc05b0UL, + 0xa1039f9dd5601990UL, 0x4502d4ce4fff0e0bUL, + /* 101 */ 0xeb2054106837c189UL, 0xd0f6544c6dd3b93cUL, + 0x40727064c416d74fUL, 0x6e15c6114b502ef0UL, + /* 102 */ 0x4df2a398cfb1a76bUL, 0x11256c7419f2f6b1UL, + 0x4a497962066e6043UL, 0x705b3aab41355b44UL, + /* 103 */ 0x365ef536d797b1d8UL, 0x00076bd622ddf0dbUL, + 0x3bbf33b0e0575a88UL, 0x3777aa05c8e4ca4dUL, + /* 104 */ 0x392745c85578db5fUL, 0x6fda4149dbae5ae2UL, + 0xb1f0b00b8adc9867UL, 0x09963437d36f1da3UL, + /* 105 */ 0x7e824e90a5dc3853UL, 0xccb5f6641f135cbdUL, + 0x6736d86c87ce8fccUL, 0x625f3ce26604249fUL, + /* 106 */ 0xaf8ac8059502f63fUL, 0x0c05e70a2e351469UL, + 0x35292e9c764b6305UL, 0x1a394360c7e23ac3UL, + /* 107 */ 0xd5c6d53251183264UL, 0x62065abd43c2b74fUL, + 0xb5fbf5d03b973f9bUL, 0x13a3da3661206e5eUL, + /* 108 */ 0xc6bd5837725d94e5UL, 0x18e30912205016c5UL, + 0x2088ce1570033c68UL, 0x7fba1f495c837987UL, + /* 109 */ 0x5a8c7423f2f9079dUL, 0x1735157b34023fc5UL, + 0xe4f9b49ad2fab351UL, 0x6691ff72c878e33cUL, + /* 110 */ 0x122c2adedc5eff3eUL, 0xf8dd4bf1d8956cf4UL, + 0xeb86205d9e9e5bdaUL, 0x049b92b9d975c743UL, + /* 111 */ 0xa5379730b0f6c05aUL, 0x72a0ffacc6f3a553UL, + 0xb0032c34b20dcd6dUL, 0x470e9dbc88d5164aUL, + /* 112 */ 0xb19cf10ca237c047UL, 0xb65466711f6c81a2UL, + 0xb3321bd16dd80b43UL, 0x48c14f600c5fbe8eUL, + /* 113 */ 0x66451c264aa6c803UL, 0xb66e3904a4fa7da6UL, + 0xd45f19b0b3128395UL, 0x31602627c3c9bc10UL, + /* 114 */ 0x3120dc4832e4e10dUL, 0xeb20c46756c717f7UL, + 0x00f52e3f67280294UL, 0x566d4fc14730c509UL, + /* 115 */ 0x7e3a5d40fd837206UL, 0xc1e926dc7159547aUL, + 0x216730fba68d6095UL, 0x22e8c3843f69cea7UL, + /* 116 */ 0x33d074e8930e4b2bUL, 0xb6e4350e84d15816UL, + 0x5534c26ad6ba2365UL, 0x7773c12f89f1f3f3UL, + /* 117 */ 0x8cba404da57962aaUL, 0x5b9897a81999ce56UL, + 0x508e862f121692fcUL, 0x3a81907fa093c291UL, + /* 118 */ 0x0dded0ff4725a510UL, 0x10d8cc10673fc503UL, + 0x5b9d151c9f1f4e89UL, 0x32a5c1d5cb09a44cUL, + /* 119 */ 0x1e0aa442b90541fbUL, 0x5f85eb7cc1b485dbUL, + 0xbee595ce8a9df2e5UL, 0x25e496c722422236UL, + /* 120 */ 0x5edf3c46cd0fe5b9UL, 0x34e75a7ed2a43388UL, + 0xe488de11d761e352UL, 0x0e878a01a085545cUL, + /* 121 */ 0xba493c77e021bb04UL, 0x2b4d1843c7df899aUL, + 0x9ea37a487ae80d67UL, 0x67a9958011e41794UL, + /* 122 */ 0x4b58051a6697b065UL, 0x47e33f7d8d6ba6d4UL, + 0xbb4da8d483ca46c1UL, 0x68becaa181c2db0dUL, + /* 123 */ 0x8d8980e90b989aa5UL, 0xf95eb14a2c93c99bUL, + 0x51c6c7c4796e73a2UL, 0x6e228363b5efb569UL, + /* 124 */ 0xc6bbc0b02dd624c8UL, 0x777eb47dec8170eeUL, + 0x3cde15a004cfafa9UL, 0x1dc6bc087160bf9bUL, + /* 125 */ 0x2e07e043eec34002UL, 0x18e9fc677a68dc7fUL, + 0xd8da03188bd15b9aUL, 0x48fbc3bb00568253UL, + /* 126 */ 0x57547d4cfb654ce1UL, 0xd3565b82a058e2adUL, + 0xf63eaf0bbf154478UL, 0x47531ef114dfbb18UL, + /* 127 */ 0xe1ec630a4278c587UL, 0x5507d546ca8e83f3UL, + 0x85e135c63adc0c2bUL, 0x0aa7efa85682844eUL, + /* 128 */ 0x72691ba8b3e1f615UL, 0x32b4e9701fbe3ffaUL, + 0x97b6d92e39bb7868UL, 0x2cfe53dea02e39e8UL, + /* 129 */ 0x687392cd85cd52b0UL, 0x27ff66c910e29831UL, + 0x97134556a9832d06UL, 0x269bb0360a84f8a0UL, + /* 130 */ 0x706e55457643f85cUL, 0x3734a48c9b597d1bUL, + 0x7aee91e8c6efa472UL, 0x5cd6abc198a9d9e0UL, + /* 131 */ 0x0e04de06cb3ce41aUL, 0xd8c6eb893402e138UL, + 0x904659bb686e3772UL, 0x7215c371746ba8c8UL, + /* 132 */ 0xfd12a97eeae4a2d9UL, 0x9514b7516394f2c5UL, + 0x266fd5809208f294UL, 0x5c847085619a26b9UL, + /* 133 */ 0x52985410fed694eaUL, 0x3c905b934a2ed254UL, + 0x10bb47692d3be467UL, 0x063b3d2d69e5e9e1UL, + /* 134 */ 0x472726eedda57debUL, 0xefb6c4ae10f41891UL, + 0x2b1641917b307614UL, 0x117c554fc4f45b7cUL, + /* 135 */ 0xc07cf3118f9d8812UL, 0x01dbd82050017939UL, + 0xd7e803f4171b2827UL, 0x1015e87487d225eaUL, + /* 136 */ 0xc58de3fed23acc4dUL, 0x50db91c294a7be2dUL, + 0x0b94d43d1c9cf457UL, 0x6b1640fa6e37524aUL, + /* 137 */ 0x692f346c5fda0d09UL, 0x200b1c59fa4d3151UL, + 0xb8c46f760777a296UL, 0x4b38395f3ffdfbcfUL, + /* 138 */ 0x18d25e00be54d671UL, 0x60d50582bec8aba6UL, + 0x87ad8f263b78b982UL, 0x50fdf64e9cda0432UL, + /* 139 */ 0x90f567aac578dcf0UL, 0xef1e9b0ef2a3133bUL, + 0x0eebba9242d9de71UL, 0x15473c9bf03101c7UL, + /* 140 */ 0x7c77e8ae56b78095UL, 0xb678e7666e6f078eUL, + 0x2da0b9615348ba1fUL, 0x7cf931c1ff733f0bUL, + /* 141 */ 0x26b357f50a0a366cUL, 0xe9708cf42b87d732UL, + 0xc13aeea5f91cb2c0UL, 0x35d90c991143bb4cUL, + /* 142 */ 0x47c1c404a9a0d9dcUL, 0x659e58451972d251UL, + 0x3875a8c473b38c31UL, 0x1fbd9ed379561f24UL, + /* 143 */ 0x11fabc6fd41ec28dUL, 0x7ef8dfe3cd2a2dcaUL, + 0x72e73b5d8c404595UL, 0x6135fa4954b72f27UL, + /* 144 */ 0xccfc32a2de24b69cUL, 0x3f55698c1f095d88UL, + 0xbe3350ed5ac3f929UL, 0x5e9bf806ca477eebUL, + /* 145 */ 0xe9ce8fb63c309f68UL, 0x5376f63565e1f9f4UL, + 0xd1afcfb35a6393f1UL, 0x6632a1ede5623506UL, + /* 146 */ 0x0b7d6c390c2ded4cUL, 0x56cb3281df04cb1fUL, + 0x66305a1249ecc3c7UL, 0x5d588b60a38ca72aUL, + /* 147 */ 0xa6ecbf78e8e5f42dUL, 0x86eeb44b3c8a3eecUL, + 0xec219c48fbd21604UL, 0x1aaf1af517c36731UL, + /* 148 */ 0xc306a2836769bde7UL, 0x208280622b1e2adbUL, + 0x8027f51ffbff94a6UL, 0x76cfa1ce1124f26bUL, + /* 149 */ 0x18eb00562422abb6UL, 0xf377c4d58f8c29c3UL, + 0x4dbbc207f531561aUL, 0x0253b7f082128a27UL, + /* 150 */ 0x3d1f091cb62c17e0UL, 0x4860e1abd64628a9UL, + 0x52d17436309d4253UL, 0x356f97e13efae576UL, + /* 151 */ 0xd351e11aa150535bUL, 0x3e6b45bb1dd878ccUL, + 0x0c776128bed92c98UL, 0x1d34ae93032885b8UL, + /* 152 */ 0x4ba0488ca85ba4c3UL, 0x985348c33c9ce6ceUL, + 0x66124c6f97bda770UL, 0x0f81a0290654124aUL, + /* 153 */ 0x9ed09ca6569b86fdUL, 0x811009fd18af9a2dUL, + 0xff08d03f93d8c20aUL, 0x52a148199faef26bUL, + /* 154 */ 0x3e03f9dc2d8d1b73UL, 0x4205801873961a70UL, + 0xc0d987f041a35970UL, 0x07aa1f15a1c0d549UL, + /* 155 */ 0xdfd46ce08cd27224UL, 0x6d0a024f934e4239UL, + 0x808a7a6399897b59UL, 0x0a4556e9e13d95a2UL, + /* 156 */ 0xd21a991fe9c13045UL, 0x9b0e8548fe7751b8UL, + 0x5da643cb4bf30035UL, 0x77db28d63940f721UL, + /* 157 */ 0xfc5eeb614adc9011UL, 0x5229419ae8c411ebUL, + 0x9ec3e7787d1dcf74UL, 0x340d053e216e4cb5UL, + /* 158 */ 0xcac7af39b48df2b4UL, 0xc0faec2871a10a94UL, + 0x140a69245ca575edUL, 0x0cf1c37134273a4cUL, + /* 159 */ 0xc8ee306ac224b8a5UL, 0x57eaee7ccb4930b0UL, + 0xa1e806bdaacbe74fUL, 0x7d9a62742eeb657dUL, + /* 160 */ 0x9eb6b6ef546c4830UL, 0x885cca1fddb36e2eUL, + 0xe6b9f383ef0d7105UL, 0x58654fef9d2e0412UL, + /* 161 */ 0xa905c4ffbe0e8e26UL, 0x942de5df9b31816eUL, + 0x497d723f802e88e1UL, 0x30684dea602f408dUL, + /* 162 */ 0x21e5a278a3e6cb34UL, 0xaefb6e6f5b151dc4UL, + 0xb30b8e049d77ca15UL, 0x28c3c9cf53b98981UL, + /* 163 */ 0x287fb721556cdd2aUL, 0x0d317ca897022274UL, + 0x7468c7423a543258UL, 0x4a7f11464eb5642fUL, + /* 164 */ 0xa237a4774d193aa6UL, 0xd865986ea92129a1UL, + 0x24c515ecf87c1a88UL, 0x604003575f39f5ebUL, + /* 165 */ 0x47b9f189570a9b27UL, 0x2b98cede465e4b78UL, + 0x026df551dbb85c20UL, 0x74fcd91047e21901UL, + /* 166 */ 0x13e2a90a23c1bfa3UL, 0x0cb0074e478519f6UL, + 0x5ff1cbbe3af6cf44UL, 0x67fe5438be812dbeUL, + /* 167 */ 0xd13cf64fa40f05b0UL, 0x054dfb2f32283787UL, + 0x4173915b7f0d2aeaUL, 0x482f144f1f610d4eUL, + /* 168 */ 0xf6210201b47f8234UL, 0x5d0ae1929e70b990UL, + 0xdcd7f455b049567cUL, 0x7e93d0f1f0916f01UL, + /* 169 */ 0xdd79cbf18a7db4faUL, 0xbe8391bf6f74c62fUL, + 0x027145d14b8291bdUL, 0x585a73ea2cbf1705UL, + /* 170 */ 0x485ca03e928a0db2UL, 0x10fc01a5742857e7UL, + 0x2f482edbd6d551a7UL, 0x0f0433b5048fdb8aUL, + /* 171 */ 0x60da2e8dd7dc6247UL, 0x88b4c9d38cd4819aUL, + 0x13033ac001f66697UL, 0x273b24fe3b367d75UL, + /* 172 */ 0xc6e8f66a31b3b9d4UL, 0x281514a494df49d5UL, + 0xd1726fdfc8b23da7UL, 0x4b3ae7d103dee548UL, + /* 173 */ 0xc6256e19ce4b9d7eUL, 0xff5c5cf186e3c61cUL, + 0xacc63ca34b8ec145UL, 0x74621888fee66574UL, + /* 174 */ 0x956f409645290a1eUL, 0xef0bf8e3263a962eUL, + 0xed6a50eb5ec2647bUL, 0x0694283a9dca7502UL, + /* 175 */ 0x769b963643a2dcd1UL, 0x42b7c8ea09fc5353UL, + 0x4f002aee13397eabUL, 0x63005e2c19b7d63aUL, + /* 176 */ 0xca6736da63023beaUL, 0x966c7f6db12a99b7UL, + 0xace09390c537c5e1UL, 0x0b696063a1aa89eeUL, + /* 177 */ 0xebb03e97288c56e5UL, 0x432a9f9f938c8be8UL, + 0xa6a5a93d5b717f71UL, 0x1a5fb4c3e18f9d97UL, + /* 178 */ 0x1c94e7ad1c60cdceUL, 0xee202a43fc02c4a0UL, + 0x8dafe4d867c46a20UL, 0x0a10263c8ac27b58UL, + /* 179 */ 0xd0dea9dfe4432a4aUL, 0x856af87bbe9277c5UL, + 0xce8472acc212c71aUL, 0x6f151b6d9bbb1e91UL, + /* 180 */ 0x26776c527ceed56aUL, 0x7d211cb7fbf8faecUL, + 0x37ae66a6fd4609ccUL, 0x1f81b702d2770c42UL, + /* 181 */ 0x2fb0b057eac58392UL, 0xe1dd89fe29744e9dUL, + 0xc964f8eb17beb4f8UL, 0x29571073c9a2d41eUL, + /* 182 */ 0xa948a18981c0e254UL, 0x2df6369b65b22830UL, + 0xa33eb2d75fcfd3c6UL, 0x078cd6ec4199a01fUL, + /* 183 */ 0x4a584a41ad900d2fUL, 0x32142b78e2c74c52UL, + 0x68c4e8338431c978UL, 0x7f69ea9008689fc2UL, + /* 184 */ 0x52f2c81e46a38265UL, 0xfd78072d04a832fdUL, + 0x8cd7d5fa25359e94UL, 0x4de71b7454cc29d2UL, + /* 185 */ 0x42eb60ad1eda6ac9UL, 0x0aad37dfdbc09c3aUL, + 0x81004b71e33cc191UL, 0x44e6be345122803cUL, + /* 186 */ 0x03fe8388ba1920dbUL, 0xf5d57c32150db008UL, + 0x49c8c4281af60c29UL, 0x21edb518de701aeeUL, + /* 187 */ 0x7fb63e418f06dc99UL, 0xa4460d99c166d7b8UL, + 0x24dd5248ce520a83UL, 0x5ec3ad712b928358UL, + /* 188 */ 0x15022a5fbd17930fUL, 0xa4f64a77d82570e3UL, + 0x12bc8d6915783712UL, 0x498194c0fc620abbUL, + /* 189 */ 0x38a2d9d255686c82UL, 0x785c6bd9193e21f0UL, + 0xe4d5c81ab24a5484UL, 0x56307860b2e20989UL, + /* 190 */ 0x429d55f78b4d74c4UL, 0x22f1834643350131UL, + 0x1e60c24598c71fffUL, 0x59f2f014979983efUL, + /* 191 */ 0x46a47d56eb494a44UL, 0x3e22a854d636a18eUL, + 0xb346e15274491c3bUL, 0x2ceafd4e5390cde7UL, + /* 192 */ 0xba8a8538be0d6675UL, 0x4b9074bb50818e23UL, + 0xcbdab89085d304c3UL, 0x61a24fe0e56192c4UL, + /* 193 */ 0xcb7615e6db525bcbUL, 0xdd7d8c35a567e4caUL, + 0xe6b4153acafcdd69UL, 0x2d668e097f3c9766UL, + /* 194 */ 0xa57e7e265ce55ef0UL, 0x5d9f4e527cd4b967UL, + 0xfbc83606492fd1e5UL, 0x090d52beb7c3f7aeUL, + /* 195 */ 0x09b9515a1e7b4d7cUL, 0x1f266a2599da44c0UL, + 0xa1c49548e2c55504UL, 0x7ef04287126f15ccUL, + /* 196 */ 0xfed1659dbd30ef15UL, 0x8b4ab9eec4e0277bUL, + 0x884d6236a5df3291UL, 0x1fd96ea6bf5cf788UL, + /* 197 */ 0x42a161981f190d9aUL, 0x61d849507e6052c1UL, + 0x9fe113bf285a2cd5UL, 0x7c22d676dbad85d8UL, + /* 198 */ 0x82e770ed2bfbd27dUL, 0x4c05b2ece996f5a5UL, + 0xcd40a9c2b0900150UL, 0x5895319213d9bf64UL, + /* 199 */ 0xe7cc5d703fea2e08UL, 0xb50c491258e2188cUL, + 0xcce30baa48205bf0UL, 0x537c659ccfa32d62UL, + /* 200 */ 0x37b6623a98cfc088UL, 0xfe9bed1fa4d6aca4UL, + 0x04d29b8e56a8d1b0UL, 0x725f71c40b519575UL, + /* 201 */ 0x28c7f89cd0339ce6UL, 0x8367b14469ddc18bUL, + 0x883ada83a6a1652cUL, 0x585f1974034d6c17UL, + /* 202 */ 0x89cfb266f1b19188UL, 0xe63b4863e7c35217UL, + 0xd88c9da6b4c0526aUL, 0x3e035c9df0954635UL, + /* 203 */ 0xdd9d5412fb45de9dUL, 0xdd684532e4cff40dUL, + 0x4b5c999b151d671cUL, 0x2d8c2cc811e7f690UL, + /* 204 */ 0x7f54be1d90055d40UL, 0xa464c5df464aaf40UL, + 0x33979624f0e917beUL, 0x2c018dc527356b30UL, + /* 205 */ 0xa5415024e330b3d4UL, 0x73ff3d96691652d3UL, + 0x94ec42c4ef9b59f1UL, 0x0747201618d08e5aUL, + /* 206 */ 0x4d6ca48aca411c53UL, 0x66415f2fcfa66119UL, + 0x9c4dd40051e227ffUL, 0x59810bc09a02f7ebUL, + /* 207 */ 0x2a7eb171b3dc101dUL, 0x441c5ab99ffef68eUL, + 0x32025c9b93b359eaUL, 0x5e8ce0a71e9d112fUL, + /* 208 */ 0xbfcccb92429503fdUL, 0xd271ba752f095d55UL, + 0x345ead5e972d091eUL, 0x18c8df11a83103baUL, + /* 209 */ 0x90cd949a9aed0f4cUL, 0xc5d1f4cb6660e37eUL, + 0xb8cac52d56c52e0bUL, 0x6e42e400c5808e0dUL, + /* 210 */ 0xa3b46966eeaefd23UL, 0x0c4f1f0be39ecdcaUL, + 0x189dc8c9d683a51dUL, 0x51f27f054c09351bUL, + /* 211 */ 0x4c487ccd2a320682UL, 0x587ea95bb3df1c96UL, + 0xc8ccf79e555cb8e8UL, 0x547dc829a206d73dUL, + /* 212 */ 0xb822a6cd80c39b06UL, 0xe96d54732000d4c6UL, + 0x28535b6f91463b4dUL, 0x228f4660e2486e1dUL, + /* 213 */ 0x98799538de8d3abfUL, 0x8cd8330045ebca6eUL, + 0x79952a008221e738UL, 0x4322e1a7535cd2bbUL, + /* 214 */ 0xb114c11819d1801cUL, 0x2016e4d84f3f5ec7UL, + 0xdd0e2df409260f4cUL, 0x5ec362c0ae5f7266UL, + /* 215 */ 0xc0462b18b8b2b4eeUL, 0x7cc8d950274d1afbUL, + 0xf25f7105436b02d2UL, 0x43bbf8dcbff9ccd3UL, + /* 216 */ 0xb6ad1767a039e9dfUL, 0xb0714da8f69d3583UL, + 0x5e55fa18b42931f5UL, 0x4ed5558f33c60961UL, + /* 217 */ 0x1fe37901c647a5ddUL, 0x593ddf1f8081d357UL, + 0x0249a4fd813fd7a6UL, 0x69acca274e9caf61UL, + /* 218 */ 0x047ba3ea330721c9UL, 0x83423fc20e7e1ea0UL, + 0x1df4c0af01314a60UL, 0x09a62dab89289527UL, + /* 219 */ 0xa5b325a49cc6cb00UL, 0xe94b5dc654b56cb6UL, + 0x3be28779adc994a0UL, 0x4296e8f8ba3a4aadUL, + /* 220 */ 0x328689761e451eabUL, 0x2e4d598bff59594aUL, + 0x49b96853d7a7084aUL, 0x4980a319601420a8UL, + /* 221 */ 0x9565b9e12f552c42UL, 0x8a5318db7100fe96UL, + 0x05c90b4d43add0d7UL, 0x538b4cd66a5d4edaUL, + /* 222 */ 0xf4e94fc3e89f039fUL, 0x592c9af26f618045UL, + 0x08a36eb5fd4b9550UL, 0x25fffaf6c2ed1419UL, + /* 223 */ 0x34434459cc79d354UL, 0xeeecbfb4b1d5476bUL, + 0xddeb34a061615d99UL, 0x5129cecceb64b773UL, + /* 224 */ 0xee43215894993520UL, 0x772f9c7cf14c0b3bUL, + 0xd2e2fce306bedad5UL, 0x715f42b546f06a97UL, + /* 225 */ 0x434ecdceda5b5f1aUL, 0x0da17115a49741a9UL, + 0x680bd77c73edad2eUL, 0x487c02354edd9041UL, + /* 226 */ 0xb8efeff3a70ed9c4UL, 0x56a32aa3e857e302UL, + 0xdf3a68bd48a2a5a0UL, 0x07f650b73176c444UL, + /* 227 */ 0xe38b9b1626e0ccb1UL, 0x79e053c18b09fb36UL, + 0x56d90319c9f94964UL, 0x1ca941e7ac9ff5c4UL, + /* 228 */ 0x49c4df29162fa0bbUL, 0x8488cf3282b33305UL, + 0x95dfda14cabb437dUL, 0x3391f78264d5ad86UL, + /* 229 */ 0x729ae06ae2b5095dUL, 0xd58a58d73259a946UL, + 0xe9834262d13921edUL, 0x27fedafaa54bb592UL, + /* 230 */ 0xa99dc5b829ad48bbUL, 0x5f025742499ee260UL, + 0x802c8ecd5d7513fdUL, 0x78ceb3ef3f6dd938UL, + /* 231 */ 0xc342f44f8a135d94UL, 0x7b9edb44828cdda3UL, + 0x9436d11a0537cfe7UL, 0x5064b164ec1ab4c8UL, + /* 232 */ 0x7020eccfd37eb2fcUL, 0x1f31ea3ed90d25fcUL, + 0x1b930d7bdfa1bb34UL, 0x5344467a48113044UL, + /* 233 */ 0x70073170f25e6dfbUL, 0xe385dc1a50114cc8UL, + 0x2348698ac8fc4f00UL, 0x2a77a55284dd40d8UL, + /* 234 */ 0xfe06afe0c98c6ce4UL, 0xc235df96dddfd6e4UL, + 0x1428d01e33bf1ed3UL, 0x785768ec9300bdafUL, + /* 235 */ 0x9702e57a91deb63bUL, 0x61bdb8bfe5ce8b80UL, + 0x645b426f3d1d58acUL, 0x4804a82227a557bcUL, + /* 236 */ 0x8e57048ab44d2601UL, 0x68d6501a4b3a6935UL, + 0xc39c9ec3f9e1c293UL, 0x4172f257d4de63e2UL, + /* 237 */ 0xd368b450330c6401UL, 0x040d3017418f2391UL, + 0x2c34bb6090b7d90dUL, 0x16f649228fdfd51fUL, + /* 238 */ 0xbea6818e2b928ef5UL, 0xe28ccf91cdc11e72UL, + 0x594aaa68e77a36cdUL, 0x313034806c7ffd0fUL, + /* 239 */ 0x8a9d27ac2249bd65UL, 0x19a3b464018e9512UL, + 0xc26ccff352b37ec7UL, 0x056f68341d797b21UL, + /* 240 */ 0x5e79d6757efd2327UL, 0xfabdbcb6553afe15UL, + 0xd3e7222c6eaf5a60UL, 0x7046c76d4dae743bUL, + /* 241 */ 0x660be872b18d4a55UL, 0x19992518574e1496UL, + 0xc103053a302bdcbbUL, 0x3ed8e9800b218e8eUL, + /* 242 */ 0x7b0b9239fa75e03eUL, 0xefe9fb684633c083UL, + 0x98a35fbe391a7793UL, 0x6065510fe2d0fe34UL, + /* 243 */ 0x55cb668548abad0cUL, 0xb4584548da87e527UL, + 0x2c43ecea0107c1ddUL, 0x526028809372de35UL, + /* 244 */ 0x3415c56af9213b1fUL, 0x5bee1a4d017e98dbUL, + 0x13f6b105b5cf709bUL, 0x5ff20e3482b29ab6UL, + /* 245 */ 0x0aa29c75cc2e6c90UL, 0xfc7d73ca3a70e206UL, + 0x899fc38fc4b5c515UL, 0x250386b124ffc207UL, + /* 246 */ 0x54ea28d5ae3d2b56UL, 0x9913149dd6de60ceUL, + 0x16694fc58f06d6c1UL, 0x46b23975eb018fc7UL, + /* 247 */ 0x470a6a0fb4b7b4e2UL, 0x5d92475a8f7253deUL, + 0xabeee5b52fbd3adbUL, 0x7fa20801a0806968UL, + /* 248 */ 0x76f3faf19f7714d2UL, 0xb3e840c12f4660c3UL, + 0x0fb4cd8df212744eUL, 0x4b065a251d3a2dd2UL, + /* 249 */ 0x5cebde383d77cd4aUL, 0x6adf39df882c9cb1UL, + 0xa2dd242eb09af759UL, 0x3147c0e50e5f6422UL, + /* 250 */ 0x164ca5101d1350dbUL, 0xf8d13479c33fc962UL, + 0xe640ce4d13e5da08UL, 0x4bdee0c45061f8baUL, + /* 251 */ 0xd7c46dc1a4edb1c9UL, 0x5514d7b6437fd98aUL, + 0x58942f6bb2a1c00bUL, 0x2dffb2ab1d70710eUL, + /* 252 */ 0xccdfcf2fc18b6d68UL, 0xa8ebcba8b7806167UL, + 0x980697f95e2937e3UL, 0x02fbba1cd0126e8cUL +}; + +/* c is two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7] + * a is two 256-bit integers: a0[0:3] and a1[4:7] + * b is two 256-bit integers: b0[0:3] and b1[4:7] + */ +static void mul2_256x256_integer_adx(u64 *const c, const u64 *const a, + const u64 *const b) +{ + asm volatile( + "xorl %%r14d, %%r14d ;" + "movq (%1), %%rdx; " /* A[0] */ + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ + "xorl %%r10d, %%r10d ;" + "movq %%r8, (%0) ;" + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ + "adox %%r10, %%r15 ;" + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ + "adox %%r8, %%rax ;" + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ + "adox %%r10, %%rbx ;" + /******************************************/ + "adox %%r14, %%rcx ;" + + "movq 8(%1), %%rdx; " /* A[1] */ + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ + "adox %%r15, %%r8 ;" + "movq %%r8, 8(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rax ;" + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%rbx ;" + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%rcx ;" + /******************************************/ + "adox %%r14, %%r15 ;" + "adcx %%r14, %%r15 ;" + + "movq 16(%1), %%rdx; " /* A[2] */ + "xorl %%r10d, %%r10d ;" + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ + "adox %%rax, %%r8 ;" + "movq %%r8, 16(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rbx ;" + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%rcx ;" + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%r15 ;" + /******************************************/ + "adox %%r14, %%rax ;" + "adcx %%r14, %%rax ;" + + "movq 24(%1), %%rdx; " /* A[3] */ + "xorl %%r10d, %%r10d ;" + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ + "adox %%rbx, %%r8 ;" + "movq %%r8, 24(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rcx ;" + "movq %%rcx, 32(%0) ;" + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%r15 ;" + "movq %%r15, 40(%0) ;" + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%rax ;" + "movq %%rax, 48(%0) ;" + /******************************************/ + "adox %%r14, %%rbx ;" + "adcx %%r14, %%rbx ;" + "movq %%rbx, 56(%0) ;" + + "movq 32(%1), %%rdx; " /* C[0] */ + "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */ + "xorl %%r10d, %%r10d ;" + "movq %%r8, 64(%0);" + "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ + "adox %%r10, %%r15 ;" + "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ + "adox %%r8, %%rax ;" + "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ + "adox %%r10, %%rbx ;" + /******************************************/ + "adox %%r14, %%rcx ;" + + "movq 40(%1), %%rdx; " /* C[1] */ + "xorl %%r10d, %%r10d ;" + "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ + "adox %%r15, %%r8 ;" + "movq %%r8, 72(%0);" + "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rax ;" + "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%rbx ;" + "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%rcx ;" + /******************************************/ + "adox %%r14, %%r15 ;" + "adcx %%r14, %%r15 ;" + + "movq 48(%1), %%rdx; " /* C[2] */ + "xorl %%r10d, %%r10d ;" + "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ + "adox %%rax, %%r8 ;" + "movq %%r8, 80(%0);" + "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rbx ;" + "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%rcx ;" + "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%r15 ;" + /******************************************/ + "adox %%r14, %%rax ;" + "adcx %%r14, %%rax ;" + + "movq 56(%1), %%rdx; " /* C[3] */ + "xorl %%r10d, %%r10d ;" + "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ + "adox %%rbx, %%r8 ;" + "movq %%r8, 88(%0);" + "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rcx ;" + "movq %%rcx, 96(%0) ;" + "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%r15 ;" + "movq %%r15, 104(%0) ;" + "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%rax ;" + "movq %%rax, 112(%0) ;" + /******************************************/ + "adox %%r14, %%rbx ;" + "adcx %%r14, %%rbx ;" + "movq %%rbx, 120(%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r14", "%r15"); +} + +static void mul2_256x256_integer_bmi2(u64 *const c, const u64 *const a, + const u64 *const b) +{ + asm volatile( + "movq (%1), %%rdx; " /* A[0] */ + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ + "movq %%r8, (%0) ;" + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ + "addq %%r10, %%r15 ;" + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ + "adcq %%r8, %%rax ;" + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ + "adcq %%r10, %%rbx ;" + /******************************************/ + "adcq $0, %%rcx ;" + + "movq 8(%1), %%rdx; " /* A[1] */ + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ + "addq %%r15, %%r8 ;" + "movq %%r8, 8(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%r15 ;" + + "addq %%r9, %%rax ;" + "adcq %%r11, %%rbx ;" + "adcq %%r13, %%rcx ;" + "adcq $0, %%r15 ;" + + "movq 16(%1), %%rdx; " /* A[2] */ + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ + "addq %%rax, %%r8 ;" + "movq %%r8, 16(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rax ;" + + "addq %%r9, %%rbx ;" + "adcq %%r11, %%rcx ;" + "adcq %%r13, %%r15 ;" + "adcq $0, %%rax ;" + + "movq 24(%1), %%rdx; " /* A[3] */ + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ + "addq %%rbx, %%r8 ;" + "movq %%r8, 24(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rbx ;" + + "addq %%r9, %%rcx ;" + "movq %%rcx, 32(%0) ;" + "adcq %%r11, %%r15 ;" + "movq %%r15, 40(%0) ;" + "adcq %%r13, %%rax ;" + "movq %%rax, 48(%0) ;" + "adcq $0, %%rbx ;" + "movq %%rbx, 56(%0) ;" + + "movq 32(%1), %%rdx; " /* C[0] */ + "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */ + "movq %%r8, 64(%0) ;" + "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ + "addq %%r10, %%r15 ;" + "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ + "adcq %%r8, %%rax ;" + "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ + "adcq %%r10, %%rbx ;" + /******************************************/ + "adcq $0, %%rcx ;" + + "movq 40(%1), %%rdx; " /* C[1] */ + "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ + "addq %%r15, %%r8 ;" + "movq %%r8, 72(%0) ;" + "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ + "adcq %%r10, %%r9 ;" + "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ + "adcq %%r8, %%r11 ;" + "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%r15 ;" + + "addq %%r9, %%rax ;" + "adcq %%r11, %%rbx ;" + "adcq %%r13, %%rcx ;" + "adcq $0, %%r15 ;" + + "movq 48(%1), %%rdx; " /* C[2] */ + "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ + "addq %%rax, %%r8 ;" + "movq %%r8, 80(%0) ;" + "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ + "adcq %%r10, %%r9 ;" + "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ + "adcq %%r8, %%r11 ;" + "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rax ;" + + "addq %%r9, %%rbx ;" + "adcq %%r11, %%rcx ;" + "adcq %%r13, %%r15 ;" + "adcq $0, %%rax ;" + + "movq 56(%1), %%rdx; " /* C[3] */ + "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ + "addq %%rbx, %%r8 ;" + "movq %%r8, 88(%0) ;" + "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ + "adcq %%r10, %%r9 ;" + "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ + "adcq %%r8, %%r11 ;" + "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rbx ;" + + "addq %%r9, %%rcx ;" + "movq %%rcx, 96(%0) ;" + "adcq %%r11, %%r15 ;" + "movq %%r15, 104(%0) ;" + "adcq %%r13, %%rax ;" + "movq %%rax, 112(%0) ;" + "adcq $0, %%rbx ;" + "movq %%rbx, 120(%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r15"); +} + +static void sqr2_256x256_integer_adx(u64 *const c, const u64 *const a) +{ + asm volatile( + "movq (%1), %%rdx ;" /* A[0] */ + "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ + "xorl %%r15d, %%r15d;" + "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ + "adcx %%r14, %%r9 ;" + "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ + "adcx %%rax, %%r10 ;" + "movq 24(%1), %%rdx ;" /* A[3] */ + "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */ + "adcx %%rcx, %%r11 ;" + "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ + "adcx %%rax, %%rbx ;" + "movq 8(%1), %%rdx ;" /* A[1] */ + "adcx %%r15, %%r13 ;" + "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ + "movq $0, %%r14 ;" + /******************************************/ + "adcx %%r15, %%r14 ;" + + "xorl %%r15d, %%r15d;" + "adox %%rax, %%r10 ;" + "adcx %%r8, %%r8 ;" + "adox %%rcx, %%r11 ;" + "adcx %%r9, %%r9 ;" + "adox %%r15, %%rbx ;" + "adcx %%r10, %%r10 ;" + "adox %%r15, %%r13 ;" + "adcx %%r11, %%r11 ;" + "adox %%r15, %%r14 ;" + "adcx %%rbx, %%rbx ;" + "adcx %%r13, %%r13 ;" + "adcx %%r14, %%r14 ;" + + "movq (%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ + /*******************/ + "movq %%rax, 0(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 8(%0) ;" + "movq 8(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 16(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 24(%0) ;" + "movq 16(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 32(%0) ;" + "adcq %%rcx, %%rbx ;" + "movq %%rbx, 40(%0) ;" + "movq 24(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 48(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 56(%0) ;" + + + "movq 32(%1), %%rdx ;" /* B[0] */ + "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */ + "xorl %%r15d, %%r15d;" + "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */ + "adcx %%r14, %%r9 ;" + "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */ + "adcx %%rax, %%r10 ;" + "movq 56(%1), %%rdx ;" /* B[3] */ + "mulx 40(%1), %%r11, %%rbx ;" /* B[1]*B[3] */ + "adcx %%rcx, %%r11 ;" + "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */ + "adcx %%rax, %%rbx ;" + "movq 40(%1), %%rdx ;" /* B[1] */ + "adcx %%r15, %%r13 ;" + "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */ + "movq $0, %%r14 ;" + /******************************************/ + "adcx %%r15, %%r14 ;" + + "xorl %%r15d, %%r15d;" + "adox %%rax, %%r10 ;" + "adcx %%r8, %%r8 ;" + "adox %%rcx, %%r11 ;" + "adcx %%r9, %%r9 ;" + "adox %%r15, %%rbx ;" + "adcx %%r10, %%r10 ;" + "adox %%r15, %%r13 ;" + "adcx %%r11, %%r11 ;" + "adox %%r15, %%r14 ;" + "adcx %%rbx, %%rbx ;" + "adcx %%r13, %%r13 ;" + "adcx %%r14, %%r14 ;" + + "movq 32(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */ + /*******************/ + "movq %%rax, 64(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 72(%0) ;" + "movq 40(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 80(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 88(%0) ;" + "movq 48(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 96(%0) ;" + "adcq %%rcx, %%rbx ;" + "movq %%rbx, 104(%0) ;" + "movq 56(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 112(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 120(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r14", "%r15"); +} + +static void sqr2_256x256_integer_bmi2(u64 *const c, const u64 *const a) +{ + asm volatile( + "movq 8(%1), %%rdx ;" /* A[1] */ + "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */ + "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */ + "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */ + + "movq 16(%1), %%rdx ;" /* A[2] */ + "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */ + "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */ + + "addq %%rax, %%r9 ;" + "adcq %%rdx, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq %%r14, %%r15 ;" + "adcq $0, %%r13 ;" + "movq $0, %%r14 ;" + "adcq $0, %%r14 ;" + + "movq (%1), %%rdx ;" /* A[0] */ + "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */ + + "addq %%rax, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq $0, %%r15 ;" + "adcq $0, %%r13 ;" + "adcq $0, %%r14 ;" + + "shldq $1, %%r13, %%r14 ;" + "shldq $1, %%r15, %%r13 ;" + "shldq $1, %%r11, %%r15 ;" + "shldq $1, %%r10, %%r11 ;" + "shldq $1, %%r9, %%r10 ;" + "shldq $1, %%r8, %%r9 ;" + "shlq $1, %%r8 ;" + + /*******************/ + "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */ + /*******************/ + "movq %%rax, 0(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 8(%0) ;" + "movq 8(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 16(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 24(%0) ;" + "movq 16(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 32(%0) ;" + "adcq %%rcx, %%r15 ;" + "movq %%r15, 40(%0) ;" + "movq 24(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 48(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 56(%0) ;" + + "movq 40(%1), %%rdx ;" /* B[1] */ + "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */ + "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */ + "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */ + + "movq 48(%1), %%rdx ;" /* B[2] */ + "mulx 56(%1), %%r15, %%r13 ;" /* B[3]*B[2] */ + "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */ + + "addq %%rax, %%r9 ;" + "adcq %%rdx, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq %%r14, %%r15 ;" + "adcq $0, %%r13 ;" + "movq $0, %%r14 ;" + "adcq $0, %%r14 ;" + + "movq 32(%1), %%rdx ;" /* B[0] */ + "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */ + + "addq %%rax, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq $0, %%r15 ;" + "adcq $0, %%r13 ;" + "adcq $0, %%r14 ;" + + "shldq $1, %%r13, %%r14 ;" + "shldq $1, %%r15, %%r13 ;" + "shldq $1, %%r11, %%r15 ;" + "shldq $1, %%r10, %%r11 ;" + "shldq $1, %%r9, %%r10 ;" + "shldq $1, %%r8, %%r9 ;" + "shlq $1, %%r8 ;" + + /*******************/ + "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */ + /*******************/ + "movq %%rax, 64(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 72(%0) ;" + "movq 40(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 80(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 88(%0) ;" + "movq 48(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 96(%0) ;" + "adcq %%rcx, %%r15 ;" + "movq %%r15, 104(%0) ;" + "movq 56(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 112(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 120(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11", "%r13", "%r14", "%r15"); +} + +static void red_eltfp25519_2w_adx(u64 *const c, const u64 *const a) +{ + asm volatile( + "movl $38, %%edx; " /* 2*c = 38 = 2^256 */ + "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */ + "xorl %%ebx, %%ebx ;" + "adox (%1), %%r8 ;" + "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */ + "adcx %%r10, %%r9 ;" + "adox 8(%1), %%r9 ;" + "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */ + "adcx %%r11, %%r10 ;" + "adox 16(%1), %%r10 ;" + "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */ + "adcx %%rax, %%r11 ;" + "adox 24(%1), %%r11 ;" + /***************************************/ + "adcx %%rbx, %%rcx ;" + "adox %%rbx, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ + "adcx %%rcx, %%r8 ;" + "adcx %%rbx, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcx %%rbx, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcx %%rbx, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + + "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */ + "xorl %%ebx, %%ebx ;" + "adox 64(%1), %%r8 ;" + "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */ + "adcx %%r10, %%r9 ;" + "adox 72(%1), %%r9 ;" + "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */ + "adcx %%r11, %%r10 ;" + "adox 80(%1), %%r10 ;" + "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */ + "adcx %%rax, %%r11 ;" + "adox 88(%1), %%r11 ;" + /****************************************/ + "adcx %%rbx, %%rcx ;" + "adox %%rbx, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ + "adcx %%rcx, %%r8 ;" + "adcx %%rbx, %%r9 ;" + "movq %%r9, 40(%0) ;" + "adcx %%rbx, %%r10 ;" + "movq %%r10, 48(%0) ;" + "adcx %%rbx, %%r11 ;" + "movq %%r11, 56(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 32(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11"); +} + +static void red_eltfp25519_2w_bmi2(u64 *const c, const u64 *const a) +{ + asm volatile( + "movl $38, %%edx ; " /* 2*c = 38 = 2^256 */ + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ + "addq %%r10, %%r9 ;" + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ + "adcq %%r11, %%r10 ;" + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ + "adcq %%rax, %%r11 ;" + /***************************************/ + "adcq $0, %%rcx ;" + "addq (%1), %%r8 ;" + "adcq 8(%1), %%r9 ;" + "adcq 16(%1), %%r10 ;" + "adcq 24(%1), %%r11 ;" + "adcq $0, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + + "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */ + "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */ + "addq %%r10, %%r9 ;" + "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */ + "adcq %%r11, %%r10 ;" + "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */ + "adcq %%rax, %%r11 ;" + /****************************************/ + "adcq $0, %%rcx ;" + "addq 64(%1), %%r8 ;" + "adcq 72(%1), %%r9 ;" + "adcq 80(%1), %%r10 ;" + "adcq 88(%1), %%r11 ;" + "adcq $0, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 40(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 48(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 56(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 32(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11"); +} + +static void mul_256x256_integer_adx(u64 *const c, const u64 *const a, + const u64 *const b) +{ + asm volatile( + "movq (%1), %%rdx; " /* A[0] */ + "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */ + "xorl %%r10d, %%r10d ;" + "movq %%r8, (%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */ + "adox %%r9, %%r10 ;" + "movq %%r10, 8(%0) ;" + "mulx 16(%2), %%r15, %%r13; " /* A[0]*B[2] */ + "adox %%r11, %%r15 ;" + "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */ + "adox %%r13, %%r14 ;" + "movq $0, %%rax ;" + /******************************************/ + "adox %%rdx, %%rax ;" + + "movq 8(%1), %%rdx; " /* A[1] */ + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ + "xorl %%r10d, %%r10d ;" + "adcx 8(%0), %%r8 ;" + "movq %%r8, 8(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ + "adox %%r9, %%r10 ;" + "adcx %%r15, %%r10 ;" + "movq %%r10, 16(%0) ;" + "mulx 16(%2), %%r15, %%r13; " /* A[1]*B[2] */ + "adox %%r11, %%r15 ;" + "adcx %%r14, %%r15 ;" + "movq $0, %%r8 ;" + "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */ + "adox %%r13, %%r14 ;" + "adcx %%rax, %%r14 ;" + "movq $0, %%rax ;" + /******************************************/ + "adox %%rdx, %%rax ;" + "adcx %%r8, %%rax ;" + + "movq 16(%1), %%rdx; " /* A[2] */ + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ + "xorl %%r10d, %%r10d ;" + "adcx 16(%0), %%r8 ;" + "movq %%r8, 16(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ + "adox %%r9, %%r10 ;" + "adcx %%r15, %%r10 ;" + "movq %%r10, 24(%0) ;" + "mulx 16(%2), %%r15, %%r13; " /* A[2]*B[2] */ + "adox %%r11, %%r15 ;" + "adcx %%r14, %%r15 ;" + "movq $0, %%r8 ;" + "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */ + "adox %%r13, %%r14 ;" + "adcx %%rax, %%r14 ;" + "movq $0, %%rax ;" + /******************************************/ + "adox %%rdx, %%rax ;" + "adcx %%r8, %%rax ;" + + "movq 24(%1), %%rdx; " /* A[3] */ + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ + "xorl %%r10d, %%r10d ;" + "adcx 24(%0), %%r8 ;" + "movq %%r8, 24(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ + "adox %%r9, %%r10 ;" + "adcx %%r15, %%r10 ;" + "movq %%r10, 32(%0) ;" + "mulx 16(%2), %%r15, %%r13; " /* A[3]*B[2] */ + "adox %%r11, %%r15 ;" + "adcx %%r14, %%r15 ;" + "movq %%r15, 40(%0) ;" + "movq $0, %%r8 ;" + "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */ + "adox %%r13, %%r14 ;" + "adcx %%rax, %%r14 ;" + "movq %%r14, 48(%0) ;" + "movq $0, %%rax ;" + /******************************************/ + "adox %%rdx, %%rax ;" + "adcx %%r8, %%rax ;" + "movq %%rax, 56(%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", + "%r13", "%r14", "%r15"); +} + +static void mul_256x256_integer_bmi2(u64 *const c, const u64 *const a, + const u64 *const b) +{ + asm volatile( + "movq (%1), %%rdx; " /* A[0] */ + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ + "movq %%r8, (%0) ;" + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ + "addq %%r10, %%r15 ;" + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ + "adcq %%r8, %%rax ;" + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ + "adcq %%r10, %%rbx ;" + /******************************************/ + "adcq $0, %%rcx ;" + + "movq 8(%1), %%rdx; " /* A[1] */ + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ + "addq %%r15, %%r8 ;" + "movq %%r8, 8(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%r15 ;" + + "addq %%r9, %%rax ;" + "adcq %%r11, %%rbx ;" + "adcq %%r13, %%rcx ;" + "adcq $0, %%r15 ;" + + "movq 16(%1), %%rdx; " /* A[2] */ + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ + "addq %%rax, %%r8 ;" + "movq %%r8, 16(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rax ;" + + "addq %%r9, %%rbx ;" + "adcq %%r11, %%rcx ;" + "adcq %%r13, %%r15 ;" + "adcq $0, %%rax ;" + + "movq 24(%1), %%rdx; " /* A[3] */ + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ + "addq %%rbx, %%r8 ;" + "movq %%r8, 24(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rbx ;" + + "addq %%r9, %%rcx ;" + "movq %%rcx, 32(%0) ;" + "adcq %%r11, %%r15 ;" + "movq %%r15, 40(%0) ;" + "adcq %%r13, %%rax ;" + "movq %%rax, 48(%0) ;" + "adcq $0, %%rbx ;" + "movq %%rbx, 56(%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r15"); +} + +static void sqr_256x256_integer_adx(u64 *const c, const u64 *const a) +{ + asm volatile( + "movq (%1), %%rdx ;" /* A[0] */ + "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ + "xorl %%r15d, %%r15d;" + "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ + "adcx %%r14, %%r9 ;" + "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ + "adcx %%rax, %%r10 ;" + "movq 24(%1), %%rdx ;" /* A[3] */ + "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */ + "adcx %%rcx, %%r11 ;" + "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ + "adcx %%rax, %%rbx ;" + "movq 8(%1), %%rdx ;" /* A[1] */ + "adcx %%r15, %%r13 ;" + "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ + "movq $0, %%r14 ;" + /******************************************/ + "adcx %%r15, %%r14 ;" + + "xorl %%r15d, %%r15d;" + "adox %%rax, %%r10 ;" + "adcx %%r8, %%r8 ;" + "adox %%rcx, %%r11 ;" + "adcx %%r9, %%r9 ;" + "adox %%r15, %%rbx ;" + "adcx %%r10, %%r10 ;" + "adox %%r15, %%r13 ;" + "adcx %%r11, %%r11 ;" + "adox %%r15, %%r14 ;" + "adcx %%rbx, %%rbx ;" + "adcx %%r13, %%r13 ;" + "adcx %%r14, %%r14 ;" + + "movq (%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ + /*******************/ + "movq %%rax, 0(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 8(%0) ;" + "movq 8(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 16(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 24(%0) ;" + "movq 16(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 32(%0) ;" + "adcq %%rcx, %%rbx ;" + "movq %%rbx, 40(%0) ;" + "movq 24(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 48(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 56(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r14", "%r15"); +} + +static void sqr_256x256_integer_bmi2(u64 *const c, const u64 *const a) +{ + asm volatile( + "movq 8(%1), %%rdx ;" /* A[1] */ + "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */ + "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */ + "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */ + + "movq 16(%1), %%rdx ;" /* A[2] */ + "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */ + "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */ + + "addq %%rax, %%r9 ;" + "adcq %%rdx, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq %%r14, %%r15 ;" + "adcq $0, %%r13 ;" + "movq $0, %%r14 ;" + "adcq $0, %%r14 ;" + + "movq (%1), %%rdx ;" /* A[0] */ + "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */ + + "addq %%rax, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq $0, %%r15 ;" + "adcq $0, %%r13 ;" + "adcq $0, %%r14 ;" + + "shldq $1, %%r13, %%r14 ;" + "shldq $1, %%r15, %%r13 ;" + "shldq $1, %%r11, %%r15 ;" + "shldq $1, %%r10, %%r11 ;" + "shldq $1, %%r9, %%r10 ;" + "shldq $1, %%r8, %%r9 ;" + "shlq $1, %%r8 ;" + + /*******************/ + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ + /*******************/ + "movq %%rax, 0(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 8(%0) ;" + "movq 8(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 16(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 24(%0) ;" + "movq 16(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 32(%0) ;" + "adcq %%rcx, %%r15 ;" + "movq %%r15, 40(%0) ;" + "movq 24(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 48(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 56(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11", "%r13", "%r14", "%r15"); +} + +static void red_eltfp25519_1w_adx(u64 *const c, const u64 *const a) +{ + asm volatile( + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */ + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ + "xorl %%ebx, %%ebx ;" + "adox (%1), %%r8 ;" + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ + "adcx %%r10, %%r9 ;" + "adox 8(%1), %%r9 ;" + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ + "adcx %%r11, %%r10 ;" + "adox 16(%1), %%r10 ;" + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ + "adcx %%rax, %%r11 ;" + "adox 24(%1), %%r11 ;" + /***************************************/ + "adcx %%rbx, %%rcx ;" + "adox %%rbx, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ + "adcx %%rcx, %%r8 ;" + "adcx %%rbx, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcx %%rbx, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcx %%rbx, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11"); +} + +static void red_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a) +{ + asm volatile( + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */ + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ + "addq %%r10, %%r9 ;" + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ + "adcq %%r11, %%r10 ;" + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ + "adcq %%rax, %%r11 ;" + /***************************************/ + "adcq $0, %%rcx ;" + "addq (%1), %%r8 ;" + "adcq 8(%1), %%r9 ;" + "adcq 16(%1), %%r10 ;" + "adcq 24(%1), %%r11 ;" + "adcq $0, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11"); +} + +static __always_inline void +add_eltfp25519_1w_adx(u64 *const c, const u64 *const a, const u64 *const b) +{ + asm volatile( + "mov $38, %%eax ;" + "xorl %%ecx, %%ecx ;" + "movq (%2), %%r8 ;" + "adcx (%1), %%r8 ;" + "movq 8(%2), %%r9 ;" + "adcx 8(%1), %%r9 ;" + "movq 16(%2), %%r10 ;" + "adcx 16(%1), %%r10 ;" + "movq 24(%2), %%r11 ;" + "adcx 24(%1), %%r11 ;" + "cmovc %%eax, %%ecx ;" + "xorl %%eax, %%eax ;" + "adcx %%rcx, %%r8 ;" + "adcx %%rax, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcx %%rax, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcx %%rax, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $38, %%ecx ;" + "cmovc %%ecx, %%eax ;" + "addq %%rax, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); +} + +static __always_inline void +add_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a, const u64 *const b) +{ + asm volatile( + "mov $38, %%eax ;" + "movq (%2), %%r8 ;" + "addq (%1), %%r8 ;" + "movq 8(%2), %%r9 ;" + "adcq 8(%1), %%r9 ;" + "movq 16(%2), %%r10 ;" + "adcq 16(%1), %%r10 ;" + "movq 24(%2), %%r11 ;" + "adcq 24(%1), %%r11 ;" + "mov $0, %%ecx ;" + "cmovc %%eax, %%ecx ;" + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%eax, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); +} + +static __always_inline void +sub_eltfp25519_1w(u64 *const c, const u64 *const a, const u64 *const b) +{ + asm volatile( + "mov $38, %%eax ;" + "movq (%1), %%r8 ;" + "subq (%2), %%r8 ;" + "movq 8(%1), %%r9 ;" + "sbbq 8(%2), %%r9 ;" + "movq 16(%1), %%r10 ;" + "sbbq 16(%2), %%r10 ;" + "movq 24(%1), %%r11 ;" + "sbbq 24(%2), %%r11 ;" + "mov $0, %%ecx ;" + "cmovc %%eax, %%ecx ;" + "subq %%rcx, %%r8 ;" + "sbbq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "sbbq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "sbbq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%eax, %%ecx ;" + "subq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); +} + +/* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666 */ +static __always_inline void +mul_a24_eltfp25519_1w(u64 *const c, const u64 *const a) +{ + const u64 a24 = 121666; + asm volatile( + "movq %2, %%rdx ;" + "mulx (%1), %%r8, %%r10 ;" + "mulx 8(%1), %%r9, %%r11 ;" + "addq %%r10, %%r9 ;" + "mulx 16(%1), %%r10, %%rax ;" + "adcq %%r11, %%r10 ;" + "mulx 24(%1), %%r11, %%rcx ;" + "adcq %%rax, %%r11 ;" + /**************************/ + "adcq $0, %%rcx ;" + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/ + "imul %%rdx, %%rcx ;" + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a), "r"(a24) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11"); +} + +static void inv_eltfp25519_1w_adx(u64 *const c, const u64 *const a) +{ + struct { + eltfp25519_1w_buffer buffer; + eltfp25519_1w x0, x1, x2; + } __aligned(32) m; + u64 *T[4]; + + T[0] = m.x0; + T[1] = c; /* x^(-1) */ + T[2] = m.x1; + T[3] = m.x2; + + copy_eltfp25519_1w(T[1], a); + sqrn_eltfp25519_1w_adx(T[1], 1); + copy_eltfp25519_1w(T[2], T[1]); + sqrn_eltfp25519_1w_adx(T[2], 2); + mul_eltfp25519_1w_adx(T[0], a, T[2]); + mul_eltfp25519_1w_adx(T[1], T[1], T[0]); + copy_eltfp25519_1w(T[2], T[1]); + sqrn_eltfp25519_1w_adx(T[2], 1); + mul_eltfp25519_1w_adx(T[0], T[0], T[2]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_adx(T[2], 5); + mul_eltfp25519_1w_adx(T[0], T[0], T[2]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_adx(T[2], 10); + mul_eltfp25519_1w_adx(T[2], T[2], T[0]); + copy_eltfp25519_1w(T[3], T[2]); + sqrn_eltfp25519_1w_adx(T[3], 20); + mul_eltfp25519_1w_adx(T[3], T[3], T[2]); + sqrn_eltfp25519_1w_adx(T[3], 10); + mul_eltfp25519_1w_adx(T[3], T[3], T[0]); + copy_eltfp25519_1w(T[0], T[3]); + sqrn_eltfp25519_1w_adx(T[0], 50); + mul_eltfp25519_1w_adx(T[0], T[0], T[3]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_adx(T[2], 100); + mul_eltfp25519_1w_adx(T[2], T[2], T[0]); + sqrn_eltfp25519_1w_adx(T[2], 50); + mul_eltfp25519_1w_adx(T[2], T[2], T[3]); + sqrn_eltfp25519_1w_adx(T[2], 5); + mul_eltfp25519_1w_adx(T[1], T[1], T[2]); + + memzero_explicit(&m, sizeof(m)); +} + +static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a) +{ + struct { + eltfp25519_1w_buffer buffer; + eltfp25519_1w x0, x1, x2; + } __aligned(32) m; + u64 *T[5]; + + T[0] = m.x0; + T[1] = c; /* x^(-1) */ + T[2] = m.x1; + T[3] = m.x2; + + copy_eltfp25519_1w(T[1], a); + sqrn_eltfp25519_1w_bmi2(T[1], 1); + copy_eltfp25519_1w(T[2], T[1]); + sqrn_eltfp25519_1w_bmi2(T[2], 2); + mul_eltfp25519_1w_bmi2(T[0], a, T[2]); + mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]); + copy_eltfp25519_1w(T[2], T[1]); + sqrn_eltfp25519_1w_bmi2(T[2], 1); + mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_bmi2(T[2], 5); + mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_bmi2(T[2], 10); + mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]); + copy_eltfp25519_1w(T[3], T[2]); + sqrn_eltfp25519_1w_bmi2(T[3], 20); + mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]); + sqrn_eltfp25519_1w_bmi2(T[3], 10); + mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]); + copy_eltfp25519_1w(T[0], T[3]); + sqrn_eltfp25519_1w_bmi2(T[0], 50); + mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_bmi2(T[2], 100); + mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]); + sqrn_eltfp25519_1w_bmi2(T[2], 50); + mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]); + sqrn_eltfp25519_1w_bmi2(T[2], 5); + mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]); + + memzero_explicit(&m, sizeof(m)); +} + +/* Given c, a 256-bit number, fred_eltfp25519_1w updates c + * with a number such that 0 <= C < 2**255-19. + */ +static __always_inline void fred_eltfp25519_1w(u64 *const c) +{ + u64 tmp0 = 38, tmp1 = 19; + asm volatile( + "btrq $63, %3 ;" /* Put bit 255 in carry flag and clear */ + "cmovncl %k5, %k4 ;" /* c[255] ? 38 : 19 */ + + /* Add either 19 or 38 to c */ + "addq %4, %0 ;" + "adcq $0, %1 ;" + "adcq $0, %2 ;" + "adcq $0, %3 ;" + + /* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */ + "movl $0, %k4 ;" + "cmovnsl %k5, %k4 ;" /* c[255] ? 0 : 19 */ + "btrq $63, %3 ;" /* Clear bit 255 */ + + /* Subtract 19 if necessary */ + "subq %4, %0 ;" + "sbbq $0, %1 ;" + "sbbq $0, %2 ;" + "sbbq $0, %3 ;" + + : "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "+r"(tmp0), + "+r"(tmp1) + : + : "memory", "cc"); +} + +static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py) +{ + u64 temp; + asm volatile( + "test %9, %9 ;" + "movq %0, %8 ;" + "cmovnzq %4, %0 ;" + "cmovnzq %8, %4 ;" + "movq %1, %8 ;" + "cmovnzq %5, %1 ;" + "cmovnzq %8, %5 ;" + "movq %2, %8 ;" + "cmovnzq %6, %2 ;" + "cmovnzq %8, %6 ;" + "movq %3, %8 ;" + "cmovnzq %7, %3 ;" + "cmovnzq %8, %7 ;" + : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]), + "+r"(py[0]), "+r"(py[1]), "+r"(py[2]), "+r"(py[3]), + "=r"(temp) + : "r"(bit) + : "cc" + ); +} + +static __always_inline void cselect(u8 bit, u64 *const px, const u64 *const py) +{ + asm volatile( + "test %4, %4 ;" + "cmovnzq %5, %0 ;" + "cmovnzq %6, %1 ;" + "cmovnzq %7, %2 ;" + "cmovnzq %8, %3 ;" + : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]) + : "r"(bit), "rm"(py[0]), "rm"(py[1]), "rm"(py[2]), "rm"(py[3]) + : "cc" + ); +} + +static void curve25519_adx(u8 shared[CURVE25519_KEY_SIZE], + const u8 private_key[CURVE25519_KEY_SIZE], + const u8 session_key[CURVE25519_KEY_SIZE]) +{ + struct { + u64 buffer[4 * NUM_WORDS_ELTFP25519]; + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; + u64 workspace[6 * NUM_WORDS_ELTFP25519]; + u8 session[CURVE25519_KEY_SIZE]; + u8 private[CURVE25519_KEY_SIZE]; + } __aligned(32) m; + + int i = 0, j = 0; + u64 prev = 0; + u64 *const X1 = (u64 *)m.session; + u64 *const key = (u64 *)m.private; + u64 *const Px = m.coordinates + 0; + u64 *const Pz = m.coordinates + 4; + u64 *const Qx = m.coordinates + 8; + u64 *const Qz = m.coordinates + 12; + u64 *const X2 = Qx; + u64 *const Z2 = Qz; + u64 *const X3 = Px; + u64 *const Z3 = Pz; + u64 *const X2Z2 = Qx; + u64 *const X3Z3 = Px; + + u64 *const A = m.workspace + 0; + u64 *const B = m.workspace + 4; + u64 *const D = m.workspace + 8; + u64 *const C = m.workspace + 12; + u64 *const DA = m.workspace + 16; + u64 *const CB = m.workspace + 20; + u64 *const AB = A; + u64 *const DC = D; + u64 *const DACB = DA; + + memcpy(m.private, private_key, sizeof(m.private)); + memcpy(m.session, session_key, sizeof(m.session)); + + curve25519_clamp_secret(m.private); + + /* As in the draft: + * When receiving such an array, implementations of curve25519 + * MUST mask the most-significant bit in the final byte. This + * is done to preserve compatibility with point formats which + * reserve the sign bit for use in other protocols and to + * increase resistance to implementation fingerprinting + */ + m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1; + + copy_eltfp25519_1w(Px, X1); + setzero_eltfp25519_1w(Pz); + setzero_eltfp25519_1w(Qx); + setzero_eltfp25519_1w(Qz); + + Pz[0] = 1; + Qx[0] = 1; + + /* main-loop */ + prev = 0; + j = 62; + for (i = 3; i >= 0; --i) { + while (j >= 0) { + u64 bit = (key[i] >> j) & 0x1; + u64 swap = bit ^ prev; + prev = bit; + + add_eltfp25519_1w_adx(A, X2, Z2); /* A = (X2+Z2) */ + sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */ + add_eltfp25519_1w_adx(C, X3, Z3); /* C = (X3+Z3) */ + sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */ + mul_eltfp25519_2w_adx(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */ + + cselect(swap, A, C); + cselect(swap, B, D); + + sqr_eltfp25519_2w_adx(AB); /* [AA|BB] = [A^2|B^2] */ + add_eltfp25519_1w_adx(X3, DA, CB); /* X3 = (DA+CB) */ + sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */ + sqr_eltfp25519_2w_adx(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */ + + copy_eltfp25519_1w(X2, B); /* X2 = B^2 */ + sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */ + + mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */ + add_eltfp25519_1w_adx(B, B, X2); /* B = a24*E+B */ + mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */ + mul_eltfp25519_1w_adx(Z3, Z3, X1); /* Z3 = Z3*X1 */ + --j; + } + j = 63; + } + + inv_eltfp25519_1w_adx(A, Qz); + mul_eltfp25519_1w_adx((u64 *)shared, Qx, A); + fred_eltfp25519_1w((u64 *)shared); + + memzero_explicit(&m, sizeof(m)); +} + +static void curve25519_adx_base(u8 session_key[CURVE25519_KEY_SIZE], + const u8 private_key[CURVE25519_KEY_SIZE]) +{ + struct { + u64 buffer[4 * NUM_WORDS_ELTFP25519]; + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; + u64 workspace[4 * NUM_WORDS_ELTFP25519]; + u8 private[CURVE25519_KEY_SIZE]; + } __aligned(32) m; + + const int ite[4] = { 64, 64, 64, 63 }; + const int q = 3; + u64 swap = 1; + + int i = 0, j = 0, k = 0; + u64 *const key = (u64 *)m.private; + u64 *const Ur1 = m.coordinates + 0; + u64 *const Zr1 = m.coordinates + 4; + u64 *const Ur2 = m.coordinates + 8; + u64 *const Zr2 = m.coordinates + 12; + + u64 *const UZr1 = m.coordinates + 0; + u64 *const ZUr2 = m.coordinates + 8; + + u64 *const A = m.workspace + 0; + u64 *const B = m.workspace + 4; + u64 *const C = m.workspace + 8; + u64 *const D = m.workspace + 12; + + u64 *const AB = m.workspace + 0; + u64 *const CD = m.workspace + 8; + + const u64 *const P = table_ladder_8k; + + memcpy(m.private, private_key, sizeof(m.private)); + + curve25519_clamp_secret(m.private); + + setzero_eltfp25519_1w(Ur1); + setzero_eltfp25519_1w(Zr1); + setzero_eltfp25519_1w(Zr2); + Ur1[0] = 1; + Zr1[0] = 1; + Zr2[0] = 1; + + /* G-S */ + Ur2[3] = 0x1eaecdeee27cab34UL; + Ur2[2] = 0xadc7a0b9235d48e2UL; + Ur2[1] = 0xbbf095ae14b2edf8UL; + Ur2[0] = 0x7e94e1fec82faabdUL; + + /* main-loop */ + j = q; + for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) { + while (j < ite[i]) { + u64 bit = (key[i] >> j) & 0x1; + k = (64 * i + j - q); + swap = swap ^ bit; + cswap(swap, Ur1, Ur2); + cswap(swap, Zr1, Zr2); + swap = bit; + /* Addition */ + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ + add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */ + mul_eltfp25519_1w_adx(C, &P[4 * k], B); /* C = M0-B */ + sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */ + add_eltfp25519_1w_adx(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */ + sqr_eltfp25519_2w_adx(AB); /* A = A^2 | B = B^2 */ + mul_eltfp25519_2w_adx(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */ + ++j; + } + j = 0; + } + + /* Doubling */ + for (i = 0; i < q; ++i) { + add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */ + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ + sqr_eltfp25519_2w_adx(AB); /* A = A**2 B = B**2 */ + copy_eltfp25519_1w(C, B); /* C = B */ + sub_eltfp25519_1w(B, A, B); /* B = A-B */ + mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */ + add_eltfp25519_1w_adx(D, D, C); /* D = D+C */ + mul_eltfp25519_2w_adx(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */ + } + + /* Convert to affine coordinates */ + inv_eltfp25519_1w_adx(A, Zr1); + mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A); + fred_eltfp25519_1w((u64 *)session_key); + + memzero_explicit(&m, sizeof(m)); +} + +static void curve25519_bmi2(u8 shared[CURVE25519_KEY_SIZE], + const u8 private_key[CURVE25519_KEY_SIZE], + const u8 session_key[CURVE25519_KEY_SIZE]) +{ + struct { + u64 buffer[4 * NUM_WORDS_ELTFP25519]; + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; + u64 workspace[6 * NUM_WORDS_ELTFP25519]; + u8 session[CURVE25519_KEY_SIZE]; + u8 private[CURVE25519_KEY_SIZE]; + } __aligned(32) m; + + int i = 0, j = 0; + u64 prev = 0; + u64 *const X1 = (u64 *)m.session; + u64 *const key = (u64 *)m.private; + u64 *const Px = m.coordinates + 0; + u64 *const Pz = m.coordinates + 4; + u64 *const Qx = m.coordinates + 8; + u64 *const Qz = m.coordinates + 12; + u64 *const X2 = Qx; + u64 *const Z2 = Qz; + u64 *const X3 = Px; + u64 *const Z3 = Pz; + u64 *const X2Z2 = Qx; + u64 *const X3Z3 = Px; + + u64 *const A = m.workspace + 0; + u64 *const B = m.workspace + 4; + u64 *const D = m.workspace + 8; + u64 *const C = m.workspace + 12; + u64 *const DA = m.workspace + 16; + u64 *const CB = m.workspace + 20; + u64 *const AB = A; + u64 *const DC = D; + u64 *const DACB = DA; + + memcpy(m.private, private_key, sizeof(m.private)); + memcpy(m.session, session_key, sizeof(m.session)); + + curve25519_clamp_secret(m.private); + + /* As in the draft: + * When receiving such an array, implementations of curve25519 + * MUST mask the most-significant bit in the final byte. This + * is done to preserve compatibility with point formats which + * reserve the sign bit for use in other protocols and to + * increase resistance to implementation fingerprinting + */ + m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1; + + copy_eltfp25519_1w(Px, X1); + setzero_eltfp25519_1w(Pz); + setzero_eltfp25519_1w(Qx); + setzero_eltfp25519_1w(Qz); + + Pz[0] = 1; + Qx[0] = 1; + + /* main-loop */ + prev = 0; + j = 62; + for (i = 3; i >= 0; --i) { + while (j >= 0) { + u64 bit = (key[i] >> j) & 0x1; + u64 swap = bit ^ prev; + prev = bit; + + add_eltfp25519_1w_bmi2(A, X2, Z2); /* A = (X2+Z2) */ + sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */ + add_eltfp25519_1w_bmi2(C, X3, Z3); /* C = (X3+Z3) */ + sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */ + mul_eltfp25519_2w_bmi2(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */ + + cselect(swap, A, C); + cselect(swap, B, D); + + sqr_eltfp25519_2w_bmi2(AB); /* [AA|BB] = [A^2|B^2] */ + add_eltfp25519_1w_bmi2(X3, DA, CB); /* X3 = (DA+CB) */ + sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */ + sqr_eltfp25519_2w_bmi2(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */ + + copy_eltfp25519_1w(X2, B); /* X2 = B^2 */ + sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */ + + mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */ + add_eltfp25519_1w_bmi2(B, B, X2); /* B = a24*E+B */ + mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */ + mul_eltfp25519_1w_bmi2(Z3, Z3, X1); /* Z3 = Z3*X1 */ + --j; + } + j = 63; + } + + inv_eltfp25519_1w_bmi2(A, Qz); + mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A); + fred_eltfp25519_1w((u64 *)shared); + + memzero_explicit(&m, sizeof(m)); +} + +static void curve25519_bmi2_base(u8 session_key[CURVE25519_KEY_SIZE], + const u8 private_key[CURVE25519_KEY_SIZE]) +{ + struct { + u64 buffer[4 * NUM_WORDS_ELTFP25519]; + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; + u64 workspace[4 * NUM_WORDS_ELTFP25519]; + u8 private[CURVE25519_KEY_SIZE]; + } __aligned(32) m; + + const int ite[4] = { 64, 64, 64, 63 }; + const int q = 3; + u64 swap = 1; + + int i = 0, j = 0, k = 0; + u64 *const key = (u64 *)m.private; + u64 *const Ur1 = m.coordinates + 0; + u64 *const Zr1 = m.coordinates + 4; + u64 *const Ur2 = m.coordinates + 8; + u64 *const Zr2 = m.coordinates + 12; + + u64 *const UZr1 = m.coordinates + 0; + u64 *const ZUr2 = m.coordinates + 8; + + u64 *const A = m.workspace + 0; + u64 *const B = m.workspace + 4; + u64 *const C = m.workspace + 8; + u64 *const D = m.workspace + 12; + + u64 *const AB = m.workspace + 0; + u64 *const CD = m.workspace + 8; + + const u64 *const P = table_ladder_8k; + + memcpy(m.private, private_key, sizeof(m.private)); + + curve25519_clamp_secret(m.private); + + setzero_eltfp25519_1w(Ur1); + setzero_eltfp25519_1w(Zr1); + setzero_eltfp25519_1w(Zr2); + Ur1[0] = 1; + Zr1[0] = 1; + Zr2[0] = 1; + + /* G-S */ + Ur2[3] = 0x1eaecdeee27cab34UL; + Ur2[2] = 0xadc7a0b9235d48e2UL; + Ur2[1] = 0xbbf095ae14b2edf8UL; + Ur2[0] = 0x7e94e1fec82faabdUL; + + /* main-loop */ + j = q; + for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) { + while (j < ite[i]) { + u64 bit = (key[i] >> j) & 0x1; + k = (64 * i + j - q); + swap = swap ^ bit; + cswap(swap, Ur1, Ur2); + cswap(swap, Zr1, Zr2); + swap = bit; + /* Addition */ + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ + add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */ + mul_eltfp25519_1w_bmi2(C, &P[4 * k], B);/* C = M0-B */ + sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */ + add_eltfp25519_1w_bmi2(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */ + sqr_eltfp25519_2w_bmi2(AB); /* A = A^2 | B = B^2 */ + mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */ + ++j; + } + j = 0; + } + + /* Doubling */ + for (i = 0; i < q; ++i) { + add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */ + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ + sqr_eltfp25519_2w_bmi2(AB); /* A = A**2 B = B**2 */ + copy_eltfp25519_1w(C, B); /* C = B */ + sub_eltfp25519_1w(B, A, B); /* B = A-B */ + mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */ + add_eltfp25519_1w_bmi2(D, D, C); /* D = D+C */ + mul_eltfp25519_2w_bmi2(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */ + } + + /* Convert to affine coordinates */ + inv_eltfp25519_1w_bmi2(A, Zr1); + mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A); + fred_eltfp25519_1w((u64 *)session_key); + + memzero_explicit(&m, sizeof(m)); +} + +void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) +{ + if (static_branch_likely(&curve25519_use_adx)) + curve25519_adx(mypublic, secret, basepoint); + else if (static_branch_likely(&curve25519_use_bmi2)) + curve25519_bmi2(mypublic, secret, basepoint); + else + curve25519_generic(mypublic, secret, basepoint); +} +EXPORT_SYMBOL(curve25519_arch); + +void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) +{ + if (static_branch_likely(&curve25519_use_adx)) + curve25519_adx_base(pub, secret); + else if (static_branch_likely(&curve25519_use_bmi2)) + curve25519_bmi2_base(pub, secret); + else + curve25519_generic(pub, secret, curve25519_base_point); +} +EXPORT_SYMBOL(curve25519_base_arch); + +static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, + unsigned int len) +{ + u8 *secret = kpp_tfm_ctx(tfm); + + if (!len) + curve25519_generate_secret(secret); + else if (len == CURVE25519_KEY_SIZE && + crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) + memcpy(secret, buf, CURVE25519_KEY_SIZE); + else + return -EINVAL; + return 0; +} + +static int curve25519_generate_public_key(struct kpp_request *req) +{ + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); + const u8 *secret = kpp_tfm_ctx(tfm); + u8 buf[CURVE25519_KEY_SIZE]; + int copied, nbytes; + + if (req->src) + return -EINVAL; + + curve25519_base_arch(buf, secret); + + /* might want less than we've got */ + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, + nbytes), + buf, nbytes); + if (copied != nbytes) + return -EINVAL; + return 0; +} + +static int curve25519_compute_shared_secret(struct kpp_request *req) +{ + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); + const u8 *secret = kpp_tfm_ctx(tfm); + u8 public_key[CURVE25519_KEY_SIZE]; + u8 buf[CURVE25519_KEY_SIZE]; + int copied, nbytes; + + if (!req->src) + return -EINVAL; + + copied = sg_copy_to_buffer(req->src, + sg_nents_for_len(req->src, + CURVE25519_KEY_SIZE), + public_key, CURVE25519_KEY_SIZE); + if (copied != CURVE25519_KEY_SIZE) + return -EINVAL; + + curve25519_arch(buf, secret, public_key); + + /* might want less than we've got */ + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, + nbytes), + buf, nbytes); + if (copied != nbytes) + return -EINVAL; + return 0; +} + +static unsigned int curve25519_max_size(struct crypto_kpp *tfm) +{ + return CURVE25519_KEY_SIZE; +} + +static struct kpp_alg curve25519_alg = { + .base.cra_name = "curve25519", + .base.cra_driver_name = "curve25519-x86", + .base.cra_priority = 200, + .base.cra_module = THIS_MODULE, + .base.cra_ctxsize = CURVE25519_KEY_SIZE, + + .set_secret = curve25519_set_secret, + .generate_public_key = curve25519_generate_public_key, + .compute_shared_secret = curve25519_compute_shared_secret, + .max_size = curve25519_max_size, +}; + +static int __init curve25519_mod_init(void) +{ + if (boot_cpu_has(X86_FEATURE_BMI2)) + static_branch_enable(&curve25519_use_bmi2); + else if (boot_cpu_has(X86_FEATURE_ADX)) + static_branch_enable(&curve25519_use_adx); + else + return 0; + return IS_REACHABLE(CONFIG_CRYPTO_KPP) ? + crypto_register_kpp(&curve25519_alg) : 0; +} + +static void __exit curve25519_mod_exit(void) +{ + if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && + (boot_cpu_has(X86_FEATURE_BMI2) || boot_cpu_has(X86_FEATURE_ADX))) + crypto_unregister_kpp(&curve25519_alg); +} + +module_init(curve25519_mod_init); +module_exit(curve25519_mod_exit); + +MODULE_ALIAS_CRYPTO("curve25519"); +MODULE_ALIAS_CRYPTO("curve25519-x86"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S index 7fca43099a5f..fac0fdc3f25d 100644 --- a/arch/x86/crypto/des3_ede-asm_64.S +++ b/arch/x86/crypto/des3_ede-asm_64.S @@ -162,7 +162,7 @@ movl left##d, (io); \ movl right##d, 4(io); -ENTRY(des3_ede_x86_64_crypt_blk) +SYM_FUNC_START(des3_ede_x86_64_crypt_blk) /* input: * %rdi: round keys, CTX * %rsi: dst @@ -244,7 +244,7 @@ ENTRY(des3_ede_x86_64_crypt_blk) popq %rbx; ret; -ENDPROC(des3_ede_x86_64_crypt_blk) +SYM_FUNC_END(des3_ede_x86_64_crypt_blk) /*********************************************************************** * 3-way 3DES @@ -418,7 +418,7 @@ ENDPROC(des3_ede_x86_64_crypt_blk) #define __movq(src, dst) \ movq src, dst; -ENTRY(des3_ede_x86_64_crypt_blk_3way) +SYM_FUNC_START(des3_ede_x86_64_crypt_blk_3way) /* input: * %rdi: ctx, round keys * %rsi: dst (3 blocks) @@ -529,7 +529,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way) popq %rbx; ret; -ENDPROC(des3_ede_x86_64_crypt_blk_3way) +SYM_FUNC_END(des3_ede_x86_64_crypt_blk_3way) .section .rodata, "a", @progbits .align 16 diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index 968386c21ef4..89830e531350 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -19,8 +19,8 @@ #include <linux/types.h> struct des3_ede_x86_ctx { - u32 enc_expkey[DES3_EDE_EXPKEY_WORDS]; - u32 dec_expkey[DES3_EDE_EXPKEY_WORDS]; + struct des3_ede_ctx enc; + struct des3_ede_ctx dec; }; /* regular block cipher functions */ @@ -34,7 +34,7 @@ asmlinkage void des3_ede_x86_64_crypt_blk_3way(const u32 *expkey, u8 *dst, static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst, const u8 *src) { - u32 *enc_ctx = ctx->enc_expkey; + u32 *enc_ctx = ctx->enc.expkey; des3_ede_x86_64_crypt_blk(enc_ctx, dst, src); } @@ -42,7 +42,7 @@ static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst, static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst, const u8 *src) { - u32 *dec_ctx = ctx->dec_expkey; + u32 *dec_ctx = ctx->dec.expkey; des3_ede_x86_64_crypt_blk(dec_ctx, dst, src); } @@ -50,7 +50,7 @@ static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst, static inline void des3_ede_enc_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst, const u8 *src) { - u32 *enc_ctx = ctx->enc_expkey; + u32 *enc_ctx = ctx->enc.expkey; des3_ede_x86_64_crypt_blk_3way(enc_ctx, dst, src); } @@ -58,7 +58,7 @@ static inline void des3_ede_enc_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst, static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst, const u8 *src) { - u32 *dec_ctx = ctx->dec_expkey; + u32 *dec_ctx = ctx->dec.expkey; des3_ede_x86_64_crypt_blk_3way(dec_ctx, dst, src); } @@ -122,7 +122,7 @@ static int ecb_encrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); - return ecb_crypt(req, ctx->enc_expkey); + return ecb_crypt(req, ctx->enc.expkey); } static int ecb_decrypt(struct skcipher_request *req) @@ -130,7 +130,7 @@ static int ecb_decrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); - return ecb_crypt(req, ctx->dec_expkey); + return ecb_crypt(req, ctx->dec.expkey); } static unsigned int __cbc_encrypt(struct des3_ede_x86_ctx *ctx, @@ -348,20 +348,28 @@ static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key, u32 i, j, tmp; int err; - /* Generate encryption context using generic implementation. */ - err = __des3_ede_setkey(ctx->enc_expkey, &tfm->crt_flags, key, keylen); - if (err < 0) + err = des3_ede_expand_key(&ctx->enc, key, keylen); + if (err == -ENOKEY) { + if (crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS) + err = -EINVAL; + else + err = 0; + } + + if (err) { + memset(ctx, 0, sizeof(*ctx)); return err; + } /* Fix encryption context for this implementation and form decryption * context. */ j = DES3_EDE_EXPKEY_WORDS - 2; for (i = 0; i < DES3_EDE_EXPKEY_WORDS; i += 2, j -= 2) { - tmp = ror32(ctx->enc_expkey[i + 1], 4); - ctx->enc_expkey[i + 1] = tmp; + tmp = ror32(ctx->enc.expkey[i + 1], 4); + ctx->enc.expkey[i + 1] = tmp; - ctx->dec_expkey[j + 0] = ctx->enc_expkey[i + 0]; - ctx->dec_expkey[j + 1] = tmp; + ctx->dec.expkey[j + 0] = ctx->enc.expkey[i + 0]; + ctx->dec.expkey[j + 1] = tmp; } return 0; diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 5d53effe8abe..bb9735fbb865 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -44,7 +44,7 @@ * T2 * T3 */ -__clmul_gf128mul_ble: +SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble) movaps DATA, T1 pshufd $0b01001110, DATA, T2 pshufd $0b01001110, SHASH, T3 @@ -87,10 +87,10 @@ __clmul_gf128mul_ble: pxor T2, T1 pxor T1, DATA ret -ENDPROC(__clmul_gf128mul_ble) +SYM_FUNC_END(__clmul_gf128mul_ble) /* void clmul_ghash_mul(char *dst, const u128 *shash) */ -ENTRY(clmul_ghash_mul) +SYM_FUNC_START(clmul_ghash_mul) FRAME_BEGIN movups (%rdi), DATA movups (%rsi), SHASH @@ -101,13 +101,13 @@ ENTRY(clmul_ghash_mul) movups DATA, (%rdi) FRAME_END ret -ENDPROC(clmul_ghash_mul) +SYM_FUNC_END(clmul_ghash_mul) /* * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, * const u128 *shash); */ -ENTRY(clmul_ghash_update) +SYM_FUNC_START(clmul_ghash_update) FRAME_BEGIN cmp $16, %rdx jb .Lupdate_just_ret # check length @@ -130,4 +130,4 @@ ENTRY(clmul_ghash_update) .Lupdate_just_ret: FRAME_END ret -ENDPROC(clmul_ghash_update) +SYM_FUNC_END(clmul_ghash_update) diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index ac76fe88ac4f..a4b728518e28 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -57,10 +57,8 @@ static int ghash_setkey(struct crypto_shash *tfm, be128 *x = (be128 *)key; u64 a, b; - if (keylen != GHASH_BLOCK_SIZE) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + if (keylen != GHASH_BLOCK_SIZE) return -EINVAL; - } /* perform multiplication by 'x' in GF(2^128) */ a = be64_to_cpu(x->a); @@ -257,16 +255,11 @@ static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key, { struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct crypto_ahash *child = &ctx->cryptd_tfm->base; - int err; crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - err = crypto_ahash_setkey(child, key, keylen); - crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child) - & CRYPTO_TFM_RES_MASK); - - return err; + return crypto_ahash_setkey(child, key, keylen); } static int ghash_async_init_tfm(struct crypto_tfm *tfm) @@ -357,6 +350,5 @@ module_init(ghash_pclmulqdqni_mod_init); module_exit(ghash_pclmulqdqni_mod_exit); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("GHASH Message Digest Algorithm, " - "accelerated by PCLMULQDQ-NI"); +MODULE_DESCRIPTION("GHASH hash function, accelerated by PCLMULQDQ-NI"); MODULE_ALIAS_CRYPTO("ghash"); diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 901551445387..d3d91a0abf88 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -14,6 +14,7 @@ #include <crypto/b128ops.h> #include <crypto/gf128mul.h> #include <crypto/internal/skcipher.h> +#include <crypto/scatterwalk.h> #include <crypto/xts.h> #include <asm/crypto/glue_helper.h> @@ -133,7 +134,8 @@ int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, src -= num_blocks - 1; dst -= num_blocks - 1; - gctx->funcs[i].fn_u.cbc(ctx, dst, src); + gctx->funcs[i].fn_u.cbc(ctx, (u8 *)dst, + (const u8 *)src); nbytes -= func_bytes; if (nbytes < bsize) @@ -187,7 +189,9 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, /* Process multi-block batch */ do { - gctx->funcs[i].fn_u.ctr(ctx, dst, src, &ctrblk); + gctx->funcs[i].fn_u.ctr(ctx, (u8 *)dst, + (const u8 *)src, + &ctrblk); src += num_blocks; dst += num_blocks; nbytes -= func_bytes; @@ -209,7 +213,8 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, be128_to_le128(&ctrblk, (be128 *)walk.iv); memcpy(&tmp, walk.src.virt.addr, nbytes); - gctx->funcs[gctx->num_funcs - 1].fn_u.ctr(ctx, &tmp, &tmp, + gctx->funcs[gctx->num_funcs - 1].fn_u.ctr(ctx, (u8 *)&tmp, + (const u8 *)&tmp, &ctrblk); memcpy(walk.dst.virt.addr, &tmp, nbytes); le128_to_be128((be128 *)walk.iv, &ctrblk); @@ -239,7 +244,8 @@ static unsigned int __glue_xts_req_128bit(const struct common_glue_ctx *gctx, if (nbytes >= func_bytes) { do { - gctx->funcs[i].fn_u.xts(ctx, dst, src, + gctx->funcs[i].fn_u.xts(ctx, (u8 *)dst, + (const u8 *)src, walk->iv); src += num_blocks; @@ -259,17 +265,36 @@ done: int glue_xts_req_128bit(const struct common_glue_ctx *gctx, struct skcipher_request *req, common_glue_func_t tweak_fn, void *tweak_ctx, - void *crypt_ctx) + void *crypt_ctx, bool decrypt) { + const bool cts = (req->cryptlen % XTS_BLOCK_SIZE); const unsigned int bsize = 128 / 8; + struct skcipher_request subreq; struct skcipher_walk walk; bool fpu_enabled = false; - unsigned int nbytes; + unsigned int nbytes, tail; int err; + if (req->cryptlen < XTS_BLOCK_SIZE) + return -EINVAL; + + if (unlikely(cts)) { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + + tail = req->cryptlen % XTS_BLOCK_SIZE + XTS_BLOCK_SIZE; + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, + crypto_skcipher_get_flags(tfm), + NULL, NULL); + skcipher_request_set_crypt(&subreq, req->src, req->dst, + req->cryptlen - tail, req->iv); + req = &subreq; + } + err = skcipher_walk_virt(&walk, req, false); nbytes = walk.nbytes; - if (!nbytes) + if (err) return err; /* set minimum length to bsize, for tweak_fn */ @@ -287,14 +312,55 @@ int glue_xts_req_128bit(const struct common_glue_ctx *gctx, nbytes = walk.nbytes; } + if (unlikely(cts)) { + u8 *next_tweak, *final_tweak = req->iv; + struct scatterlist *src, *dst; + struct scatterlist s[2], d[2]; + le128 b[2]; + + dst = src = scatterwalk_ffwd(s, req->src, req->cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(d, req->dst, req->cryptlen); + + if (decrypt) { + next_tweak = memcpy(b, req->iv, XTS_BLOCK_SIZE); + gf128mul_x_ble(b, b); + } else { + next_tweak = req->iv; + } + + skcipher_request_set_crypt(&subreq, src, dst, XTS_BLOCK_SIZE, + next_tweak); + + err = skcipher_walk_virt(&walk, req, false) ?: + skcipher_walk_done(&walk, + __glue_xts_req_128bit(gctx, crypt_ctx, &walk)); + if (err) + goto out; + + scatterwalk_map_and_copy(b, dst, 0, XTS_BLOCK_SIZE, 0); + memcpy(b + 1, b, tail - XTS_BLOCK_SIZE); + scatterwalk_map_and_copy(b, src, XTS_BLOCK_SIZE, + tail - XTS_BLOCK_SIZE, 0); + scatterwalk_map_and_copy(b, dst, 0, tail, 1); + + skcipher_request_set_crypt(&subreq, dst, dst, XTS_BLOCK_SIZE, + final_tweak); + + err = skcipher_walk_virt(&walk, req, false) ?: + skcipher_walk_done(&walk, + __glue_xts_req_128bit(gctx, crypt_ctx, &walk)); + } + +out: glue_fpu_end(fpu_enabled); return err; } EXPORT_SYMBOL_GPL(glue_xts_req_128bit); -void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, - common_glue_func_t fn) +void glue_xts_crypt_128bit_one(const void *ctx, u8 *dst, const u8 *src, + le128 *iv, common_glue_func_t fn) { le128 ivblk = *iv; @@ -302,13 +368,13 @@ void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, gf128mul_x_ble(iv, &ivblk); /* CC <- T xor C */ - u128_xor(dst, src, (u128 *)&ivblk); + u128_xor((u128 *)dst, (const u128 *)src, (u128 *)&ivblk); /* PP <- D(Key2,CC) */ - fn(ctx, (u8 *)dst, (u8 *)dst); + fn(ctx, dst, dst); /* P <- T xor PP */ - u128_xor(dst, dst, (u128 *)&ivblk); + u128_xor((u128 *)dst, (u128 *)dst, (u128 *)&ivblk); } EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one); diff --git a/arch/x86/crypto/morus1280-avx2-asm.S b/arch/x86/crypto/morus1280-avx2-asm.S deleted file mode 100644 index 5413fee33481..000000000000 --- a/arch/x86/crypto/morus1280-avx2-asm.S +++ /dev/null @@ -1,619 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * AVX2 implementation of MORUS-1280 - * - * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <linux/linkage.h> -#include <asm/frame.h> - -#define SHUFFLE_MASK(i0, i1, i2, i3) \ - (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) - -#define MASK1 SHUFFLE_MASK(3, 0, 1, 2) -#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) -#define MASK3 SHUFFLE_MASK(1, 2, 3, 0) - -#define STATE0 %ymm0 -#define STATE0_LOW %xmm0 -#define STATE1 %ymm1 -#define STATE2 %ymm2 -#define STATE3 %ymm3 -#define STATE4 %ymm4 -#define KEY %ymm5 -#define MSG %ymm5 -#define MSG_LOW %xmm5 -#define T0 %ymm6 -#define T0_LOW %xmm6 -#define T1 %ymm7 - -.section .rodata.cst32.morus1280_const, "aM", @progbits, 32 -.align 32 -.Lmorus1280_const: - .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d - .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 - .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 - .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd - -.section .rodata.cst32.morus1280_counter, "aM", @progbits, 32 -.align 32 -.Lmorus1280_counter: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f - .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 - .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f - -.text - -.macro morus1280_round s0, s1, s2, s3, s4, b, w - vpand \s1, \s2, T0 - vpxor T0, \s0, \s0 - vpxor \s3, \s0, \s0 - vpsllq $\b, \s0, T0 - vpsrlq $(64 - \b), \s0, \s0 - vpxor T0, \s0, \s0 - vpermq $\w, \s3, \s3 -.endm - -/* - * __morus1280_update: internal ABI - * input: - * STATE[0-4] - input state - * MSG - message block - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus1280_update: - morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1 - vpxor MSG, STATE1, STATE1 - morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2 - vpxor MSG, STATE2, STATE2 - morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3 - vpxor MSG, STATE3, STATE3 - morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2 - vpxor MSG, STATE4, STATE4 - morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1 - ret -ENDPROC(__morus1280_update) - -/* - * __morus1280_update_zero: internal ABI - * input: - * STATE[0-4] - input state - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus1280_update_zero: - morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1 - morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2 - morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3 - morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2 - morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1 - ret -ENDPROC(__morus1280_update_zero) - -/* - * __load_partial: internal ABI - * input: - * %rsi - src - * %rcx - bytes - * output: - * MSG - message block - * changed: - * %r8 - * %r9 - */ -__load_partial: - xor %r9d, %r9d - vpxor MSG, MSG, MSG - - mov %rcx, %r8 - and $0x1, %r8 - jz .Lld_partial_1 - - mov %rcx, %r8 - and $0x1E, %r8 - add %rsi, %r8 - mov (%r8), %r9b - -.Lld_partial_1: - mov %rcx, %r8 - and $0x2, %r8 - jz .Lld_partial_2 - - mov %rcx, %r8 - and $0x1C, %r8 - add %rsi, %r8 - shl $16, %r9 - mov (%r8), %r9w - -.Lld_partial_2: - mov %rcx, %r8 - and $0x4, %r8 - jz .Lld_partial_4 - - mov %rcx, %r8 - and $0x18, %r8 - add %rsi, %r8 - shl $32, %r9 - mov (%r8), %r8d - xor %r8, %r9 - -.Lld_partial_4: - movq %r9, MSG_LOW - - mov %rcx, %r8 - and $0x8, %r8 - jz .Lld_partial_8 - - mov %rcx, %r8 - and $0x10, %r8 - add %rsi, %r8 - pshufd $MASK2, MSG_LOW, MSG_LOW - pinsrq $0, (%r8), MSG_LOW - -.Lld_partial_8: - mov %rcx, %r8 - and $0x10, %r8 - jz .Lld_partial_16 - - vpermq $MASK2, MSG, MSG - movdqu (%rsi), MSG_LOW - -.Lld_partial_16: - ret -ENDPROC(__load_partial) - -/* - * __store_partial: internal ABI - * input: - * %rdx - dst - * %rcx - bytes - * output: - * T0 - message block - * changed: - * %r8 - * %r9 - * %r10 - */ -__store_partial: - mov %rcx, %r8 - mov %rdx, %r9 - - cmp $16, %r8 - jl .Lst_partial_16 - - movdqu T0_LOW, (%r9) - vpermq $MASK2, T0, T0 - - sub $16, %r8 - add $16, %r9 - -.Lst_partial_16: - movq T0_LOW, %r10 - - cmp $8, %r8 - jl .Lst_partial_8 - - mov %r10, (%r9) - pextrq $1, T0_LOW, %r10 - - sub $8, %r8 - add $8, %r9 - -.Lst_partial_8: - cmp $4, %r8 - jl .Lst_partial_4 - - mov %r10d, (%r9) - shr $32, %r10 - - sub $4, %r8 - add $4, %r9 - -.Lst_partial_4: - cmp $2, %r8 - jl .Lst_partial_2 - - mov %r10w, (%r9) - shr $16, %r10 - - sub $2, %r8 - add $2, %r9 - -.Lst_partial_2: - cmp $1, %r8 - jl .Lst_partial_1 - - mov %r10b, (%r9) - -.Lst_partial_1: - ret -ENDPROC(__store_partial) - -/* - * void crypto_morus1280_avx2_init(void *state, const void *key, - * const void *iv); - */ -ENTRY(crypto_morus1280_avx2_init) - FRAME_BEGIN - - /* load IV: */ - vpxor STATE0, STATE0, STATE0 - movdqu (%rdx), STATE0_LOW - /* load key: */ - vmovdqu (%rsi), KEY - vmovdqa KEY, STATE1 - /* load all ones: */ - vpcmpeqd STATE2, STATE2, STATE2 - /* load all zeros: */ - vpxor STATE3, STATE3, STATE3 - /* load the constant: */ - vmovdqa .Lmorus1280_const, STATE4 - - /* update 16 times with zero: */ - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - - /* xor-in the key again after updates: */ - vpxor KEY, STATE1, STATE1 - - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_init) - -/* - * void crypto_morus1280_avx2_ad(void *state, const void *data, - * unsigned int length); - */ -ENTRY(crypto_morus1280_avx2_ad) - FRAME_BEGIN - - cmp $32, %rdx - jb .Lad_out - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - mov %rsi, %r8 - and $0x1F, %r8 - jnz .Lad_u_loop - -.align 4 -.Lad_a_loop: - vmovdqa (%rsi), MSG - call __morus1280_update - sub $32, %rdx - add $32, %rsi - cmp $32, %rdx - jge .Lad_a_loop - - jmp .Lad_cont -.align 4 -.Lad_u_loop: - vmovdqu (%rsi), MSG - call __morus1280_update - sub $32, %rdx - add $32, %rsi - cmp $32, %rdx - jge .Lad_u_loop - -.Lad_cont: - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - -.Lad_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_ad) - -/* - * void crypto_morus1280_avx2_enc(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_avx2_enc) - FRAME_BEGIN - - cmp $32, %rcx - jb .Lenc_out - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - mov %rsi, %r8 - or %rdx, %r8 - and $0x1F, %r8 - jnz .Lenc_u_loop - -.align 4 -.Lenc_a_loop: - vmovdqa (%rsi), MSG - vmovdqa MSG, T0 - vpxor STATE0, T0, T0 - vpermq $MASK3, STATE1, T1 - vpxor T1, T0, T0 - vpand STATE2, STATE3, T1 - vpxor T1, T0, T0 - vmovdqa T0, (%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Lenc_a_loop - - jmp .Lenc_cont -.align 4 -.Lenc_u_loop: - vmovdqu (%rsi), MSG - vmovdqa MSG, T0 - vpxor STATE0, T0, T0 - vpermq $MASK3, STATE1, T1 - vpxor T1, T0, T0 - vpand STATE2, STATE3, T1 - vpxor T1, T0, T0 - vmovdqu T0, (%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Lenc_u_loop - -.Lenc_cont: - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - -.Lenc_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_enc) - -/* - * void crypto_morus1280_avx2_enc_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_avx2_enc_tail) - FRAME_BEGIN - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - /* encrypt message: */ - call __load_partial - - vmovdqa MSG, T0 - vpxor STATE0, T0, T0 - vpermq $MASK3, STATE1, T1 - vpxor T1, T0, T0 - vpand STATE2, STATE3, T1 - vpxor T1, T0, T0 - - call __store_partial - - call __morus1280_update - - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_enc_tail) - -/* - * void crypto_morus1280_avx2_dec(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_avx2_dec) - FRAME_BEGIN - - cmp $32, %rcx - jb .Ldec_out - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - mov %rsi, %r8 - or %rdx, %r8 - and $0x1F, %r8 - jnz .Ldec_u_loop - -.align 4 -.Ldec_a_loop: - vmovdqa (%rsi), MSG - vpxor STATE0, MSG, MSG - vpermq $MASK3, STATE1, T0 - vpxor T0, MSG, MSG - vpand STATE2, STATE3, T0 - vpxor T0, MSG, MSG - vmovdqa MSG, (%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Ldec_a_loop - - jmp .Ldec_cont -.align 4 -.Ldec_u_loop: - vmovdqu (%rsi), MSG - vpxor STATE0, MSG, MSG - vpermq $MASK3, STATE1, T0 - vpxor T0, MSG, MSG - vpand STATE2, STATE3, T0 - vpxor T0, MSG, MSG - vmovdqu MSG, (%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Ldec_u_loop - -.Ldec_cont: - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - -.Ldec_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_dec) - -/* - * void crypto_morus1280_avx2_dec_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_avx2_dec_tail) - FRAME_BEGIN - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - /* decrypt message: */ - call __load_partial - - vpxor STATE0, MSG, MSG - vpermq $MASK3, STATE1, T0 - vpxor T0, MSG, MSG - vpand STATE2, STATE3, T0 - vpxor T0, MSG, MSG - vmovdqa MSG, T0 - - call __store_partial - - /* mask with byte count: */ - movq %rcx, T0_LOW - vpbroadcastb T0_LOW, T0 - vmovdqa .Lmorus1280_counter, T1 - vpcmpgtb T1, T0, T0 - vpand T0, MSG, MSG - - call __morus1280_update - - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_dec_tail) - -/* - * void crypto_morus1280_avx2_final(void *state, void *tag_xor, - * u64 assoclen, u64 cryptlen); - */ -ENTRY(crypto_morus1280_avx2_final) - FRAME_BEGIN - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - /* xor state[0] into state[4]: */ - vpxor STATE0, STATE4, STATE4 - - /* prepare length block: */ - vpxor MSG, MSG, MSG - vpinsrq $0, %rdx, MSG_LOW, MSG_LOW - vpinsrq $1, %rcx, MSG_LOW, MSG_LOW - vpsllq $3, MSG, MSG /* multiply by 8 (to get bit count) */ - - /* update state: */ - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - - /* xor tag: */ - vmovdqu (%rsi), MSG - - vpxor STATE0, MSG, MSG - vpermq $MASK3, STATE1, T0 - vpxor T0, MSG, MSG - vpand STATE2, STATE3, T0 - vpxor T0, MSG, MSG - vmovdqu MSG, (%rsi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_final) diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c deleted file mode 100644 index 2d000d66ba4c..000000000000 --- a/arch/x86/crypto/morus1280-avx2-glue.c +++ /dev/null @@ -1,62 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The MORUS-1280 Authenticated-Encryption Algorithm - * Glue for AVX2 implementation - * - * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <crypto/internal/aead.h> -#include <crypto/internal/simd.h> -#include <crypto/morus1280_glue.h> -#include <linux/module.h> -#include <asm/fpu/api.h> -#include <asm/cpu_device_id.h> - -asmlinkage void crypto_morus1280_avx2_init(void *state, const void *key, - const void *iv); -asmlinkage void crypto_morus1280_avx2_ad(void *state, const void *data, - unsigned int length); - -asmlinkage void crypto_morus1280_avx2_enc(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus1280_avx2_dec(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus1280_avx2_enc_tail(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus1280_avx2_dec_tail(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor, - u64 assoclen, u64 cryptlen); - -MORUS1280_DECLARE_ALG(avx2, "morus1280-avx2", 400); - -static struct simd_aead_alg *simd_alg; - -static int __init crypto_morus1280_avx2_module_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_AVX2) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || - !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) - return -ENODEV; - - return simd_register_aeads_compat(&crypto_morus1280_avx2_alg, 1, - &simd_alg); -} - -static void __exit crypto_morus1280_avx2_module_exit(void) -{ - simd_unregister_aeads(&crypto_morus1280_avx2_alg, 1, &simd_alg); -} - -module_init(crypto_morus1280_avx2_module_init); -module_exit(crypto_morus1280_avx2_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); -MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- AVX2 implementation"); -MODULE_ALIAS_CRYPTO("morus1280"); -MODULE_ALIAS_CRYPTO("morus1280-avx2"); diff --git a/arch/x86/crypto/morus1280-sse2-asm.S b/arch/x86/crypto/morus1280-sse2-asm.S deleted file mode 100644 index 0eece772866b..000000000000 --- a/arch/x86/crypto/morus1280-sse2-asm.S +++ /dev/null @@ -1,893 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * SSE2 implementation of MORUS-1280 - * - * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <linux/linkage.h> -#include <asm/frame.h> - -#define SHUFFLE_MASK(i0, i1, i2, i3) \ - (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) - -#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) - -#define STATE0_LO %xmm0 -#define STATE0_HI %xmm1 -#define STATE1_LO %xmm2 -#define STATE1_HI %xmm3 -#define STATE2_LO %xmm4 -#define STATE2_HI %xmm5 -#define STATE3_LO %xmm6 -#define STATE3_HI %xmm7 -#define STATE4_LO %xmm8 -#define STATE4_HI %xmm9 -#define KEY_LO %xmm10 -#define KEY_HI %xmm11 -#define MSG_LO %xmm10 -#define MSG_HI %xmm11 -#define T0_LO %xmm12 -#define T0_HI %xmm13 -#define T1_LO %xmm14 -#define T1_HI %xmm15 - -.section .rodata.cst16.morus640_const, "aM", @progbits, 16 -.align 16 -.Lmorus640_const_0: - .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d - .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 -.Lmorus640_const_1: - .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 - .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd - -.section .rodata.cst16.morus640_counter, "aM", @progbits, 16 -.align 16 -.Lmorus640_counter_0: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f -.Lmorus640_counter_1: - .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 - .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f - -.text - -.macro rol1 hi, lo - /* - * HI_1 | HI_0 || LO_1 | LO_0 - * ==> - * HI_0 | HI_1 || LO_1 | LO_0 - * ==> - * HI_0 | LO_1 || LO_0 | HI_1 - */ - pshufd $MASK2, \hi, \hi - movdqa \hi, T0_LO - punpcklqdq \lo, T0_LO - punpckhqdq \hi, \lo - movdqa \lo, \hi - movdqa T0_LO, \lo -.endm - -.macro rol2 hi, lo - movdqa \lo, T0_LO - movdqa \hi, \lo - movdqa T0_LO, \hi -.endm - -.macro rol3 hi, lo - /* - * HI_1 | HI_0 || LO_1 | LO_0 - * ==> - * HI_0 | HI_1 || LO_1 | LO_0 - * ==> - * LO_0 | HI_1 || HI_0 | LO_1 - */ - pshufd $MASK2, \hi, \hi - movdqa \lo, T0_LO - punpckhqdq \hi, T0_LO - punpcklqdq \lo, \hi - movdqa T0_LO, \lo -.endm - -.macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w - movdqa \s1_l, T0_LO - pand \s2_l, T0_LO - pxor T0_LO, \s0_l - - movdqa \s1_h, T0_LO - pand \s2_h, T0_LO - pxor T0_LO, \s0_h - - pxor \s3_l, \s0_l - pxor \s3_h, \s0_h - - movdqa \s0_l, T0_LO - psllq $\b, T0_LO - psrlq $(64 - \b), \s0_l - pxor T0_LO, \s0_l - - movdqa \s0_h, T0_LO - psllq $\b, T0_LO - psrlq $(64 - \b), \s0_h - pxor T0_LO, \s0_h - - \w \s3_h, \s3_l -.endm - -/* - * __morus1280_update: internal ABI - * input: - * STATE[0-4] - input state - * MSG - message block - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus1280_update: - morus1280_round \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - 13, rol1 - pxor MSG_LO, STATE1_LO - pxor MSG_HI, STATE1_HI - morus1280_round \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - 46, rol2 - pxor MSG_LO, STATE2_LO - pxor MSG_HI, STATE2_HI - morus1280_round \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - 38, rol3 - pxor MSG_LO, STATE3_LO - pxor MSG_HI, STATE3_HI - morus1280_round \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - 7, rol2 - pxor MSG_LO, STATE4_LO - pxor MSG_HI, STATE4_HI - morus1280_round \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - 4, rol1 - ret -ENDPROC(__morus1280_update) - -/* - * __morus1280_update_zero: internal ABI - * input: - * STATE[0-4] - input state - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus1280_update_zero: - morus1280_round \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - 13, rol1 - morus1280_round \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - 46, rol2 - morus1280_round \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - 38, rol3 - morus1280_round \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - 7, rol2 - morus1280_round \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - 4, rol1 - ret -ENDPROC(__morus1280_update_zero) - -/* - * __load_partial: internal ABI - * input: - * %rsi - src - * %rcx - bytes - * output: - * MSG - message block - * changed: - * %r8 - * %r9 - */ -__load_partial: - xor %r9d, %r9d - pxor MSG_LO, MSG_LO - pxor MSG_HI, MSG_HI - - mov %rcx, %r8 - and $0x1, %r8 - jz .Lld_partial_1 - - mov %rcx, %r8 - and $0x1E, %r8 - add %rsi, %r8 - mov (%r8), %r9b - -.Lld_partial_1: - mov %rcx, %r8 - and $0x2, %r8 - jz .Lld_partial_2 - - mov %rcx, %r8 - and $0x1C, %r8 - add %rsi, %r8 - shl $16, %r9 - mov (%r8), %r9w - -.Lld_partial_2: - mov %rcx, %r8 - and $0x4, %r8 - jz .Lld_partial_4 - - mov %rcx, %r8 - and $0x18, %r8 - add %rsi, %r8 - shl $32, %r9 - mov (%r8), %r8d - xor %r8, %r9 - -.Lld_partial_4: - movq %r9, MSG_LO - - mov %rcx, %r8 - and $0x8, %r8 - jz .Lld_partial_8 - - mov %rcx, %r8 - and $0x10, %r8 - add %rsi, %r8 - pslldq $8, MSG_LO - movq (%r8), T0_LO - pxor T0_LO, MSG_LO - -.Lld_partial_8: - mov %rcx, %r8 - and $0x10, %r8 - jz .Lld_partial_16 - - movdqa MSG_LO, MSG_HI - movdqu (%rsi), MSG_LO - -.Lld_partial_16: - ret -ENDPROC(__load_partial) - -/* - * __store_partial: internal ABI - * input: - * %rdx - dst - * %rcx - bytes - * output: - * T0 - message block - * changed: - * %r8 - * %r9 - * %r10 - */ -__store_partial: - mov %rcx, %r8 - mov %rdx, %r9 - - cmp $16, %r8 - jl .Lst_partial_16 - - movdqu T0_LO, (%r9) - movdqa T0_HI, T0_LO - - sub $16, %r8 - add $16, %r9 - -.Lst_partial_16: - movq T0_LO, %r10 - - cmp $8, %r8 - jl .Lst_partial_8 - - mov %r10, (%r9) - psrldq $8, T0_LO - movq T0_LO, %r10 - - sub $8, %r8 - add $8, %r9 - -.Lst_partial_8: - cmp $4, %r8 - jl .Lst_partial_4 - - mov %r10d, (%r9) - shr $32, %r10 - - sub $4, %r8 - add $4, %r9 - -.Lst_partial_4: - cmp $2, %r8 - jl .Lst_partial_2 - - mov %r10w, (%r9) - shr $16, %r10 - - sub $2, %r8 - add $2, %r9 - -.Lst_partial_2: - cmp $1, %r8 - jl .Lst_partial_1 - - mov %r10b, (%r9) - -.Lst_partial_1: - ret -ENDPROC(__store_partial) - -/* - * void crypto_morus1280_sse2_init(void *state, const void *key, - * const void *iv); - */ -ENTRY(crypto_morus1280_sse2_init) - FRAME_BEGIN - - /* load IV: */ - pxor STATE0_HI, STATE0_HI - movdqu (%rdx), STATE0_LO - /* load key: */ - movdqu 0(%rsi), KEY_LO - movdqu 16(%rsi), KEY_HI - movdqa KEY_LO, STATE1_LO - movdqa KEY_HI, STATE1_HI - /* load all ones: */ - pcmpeqd STATE2_LO, STATE2_LO - pcmpeqd STATE2_HI, STATE2_HI - /* load all zeros: */ - pxor STATE3_LO, STATE3_LO - pxor STATE3_HI, STATE3_HI - /* load the constant: */ - movdqa .Lmorus640_const_0, STATE4_LO - movdqa .Lmorus640_const_1, STATE4_HI - - /* update 16 times with zero: */ - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - - /* xor-in the key again after updates: */ - pxor KEY_LO, STATE1_LO - pxor KEY_HI, STATE1_HI - - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_init) - -/* - * void crypto_morus1280_sse2_ad(void *state, const void *data, - * unsigned int length); - */ -ENTRY(crypto_morus1280_sse2_ad) - FRAME_BEGIN - - cmp $32, %rdx - jb .Lad_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - mov %rsi, %r8 - and $0xF, %r8 - jnz .Lad_u_loop - -.align 4 -.Lad_a_loop: - movdqa 0(%rsi), MSG_LO - movdqa 16(%rsi), MSG_HI - call __morus1280_update - sub $32, %rdx - add $32, %rsi - cmp $32, %rdx - jge .Lad_a_loop - - jmp .Lad_cont -.align 4 -.Lad_u_loop: - movdqu 0(%rsi), MSG_LO - movdqu 16(%rsi), MSG_HI - call __morus1280_update - sub $32, %rdx - add $32, %rsi - cmp $32, %rdx - jge .Lad_u_loop - -.Lad_cont: - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - -.Lad_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_ad) - -/* - * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_sse2_enc) - FRAME_BEGIN - - cmp $32, %rcx - jb .Lenc_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - mov %rsi, %r8 - or %rdx, %r8 - and $0xF, %r8 - jnz .Lenc_u_loop - -.align 4 -.Lenc_a_loop: - movdqa 0(%rsi), MSG_LO - movdqa 16(%rsi), MSG_HI - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - movdqa MSG_LO, T0_LO - movdqa MSG_HI, T0_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - pxor STATE0_LO, T0_LO - pxor STATE0_HI, T0_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand STATE3_HI, T1_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - movdqa T0_LO, 0(%rdx) - movdqa T0_HI, 16(%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Lenc_a_loop - - jmp .Lenc_cont -.align 4 -.Lenc_u_loop: - movdqu 0(%rsi), MSG_LO - movdqu 16(%rsi), MSG_HI - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - movdqa MSG_LO, T0_LO - movdqa MSG_HI, T0_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - pxor STATE0_LO, T0_LO - pxor STATE0_HI, T0_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand STATE3_HI, T1_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - movdqu T0_LO, 0(%rdx) - movdqu T0_HI, 16(%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Lenc_u_loop - -.Lenc_cont: - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - -.Lenc_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_enc) - -/* - * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_sse2_enc_tail) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - /* encrypt message: */ - call __load_partial - - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - movdqa MSG_LO, T0_LO - movdqa MSG_HI, T0_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - pxor STATE0_LO, T0_LO - pxor STATE0_HI, T0_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand STATE3_HI, T1_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - - call __store_partial - - call __morus1280_update - - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_enc_tail) - -/* - * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_sse2_dec) - FRAME_BEGIN - - cmp $32, %rcx - jb .Ldec_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - mov %rsi, %r8 - or %rdx, %r8 - and $0xF, %r8 - jnz .Ldec_u_loop - -.align 4 -.Ldec_a_loop: - movdqa 0(%rsi), MSG_LO - movdqa 16(%rsi), MSG_HI - pxor STATE0_LO, MSG_LO - pxor STATE0_HI, MSG_HI - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand STATE3_HI, T1_HI - pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqa MSG_LO, 0(%rdx) - movdqa MSG_HI, 16(%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Ldec_a_loop - - jmp .Ldec_cont -.align 4 -.Ldec_u_loop: - movdqu 0(%rsi), MSG_LO - movdqu 16(%rsi), MSG_HI - pxor STATE0_LO, MSG_LO - pxor STATE0_HI, MSG_HI - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand STATE3_HI, T1_HI - pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqu MSG_LO, 0(%rdx) - movdqu MSG_HI, 16(%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Ldec_u_loop - -.Ldec_cont: - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - -.Ldec_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_dec) - -/* - * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_sse2_dec_tail) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - /* decrypt message: */ - call __load_partial - - pxor STATE0_LO, MSG_LO - pxor STATE0_HI, MSG_HI - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand STATE3_HI, T1_HI - pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqa MSG_LO, T0_LO - movdqa MSG_HI, T0_HI - - call __store_partial - - /* mask with byte count: */ - movq %rcx, T0_LO - punpcklbw T0_LO, T0_LO - punpcklbw T0_LO, T0_LO - punpcklbw T0_LO, T0_LO - punpcklbw T0_LO, T0_LO - movdqa T0_LO, T0_HI - movdqa .Lmorus640_counter_0, T1_LO - movdqa .Lmorus640_counter_1, T1_HI - pcmpgtb T1_LO, T0_LO - pcmpgtb T1_HI, T0_HI - pand T0_LO, MSG_LO - pand T0_HI, MSG_HI - - call __morus1280_update - - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_dec_tail) - -/* - * void crypto_morus1280_sse2_final(void *state, void *tag_xor, - * u64 assoclen, u64 cryptlen); - */ -ENTRY(crypto_morus1280_sse2_final) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - /* xor state[0] into state[4]: */ - pxor STATE0_LO, STATE4_LO - pxor STATE0_HI, STATE4_HI - - /* prepare length block: */ - movq %rdx, MSG_LO - movq %rcx, T0_LO - pslldq $8, T0_LO - pxor T0_LO, MSG_LO - psllq $3, MSG_LO /* multiply by 8 (to get bit count) */ - pxor MSG_HI, MSG_HI - - /* update state: */ - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - - /* xor tag: */ - movdqu 0(%rsi), MSG_LO - movdqu 16(%rsi), MSG_HI - - pxor STATE0_LO, MSG_LO - pxor STATE0_HI, MSG_HI - movdqa STATE1_LO, T0_LO - movdqa STATE1_HI, T0_HI - rol3 T0_HI, T0_LO - pxor T0_LO, MSG_LO - pxor T0_HI, MSG_HI - movdqa STATE2_LO, T0_LO - movdqa STATE2_HI, T0_HI - pand STATE3_LO, T0_LO - pand STATE3_HI, T0_HI - pxor T0_LO, MSG_LO - pxor T0_HI, MSG_HI - - movdqu MSG_LO, 0(%rsi) - movdqu MSG_HI, 16(%rsi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_final) diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c deleted file mode 100644 index aada9d774293..000000000000 --- a/arch/x86/crypto/morus1280-sse2-glue.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The MORUS-1280 Authenticated-Encryption Algorithm - * Glue for SSE2 implementation - * - * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <crypto/internal/aead.h> -#include <crypto/internal/simd.h> -#include <crypto/morus1280_glue.h> -#include <linux/module.h> -#include <asm/fpu/api.h> -#include <asm/cpu_device_id.h> - -asmlinkage void crypto_morus1280_sse2_init(void *state, const void *key, - const void *iv); -asmlinkage void crypto_morus1280_sse2_ad(void *state, const void *data, - unsigned int length); - -asmlinkage void crypto_morus1280_sse2_enc(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus1280_sse2_dec(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus1280_sse2_enc_tail(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus1280_sse2_dec_tail(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor, - u64 assoclen, u64 cryptlen); - -MORUS1280_DECLARE_ALG(sse2, "morus1280-sse2", 350); - -static struct simd_aead_alg *simd_alg; - -static int __init crypto_morus1280_sse2_module_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_XMM2) || - !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) - return -ENODEV; - - return simd_register_aeads_compat(&crypto_morus1280_sse2_alg, 1, - &simd_alg); -} - -static void __exit crypto_morus1280_sse2_module_exit(void) -{ - simd_unregister_aeads(&crypto_morus1280_sse2_alg, 1, &simd_alg); -} - -module_init(crypto_morus1280_sse2_module_init); -module_exit(crypto_morus1280_sse2_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); -MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- SSE2 implementation"); -MODULE_ALIAS_CRYPTO("morus1280"); -MODULE_ALIAS_CRYPTO("morus1280-sse2"); diff --git a/arch/x86/crypto/morus1280_glue.c b/arch/x86/crypto/morus1280_glue.c deleted file mode 100644 index ffbde8b22838..000000000000 --- a/arch/x86/crypto/morus1280_glue.c +++ /dev/null @@ -1,205 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The MORUS-1280 Authenticated-Encryption Algorithm - * Common x86 SIMD glue skeleton - * - * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <crypto/internal/aead.h> -#include <crypto/internal/skcipher.h> -#include <crypto/morus1280_glue.h> -#include <crypto/scatterwalk.h> -#include <linux/err.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/scatterlist.h> -#include <asm/fpu/api.h> - -struct morus1280_state { - struct morus1280_block s[MORUS_STATE_BLOCKS]; -}; - -struct morus1280_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); - - void (*crypt_blocks)(void *state, const void *src, void *dst, - unsigned int length); - void (*crypt_tail)(void *state, const void *src, void *dst, - unsigned int length); -}; - -static void crypto_morus1280_glue_process_ad( - struct morus1280_state *state, - const struct morus1280_glue_ops *ops, - struct scatterlist *sg_src, unsigned int assoclen) -{ - struct scatter_walk walk; - struct morus1280_block buf; - unsigned int pos = 0; - - scatterwalk_start(&walk, sg_src); - while (assoclen != 0) { - unsigned int size = scatterwalk_clamp(&walk, assoclen); - unsigned int left = size; - void *mapped = scatterwalk_map(&walk); - const u8 *src = (const u8 *)mapped; - - if (pos + size >= MORUS1280_BLOCK_SIZE) { - if (pos > 0) { - unsigned int fill = MORUS1280_BLOCK_SIZE - pos; - memcpy(buf.bytes + pos, src, fill); - ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE); - pos = 0; - left -= fill; - src += fill; - } - - ops->ad(state, src, left); - src += left & ~(MORUS1280_BLOCK_SIZE - 1); - left &= MORUS1280_BLOCK_SIZE - 1; - } - - memcpy(buf.bytes + pos, src, left); - - pos += left; - assoclen -= size; - scatterwalk_unmap(mapped); - scatterwalk_advance(&walk, size); - scatterwalk_done(&walk, 0, assoclen); - } - - if (pos > 0) { - memset(buf.bytes + pos, 0, MORUS1280_BLOCK_SIZE - pos); - ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE); - } -} - -static void crypto_morus1280_glue_process_crypt(struct morus1280_state *state, - struct morus1280_ops ops, - struct skcipher_walk *walk) -{ - while (walk->nbytes >= MORUS1280_BLOCK_SIZE) { - ops.crypt_blocks(state, walk->src.virt.addr, - walk->dst.virt.addr, - round_down(walk->nbytes, - MORUS1280_BLOCK_SIZE)); - skcipher_walk_done(walk, walk->nbytes % MORUS1280_BLOCK_SIZE); - } - - if (walk->nbytes) { - ops.crypt_tail(state, walk->src.virt.addr, walk->dst.virt.addr, - walk->nbytes); - skcipher_walk_done(walk, 0); - } -} - -int crypto_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct morus1280_ctx *ctx = crypto_aead_ctx(aead); - - if (keylen == MORUS1280_BLOCK_SIZE) { - memcpy(ctx->key.bytes, key, MORUS1280_BLOCK_SIZE); - } else if (keylen == MORUS1280_BLOCK_SIZE / 2) { - memcpy(ctx->key.bytes, key, keylen); - memcpy(ctx->key.bytes + keylen, key, keylen); - } else { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - - return 0; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setkey); - -int crypto_morus1280_glue_setauthsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setauthsize); - -static void crypto_morus1280_glue_crypt(struct aead_request *req, - struct morus1280_ops ops, - unsigned int cryptlen, - struct morus1280_block *tag_xor) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); - struct morus1280_state state; - struct skcipher_walk walk; - - ops.skcipher_walk_init(&walk, req, true); - - kernel_fpu_begin(); - - ctx->ops->init(&state, &ctx->key, req->iv); - crypto_morus1280_glue_process_ad(&state, ctx->ops, req->src, req->assoclen); - crypto_morus1280_glue_process_crypt(&state, ops, &walk); - ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen); - - kernel_fpu_end(); -} - -int crypto_morus1280_glue_encrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); - struct morus1280_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_blocks = ctx->ops->enc, - .crypt_tail = ctx->ops->enc_tail, - }; - - struct morus1280_block tag = {}; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen; - - crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag); - - scatterwalk_map_and_copy(tag.bytes, req->dst, - req->assoclen + cryptlen, authsize, 1); - return 0; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_encrypt); - -int crypto_morus1280_glue_decrypt(struct aead_request *req) -{ - static const u8 zeros[MORUS1280_BLOCK_SIZE] = {}; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); - struct morus1280_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_blocks = ctx->ops->dec, - .crypt_tail = ctx->ops->dec_tail, - }; - - struct morus1280_block tag; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen - authsize; - - scatterwalk_map_and_copy(tag.bytes, req->src, - req->assoclen + cryptlen, authsize, 0); - - crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag); - - return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_decrypt); - -void crypto_morus1280_glue_init_ops(struct crypto_aead *aead, - const struct morus1280_glue_ops *ops) -{ - struct morus1280_ctx *ctx = crypto_aead_ctx(aead); - ctx->ops = ops; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_init_ops); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); -MODULE_DESCRIPTION("MORUS-1280 AEAD mode -- glue for x86 optimizations"); diff --git a/arch/x86/crypto/morus640-sse2-asm.S b/arch/x86/crypto/morus640-sse2-asm.S deleted file mode 100644 index a60891101bbd..000000000000 --- a/arch/x86/crypto/morus640-sse2-asm.S +++ /dev/null @@ -1,612 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * SSE2 implementation of MORUS-640 - * - * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <linux/linkage.h> -#include <asm/frame.h> - -#define SHUFFLE_MASK(i0, i1, i2, i3) \ - (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) - -#define MASK1 SHUFFLE_MASK(3, 0, 1, 2) -#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) -#define MASK3 SHUFFLE_MASK(1, 2, 3, 0) - -#define STATE0 %xmm0 -#define STATE1 %xmm1 -#define STATE2 %xmm2 -#define STATE3 %xmm3 -#define STATE4 %xmm4 -#define KEY %xmm5 -#define MSG %xmm5 -#define T0 %xmm6 -#define T1 %xmm7 - -.section .rodata.cst16.morus640_const, "aM", @progbits, 32 -.align 16 -.Lmorus640_const_0: - .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d - .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 -.Lmorus640_const_1: - .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 - .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd - -.section .rodata.cst16.morus640_counter, "aM", @progbits, 16 -.align 16 -.Lmorus640_counter: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f - -.text - -.macro morus640_round s0, s1, s2, s3, s4, b, w - movdqa \s1, T0 - pand \s2, T0 - pxor T0, \s0 - pxor \s3, \s0 - movdqa \s0, T0 - pslld $\b, T0 - psrld $(32 - \b), \s0 - pxor T0, \s0 - pshufd $\w, \s3, \s3 -.endm - -/* - * __morus640_update: internal ABI - * input: - * STATE[0-4] - input state - * MSG - message block - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus640_update: - morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1 - pxor MSG, STATE1 - morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2 - pxor MSG, STATE2 - morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3 - pxor MSG, STATE3 - morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2 - pxor MSG, STATE4 - morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1 - ret -ENDPROC(__morus640_update) - - -/* - * __morus640_update_zero: internal ABI - * input: - * STATE[0-4] - input state - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus640_update_zero: - morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1 - morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2 - morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3 - morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2 - morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1 - ret -ENDPROC(__morus640_update_zero) - -/* - * __load_partial: internal ABI - * input: - * %rsi - src - * %rcx - bytes - * output: - * MSG - message block - * changed: - * T0 - * %r8 - * %r9 - */ -__load_partial: - xor %r9d, %r9d - pxor MSG, MSG - - mov %rcx, %r8 - and $0x1, %r8 - jz .Lld_partial_1 - - mov %rcx, %r8 - and $0x1E, %r8 - add %rsi, %r8 - mov (%r8), %r9b - -.Lld_partial_1: - mov %rcx, %r8 - and $0x2, %r8 - jz .Lld_partial_2 - - mov %rcx, %r8 - and $0x1C, %r8 - add %rsi, %r8 - shl $16, %r9 - mov (%r8), %r9w - -.Lld_partial_2: - mov %rcx, %r8 - and $0x4, %r8 - jz .Lld_partial_4 - - mov %rcx, %r8 - and $0x18, %r8 - add %rsi, %r8 - shl $32, %r9 - mov (%r8), %r8d - xor %r8, %r9 - -.Lld_partial_4: - movq %r9, MSG - - mov %rcx, %r8 - and $0x8, %r8 - jz .Lld_partial_8 - - mov %rcx, %r8 - and $0x10, %r8 - add %rsi, %r8 - pslldq $8, MSG - movq (%r8), T0 - pxor T0, MSG - -.Lld_partial_8: - ret -ENDPROC(__load_partial) - -/* - * __store_partial: internal ABI - * input: - * %rdx - dst - * %rcx - bytes - * output: - * T0 - message block - * changed: - * %r8 - * %r9 - * %r10 - */ -__store_partial: - mov %rcx, %r8 - mov %rdx, %r9 - - movq T0, %r10 - - cmp $8, %r8 - jl .Lst_partial_8 - - mov %r10, (%r9) - psrldq $8, T0 - movq T0, %r10 - - sub $8, %r8 - add $8, %r9 - -.Lst_partial_8: - cmp $4, %r8 - jl .Lst_partial_4 - - mov %r10d, (%r9) - shr $32, %r10 - - sub $4, %r8 - add $4, %r9 - -.Lst_partial_4: - cmp $2, %r8 - jl .Lst_partial_2 - - mov %r10w, (%r9) - shr $16, %r10 - - sub $2, %r8 - add $2, %r9 - -.Lst_partial_2: - cmp $1, %r8 - jl .Lst_partial_1 - - mov %r10b, (%r9) - -.Lst_partial_1: - ret -ENDPROC(__store_partial) - -/* - * void crypto_morus640_sse2_init(void *state, const void *key, const void *iv); - */ -ENTRY(crypto_morus640_sse2_init) - FRAME_BEGIN - - /* load IV: */ - movdqu (%rdx), STATE0 - /* load key: */ - movdqu (%rsi), KEY - movdqa KEY, STATE1 - /* load all ones: */ - pcmpeqd STATE2, STATE2 - /* load the constants: */ - movdqa .Lmorus640_const_0, STATE3 - movdqa .Lmorus640_const_1, STATE4 - - /* update 16 times with zero: */ - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - - /* xor-in the key again after updates: */ - pxor KEY, STATE1 - - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_init) - -/* - * void crypto_morus640_sse2_ad(void *state, const void *data, - * unsigned int length); - */ -ENTRY(crypto_morus640_sse2_ad) - FRAME_BEGIN - - cmp $16, %rdx - jb .Lad_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - mov %rsi, %r8 - and $0xF, %r8 - jnz .Lad_u_loop - -.align 4 -.Lad_a_loop: - movdqa (%rsi), MSG - call __morus640_update - sub $16, %rdx - add $16, %rsi - cmp $16, %rdx - jge .Lad_a_loop - - jmp .Lad_cont -.align 4 -.Lad_u_loop: - movdqu (%rsi), MSG - call __morus640_update - sub $16, %rdx - add $16, %rsi - cmp $16, %rdx - jge .Lad_u_loop - -.Lad_cont: - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 16)(%rdi) - -.Lad_out: - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_ad) - -/* - * void crypto_morus640_sse2_enc(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus640_sse2_enc) - FRAME_BEGIN - - cmp $16, %rcx - jb .Lenc_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - mov %rsi, %r8 - or %rdx, %r8 - and $0xF, %r8 - jnz .Lenc_u_loop - -.align 4 -.Lenc_a_loop: - movdqa (%rsi), MSG - movdqa MSG, T0 - pxor STATE0, T0 - pshufd $MASK3, STATE1, T1 - pxor T1, T0 - movdqa STATE2, T1 - pand STATE3, T1 - pxor T1, T0 - movdqa T0, (%rdx) - - call __morus640_update - sub $16, %rcx - add $16, %rsi - add $16, %rdx - cmp $16, %rcx - jge .Lenc_a_loop - - jmp .Lenc_cont -.align 4 -.Lenc_u_loop: - movdqu (%rsi), MSG - movdqa MSG, T0 - pxor STATE0, T0 - pshufd $MASK3, STATE1, T1 - pxor T1, T0 - movdqa STATE2, T1 - pand STATE3, T1 - pxor T1, T0 - movdqu T0, (%rdx) - - call __morus640_update - sub $16, %rcx - add $16, %rsi - add $16, %rdx - cmp $16, %rcx - jge .Lenc_u_loop - -.Lenc_cont: - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 16)(%rdi) - -.Lenc_out: - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_enc) - -/* - * void crypto_morus640_sse2_enc_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus640_sse2_enc_tail) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - /* encrypt message: */ - call __load_partial - - movdqa MSG, T0 - pxor STATE0, T0 - pshufd $MASK3, STATE1, T1 - pxor T1, T0 - movdqa STATE2, T1 - pand STATE3, T1 - pxor T1, T0 - - call __store_partial - - call __morus640_update - - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_enc_tail) - -/* - * void crypto_morus640_sse2_dec(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus640_sse2_dec) - FRAME_BEGIN - - cmp $16, %rcx - jb .Ldec_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - mov %rsi, %r8 - or %rdx, %r8 - and $0xF, %r8 - jnz .Ldec_u_loop - -.align 4 -.Ldec_a_loop: - movdqa (%rsi), MSG - pxor STATE0, MSG - pshufd $MASK3, STATE1, T0 - pxor T0, MSG - movdqa STATE2, T0 - pand STATE3, T0 - pxor T0, MSG - movdqa MSG, (%rdx) - - call __morus640_update - sub $16, %rcx - add $16, %rsi - add $16, %rdx - cmp $16, %rcx - jge .Ldec_a_loop - - jmp .Ldec_cont -.align 4 -.Ldec_u_loop: - movdqu (%rsi), MSG - pxor STATE0, MSG - pshufd $MASK3, STATE1, T0 - pxor T0, MSG - movdqa STATE2, T0 - pand STATE3, T0 - pxor T0, MSG - movdqu MSG, (%rdx) - - call __morus640_update - sub $16, %rcx - add $16, %rsi - add $16, %rdx - cmp $16, %rcx - jge .Ldec_u_loop - -.Ldec_cont: - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 16)(%rdi) - -.Ldec_out: - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_dec) - -/* - * void crypto_morus640_sse2_dec_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus640_sse2_dec_tail) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - /* decrypt message: */ - call __load_partial - - pxor STATE0, MSG - pshufd $MASK3, STATE1, T0 - pxor T0, MSG - movdqa STATE2, T0 - pand STATE3, T0 - pxor T0, MSG - movdqa MSG, T0 - - call __store_partial - - /* mask with byte count: */ - movq %rcx, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - movdqa .Lmorus640_counter, T1 - pcmpgtb T1, T0 - pand T0, MSG - - call __morus640_update - - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_dec_tail) - -/* - * void crypto_morus640_sse2_final(void *state, void *tag_xor, - * u64 assoclen, u64 cryptlen); - */ -ENTRY(crypto_morus640_sse2_final) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - /* xor state[0] into state[4]: */ - pxor STATE0, STATE4 - - /* prepare length block: */ - movq %rdx, MSG - movq %rcx, T0 - pslldq $8, T0 - pxor T0, MSG - psllq $3, MSG /* multiply by 8 (to get bit count) */ - - /* update state: */ - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - - /* xor tag: */ - movdqu (%rsi), MSG - - pxor STATE0, MSG - pshufd $MASK3, STATE1, T0 - pxor T0, MSG - movdqa STATE2, T0 - pand STATE3, T0 - pxor T0, MSG - - movdqu MSG, (%rsi) - - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_final) diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c deleted file mode 100644 index 8ef68134aef4..000000000000 --- a/arch/x86/crypto/morus640-sse2-glue.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The MORUS-640 Authenticated-Encryption Algorithm - * Glue for SSE2 implementation - * - * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <crypto/internal/aead.h> -#include <crypto/internal/simd.h> -#include <crypto/morus640_glue.h> -#include <linux/module.h> -#include <asm/fpu/api.h> -#include <asm/cpu_device_id.h> - -asmlinkage void crypto_morus640_sse2_init(void *state, const void *key, - const void *iv); -asmlinkage void crypto_morus640_sse2_ad(void *state, const void *data, - unsigned int length); - -asmlinkage void crypto_morus640_sse2_enc(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus640_sse2_dec(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus640_sse2_enc_tail(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus640_sse2_dec_tail(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor, - u64 assoclen, u64 cryptlen); - -MORUS640_DECLARE_ALG(sse2, "morus640-sse2", 400); - -static struct simd_aead_alg *simd_alg; - -static int __init crypto_morus640_sse2_module_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_XMM2) || - !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) - return -ENODEV; - - return simd_register_aeads_compat(&crypto_morus640_sse2_alg, 1, - &simd_alg); -} - -static void __exit crypto_morus640_sse2_module_exit(void) -{ - simd_unregister_aeads(&crypto_morus640_sse2_alg, 1, &simd_alg); -} - -module_init(crypto_morus640_sse2_module_init); -module_exit(crypto_morus640_sse2_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); -MODULE_DESCRIPTION("MORUS-640 AEAD algorithm -- SSE2 implementation"); -MODULE_ALIAS_CRYPTO("morus640"); -MODULE_ALIAS_CRYPTO("morus640-sse2"); diff --git a/arch/x86/crypto/morus640_glue.c b/arch/x86/crypto/morus640_glue.c deleted file mode 100644 index d8b5fd6cef29..000000000000 --- a/arch/x86/crypto/morus640_glue.c +++ /dev/null @@ -1,200 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The MORUS-640 Authenticated-Encryption Algorithm - * Common x86 SIMD glue skeleton - * - * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <crypto/internal/aead.h> -#include <crypto/internal/skcipher.h> -#include <crypto/morus640_glue.h> -#include <crypto/scatterwalk.h> -#include <linux/err.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/scatterlist.h> -#include <asm/fpu/api.h> - -struct morus640_state { - struct morus640_block s[MORUS_STATE_BLOCKS]; -}; - -struct morus640_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); - - void (*crypt_blocks)(void *state, const void *src, void *dst, - unsigned int length); - void (*crypt_tail)(void *state, const void *src, void *dst, - unsigned int length); -}; - -static void crypto_morus640_glue_process_ad( - struct morus640_state *state, - const struct morus640_glue_ops *ops, - struct scatterlist *sg_src, unsigned int assoclen) -{ - struct scatter_walk walk; - struct morus640_block buf; - unsigned int pos = 0; - - scatterwalk_start(&walk, sg_src); - while (assoclen != 0) { - unsigned int size = scatterwalk_clamp(&walk, assoclen); - unsigned int left = size; - void *mapped = scatterwalk_map(&walk); - const u8 *src = (const u8 *)mapped; - - if (pos + size >= MORUS640_BLOCK_SIZE) { - if (pos > 0) { - unsigned int fill = MORUS640_BLOCK_SIZE - pos; - memcpy(buf.bytes + pos, src, fill); - ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE); - pos = 0; - left -= fill; - src += fill; - } - - ops->ad(state, src, left); - src += left & ~(MORUS640_BLOCK_SIZE - 1); - left &= MORUS640_BLOCK_SIZE - 1; - } - - memcpy(buf.bytes + pos, src, left); - - pos += left; - assoclen -= size; - scatterwalk_unmap(mapped); - scatterwalk_advance(&walk, size); - scatterwalk_done(&walk, 0, assoclen); - } - - if (pos > 0) { - memset(buf.bytes + pos, 0, MORUS640_BLOCK_SIZE - pos); - ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE); - } -} - -static void crypto_morus640_glue_process_crypt(struct morus640_state *state, - struct morus640_ops ops, - struct skcipher_walk *walk) -{ - while (walk->nbytes >= MORUS640_BLOCK_SIZE) { - ops.crypt_blocks(state, walk->src.virt.addr, - walk->dst.virt.addr, - round_down(walk->nbytes, MORUS640_BLOCK_SIZE)); - skcipher_walk_done(walk, walk->nbytes % MORUS640_BLOCK_SIZE); - } - - if (walk->nbytes) { - ops.crypt_tail(state, walk->src.virt.addr, walk->dst.virt.addr, - walk->nbytes); - skcipher_walk_done(walk, 0); - } -} - -int crypto_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct morus640_ctx *ctx = crypto_aead_ctx(aead); - - if (keylen != MORUS640_BLOCK_SIZE) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - - memcpy(ctx->key.bytes, key, MORUS640_BLOCK_SIZE); - return 0; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_setkey); - -int crypto_morus640_glue_setauthsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_setauthsize); - -static void crypto_morus640_glue_crypt(struct aead_request *req, - struct morus640_ops ops, - unsigned int cryptlen, - struct morus640_block *tag_xor) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus640_ctx *ctx = crypto_aead_ctx(tfm); - struct morus640_state state; - struct skcipher_walk walk; - - ops.skcipher_walk_init(&walk, req, true); - - kernel_fpu_begin(); - - ctx->ops->init(&state, &ctx->key, req->iv); - crypto_morus640_glue_process_ad(&state, ctx->ops, req->src, req->assoclen); - crypto_morus640_glue_process_crypt(&state, ops, &walk); - ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen); - - kernel_fpu_end(); -} - -int crypto_morus640_glue_encrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus640_ctx *ctx = crypto_aead_ctx(tfm); - struct morus640_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_blocks = ctx->ops->enc, - .crypt_tail = ctx->ops->enc_tail, - }; - - struct morus640_block tag = {}; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen; - - crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag); - - scatterwalk_map_and_copy(tag.bytes, req->dst, - req->assoclen + cryptlen, authsize, 1); - return 0; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_encrypt); - -int crypto_morus640_glue_decrypt(struct aead_request *req) -{ - static const u8 zeros[MORUS640_BLOCK_SIZE] = {}; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus640_ctx *ctx = crypto_aead_ctx(tfm); - struct morus640_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_blocks = ctx->ops->dec, - .crypt_tail = ctx->ops->dec_tail, - }; - - struct morus640_block tag; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen - authsize; - - scatterwalk_map_and_copy(tag.bytes, req->src, - req->assoclen + cryptlen, authsize, 0); - - crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag); - - return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_decrypt); - -void crypto_morus640_glue_init_ops(struct crypto_aead *aead, - const struct morus640_glue_ops *ops) -{ - struct morus640_ctx *ctx = crypto_aead_ctx(aead); - ctx->ops = ops; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_init_ops); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); -MODULE_DESCRIPTION("MORUS-640 AEAD mode -- glue for x86 optimizations"); diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S index f7946ea1b704..b22c7b936272 100644 --- a/arch/x86/crypto/nh-avx2-x86_64.S +++ b/arch/x86/crypto/nh-avx2-x86_64.S @@ -69,7 +69,7 @@ * * It's guaranteed that message_len % 16 == 0. */ -ENTRY(nh_avx2) +SYM_FUNC_START(nh_avx2) vmovdqu 0x00(KEY), K0 vmovdqu 0x10(KEY), K1 @@ -154,4 +154,4 @@ ENTRY(nh_avx2) vpaddq T4, T0, T0 vmovdqu T0, (HASH) ret -ENDPROC(nh_avx2) +SYM_FUNC_END(nh_avx2) diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S index 51f52d4ab4bb..d7ae22dd6683 100644 --- a/arch/x86/crypto/nh-sse2-x86_64.S +++ b/arch/x86/crypto/nh-sse2-x86_64.S @@ -71,7 +71,7 @@ * * It's guaranteed that message_len % 16 == 0. */ -ENTRY(nh_sse2) +SYM_FUNC_START(nh_sse2) movdqu 0x00(KEY), K0 movdqu 0x10(KEY), K1 @@ -120,4 +120,4 @@ ENTRY(nh_sse2) movdqu T0, 0x00(HASH) movdqu T1, 0x10(HASH) ret -ENDPROC(nh_sse2) +SYM_FUNC_END(nh_sse2) diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S deleted file mode 100644 index 8b341bc29d41..000000000000 --- a/arch/x86/crypto/poly1305-avx2-x86_64.S +++ /dev/null @@ -1,390 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions - * - * Copyright (C) 2015 Martin Willi - */ - -#include <linux/linkage.h> - -.section .rodata.cst32.ANMASK, "aM", @progbits, 32 -.align 32 -ANMASK: .octa 0x0000000003ffffff0000000003ffffff - .octa 0x0000000003ffffff0000000003ffffff - -.section .rodata.cst32.ORMASK, "aM", @progbits, 32 -.align 32 -ORMASK: .octa 0x00000000010000000000000001000000 - .octa 0x00000000010000000000000001000000 - -.text - -#define h0 0x00(%rdi) -#define h1 0x04(%rdi) -#define h2 0x08(%rdi) -#define h3 0x0c(%rdi) -#define h4 0x10(%rdi) -#define r0 0x00(%rdx) -#define r1 0x04(%rdx) -#define r2 0x08(%rdx) -#define r3 0x0c(%rdx) -#define r4 0x10(%rdx) -#define u0 0x00(%r8) -#define u1 0x04(%r8) -#define u2 0x08(%r8) -#define u3 0x0c(%r8) -#define u4 0x10(%r8) -#define w0 0x14(%r8) -#define w1 0x18(%r8) -#define w2 0x1c(%r8) -#define w3 0x20(%r8) -#define w4 0x24(%r8) -#define y0 0x28(%r8) -#define y1 0x2c(%r8) -#define y2 0x30(%r8) -#define y3 0x34(%r8) -#define y4 0x38(%r8) -#define m %rsi -#define hc0 %ymm0 -#define hc1 %ymm1 -#define hc2 %ymm2 -#define hc3 %ymm3 -#define hc4 %ymm4 -#define hc0x %xmm0 -#define hc1x %xmm1 -#define hc2x %xmm2 -#define hc3x %xmm3 -#define hc4x %xmm4 -#define t1 %ymm5 -#define t2 %ymm6 -#define t1x %xmm5 -#define t2x %xmm6 -#define ruwy0 %ymm7 -#define ruwy1 %ymm8 -#define ruwy2 %ymm9 -#define ruwy3 %ymm10 -#define ruwy4 %ymm11 -#define ruwy0x %xmm7 -#define ruwy1x %xmm8 -#define ruwy2x %xmm9 -#define ruwy3x %xmm10 -#define ruwy4x %xmm11 -#define svxz1 %ymm12 -#define svxz2 %ymm13 -#define svxz3 %ymm14 -#define svxz4 %ymm15 -#define d0 %r9 -#define d1 %r10 -#define d2 %r11 -#define d3 %r12 -#define d4 %r13 - -ENTRY(poly1305_4block_avx2) - # %rdi: Accumulator h[5] - # %rsi: 64 byte input block m - # %rdx: Poly1305 key r[5] - # %rcx: Quadblock count - # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5], - - # This four-block variant uses loop unrolled block processing. It - # requires 4 Poly1305 keys: r, r^2, r^3 and r^4: - # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r - - vzeroupper - push %rbx - push %r12 - push %r13 - - # combine r0,u0,w0,y0 - vmovd y0,ruwy0x - vmovd w0,t1x - vpunpcklqdq t1,ruwy0,ruwy0 - vmovd u0,t1x - vmovd r0,t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,ruwy0,ruwy0 - - # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5 - vmovd y1,ruwy1x - vmovd w1,t1x - vpunpcklqdq t1,ruwy1,ruwy1 - vmovd u1,t1x - vmovd r1,t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,ruwy1,ruwy1 - vpslld $2,ruwy1,svxz1 - vpaddd ruwy1,svxz1,svxz1 - - # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5 - vmovd y2,ruwy2x - vmovd w2,t1x - vpunpcklqdq t1,ruwy2,ruwy2 - vmovd u2,t1x - vmovd r2,t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,ruwy2,ruwy2 - vpslld $2,ruwy2,svxz2 - vpaddd ruwy2,svxz2,svxz2 - - # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5 - vmovd y3,ruwy3x - vmovd w3,t1x - vpunpcklqdq t1,ruwy3,ruwy3 - vmovd u3,t1x - vmovd r3,t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,ruwy3,ruwy3 - vpslld $2,ruwy3,svxz3 - vpaddd ruwy3,svxz3,svxz3 - - # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5 - vmovd y4,ruwy4x - vmovd w4,t1x - vpunpcklqdq t1,ruwy4,ruwy4 - vmovd u4,t1x - vmovd r4,t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,ruwy4,ruwy4 - vpslld $2,ruwy4,svxz4 - vpaddd ruwy4,svxz4,svxz4 - -.Ldoblock4: - # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff, - # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0] - vmovd 0x00(m),hc0x - vmovd 0x10(m),t1x - vpunpcklqdq t1,hc0,hc0 - vmovd 0x20(m),t1x - vmovd 0x30(m),t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,hc0,hc0 - vpand ANMASK(%rip),hc0,hc0 - vmovd h0,t1x - vpaddd t1,hc0,hc0 - # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff, - # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1] - vmovd 0x03(m),hc1x - vmovd 0x13(m),t1x - vpunpcklqdq t1,hc1,hc1 - vmovd 0x23(m),t1x - vmovd 0x33(m),t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,hc1,hc1 - vpsrld $2,hc1,hc1 - vpand ANMASK(%rip),hc1,hc1 - vmovd h1,t1x - vpaddd t1,hc1,hc1 - # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff, - # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2] - vmovd 0x06(m),hc2x - vmovd 0x16(m),t1x - vpunpcklqdq t1,hc2,hc2 - vmovd 0x26(m),t1x - vmovd 0x36(m),t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,hc2,hc2 - vpsrld $4,hc2,hc2 - vpand ANMASK(%rip),hc2,hc2 - vmovd h2,t1x - vpaddd t1,hc2,hc2 - # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff, - # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3] - vmovd 0x09(m),hc3x - vmovd 0x19(m),t1x - vpunpcklqdq t1,hc3,hc3 - vmovd 0x29(m),t1x - vmovd 0x39(m),t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,hc3,hc3 - vpsrld $6,hc3,hc3 - vpand ANMASK(%rip),hc3,hc3 - vmovd h3,t1x - vpaddd t1,hc3,hc3 - # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24), - # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4] - vmovd 0x0c(m),hc4x - vmovd 0x1c(m),t1x - vpunpcklqdq t1,hc4,hc4 - vmovd 0x2c(m),t1x - vmovd 0x3c(m),t2x - vpunpcklqdq t2,t1,t1 - vperm2i128 $0x20,t1,hc4,hc4 - vpsrld $8,hc4,hc4 - vpor ORMASK(%rip),hc4,hc4 - vmovd h4,t1x - vpaddd t1,hc4,hc4 - - # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ] - vpmuludq hc0,ruwy0,t1 - # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ] - vpmuludq hc1,svxz4,t2 - vpaddq t2,t1,t1 - # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ] - vpmuludq hc2,svxz3,t2 - vpaddq t2,t1,t1 - # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ] - vpmuludq hc3,svxz2,t2 - vpaddq t2,t1,t1 - # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ] - vpmuludq hc4,svxz1,t2 - vpaddq t2,t1,t1 - # d0 = t1[0] + t1[1] + t[2] + t[3] - vpermq $0xee,t1,t2 - vpaddq t2,t1,t1 - vpsrldq $8,t1,t2 - vpaddq t2,t1,t1 - vmovq t1x,d0 - - # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ] - vpmuludq hc0,ruwy1,t1 - # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ] - vpmuludq hc1,ruwy0,t2 - vpaddq t2,t1,t1 - # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ] - vpmuludq hc2,svxz4,t2 - vpaddq t2,t1,t1 - # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ] - vpmuludq hc3,svxz3,t2 - vpaddq t2,t1,t1 - # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ] - vpmuludq hc4,svxz2,t2 - vpaddq t2,t1,t1 - # d1 = t1[0] + t1[1] + t1[3] + t1[4] - vpermq $0xee,t1,t2 - vpaddq t2,t1,t1 - vpsrldq $8,t1,t2 - vpaddq t2,t1,t1 - vmovq t1x,d1 - - # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ] - vpmuludq hc0,ruwy2,t1 - # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ] - vpmuludq hc1,ruwy1,t2 - vpaddq t2,t1,t1 - # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ] - vpmuludq hc2,ruwy0,t2 - vpaddq t2,t1,t1 - # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ] - vpmuludq hc3,svxz4,t2 - vpaddq t2,t1,t1 - # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ] - vpmuludq hc4,svxz3,t2 - vpaddq t2,t1,t1 - # d2 = t1[0] + t1[1] + t1[2] + t1[3] - vpermq $0xee,t1,t2 - vpaddq t2,t1,t1 - vpsrldq $8,t1,t2 - vpaddq t2,t1,t1 - vmovq t1x,d2 - - # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ] - vpmuludq hc0,ruwy3,t1 - # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ] - vpmuludq hc1,ruwy2,t2 - vpaddq t2,t1,t1 - # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ] - vpmuludq hc2,ruwy1,t2 - vpaddq t2,t1,t1 - # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ] - vpmuludq hc3,ruwy0,t2 - vpaddq t2,t1,t1 - # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ] - vpmuludq hc4,svxz4,t2 - vpaddq t2,t1,t1 - # d3 = t1[0] + t1[1] + t1[2] + t1[3] - vpermq $0xee,t1,t2 - vpaddq t2,t1,t1 - vpsrldq $8,t1,t2 - vpaddq t2,t1,t1 - vmovq t1x,d3 - - # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ] - vpmuludq hc0,ruwy4,t1 - # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ] - vpmuludq hc1,ruwy3,t2 - vpaddq t2,t1,t1 - # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ] - vpmuludq hc2,ruwy2,t2 - vpaddq t2,t1,t1 - # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ] - vpmuludq hc3,ruwy1,t2 - vpaddq t2,t1,t1 - # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ] - vpmuludq hc4,ruwy0,t2 - vpaddq t2,t1,t1 - # d4 = t1[0] + t1[1] + t1[2] + t1[3] - vpermq $0xee,t1,t2 - vpaddq t2,t1,t1 - vpsrldq $8,t1,t2 - vpaddq t2,t1,t1 - vmovq t1x,d4 - - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small - # amount. Careful: we must not assume the carry bits 'd0 >> 26', - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit - # integers. It's true in a single-block implementation, but not here. - - # d1 += d0 >> 26 - mov d0,%rax - shr $26,%rax - add %rax,d1 - # h0 = d0 & 0x3ffffff - mov d0,%rbx - and $0x3ffffff,%ebx - - # d2 += d1 >> 26 - mov d1,%rax - shr $26,%rax - add %rax,d2 - # h1 = d1 & 0x3ffffff - mov d1,%rax - and $0x3ffffff,%eax - mov %eax,h1 - - # d3 += d2 >> 26 - mov d2,%rax - shr $26,%rax - add %rax,d3 - # h2 = d2 & 0x3ffffff - mov d2,%rax - and $0x3ffffff,%eax - mov %eax,h2 - - # d4 += d3 >> 26 - mov d3,%rax - shr $26,%rax - add %rax,d4 - # h3 = d3 & 0x3ffffff - mov d3,%rax - and $0x3ffffff,%eax - mov %eax,h3 - - # h0 += (d4 >> 26) * 5 - mov d4,%rax - shr $26,%rax - lea (%rax,%rax,4),%rax - add %rax,%rbx - # h4 = d4 & 0x3ffffff - mov d4,%rax - and $0x3ffffff,%eax - mov %eax,h4 - - # h1 += h0 >> 26 - mov %rbx,%rax - shr $26,%rax - add %eax,h1 - # h0 = h0 & 0x3ffffff - andl $0x3ffffff,%ebx - mov %ebx,h0 - - add $0x40,m - dec %rcx - jnz .Ldoblock4 - - vzeroupper - pop %r13 - pop %r12 - pop %rbx - ret -ENDPROC(poly1305_4block_avx2) diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S deleted file mode 100644 index 5578f846e622..000000000000 --- a/arch/x86/crypto/poly1305-sse2-x86_64.S +++ /dev/null @@ -1,590 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions - * - * Copyright (C) 2015 Martin Willi - */ - -#include <linux/linkage.h> - -.section .rodata.cst16.ANMASK, "aM", @progbits, 16 -.align 16 -ANMASK: .octa 0x0000000003ffffff0000000003ffffff - -.section .rodata.cst16.ORMASK, "aM", @progbits, 16 -.align 16 -ORMASK: .octa 0x00000000010000000000000001000000 - -.text - -#define h0 0x00(%rdi) -#define h1 0x04(%rdi) -#define h2 0x08(%rdi) -#define h3 0x0c(%rdi) -#define h4 0x10(%rdi) -#define r0 0x00(%rdx) -#define r1 0x04(%rdx) -#define r2 0x08(%rdx) -#define r3 0x0c(%rdx) -#define r4 0x10(%rdx) -#define s1 0x00(%rsp) -#define s2 0x04(%rsp) -#define s3 0x08(%rsp) -#define s4 0x0c(%rsp) -#define m %rsi -#define h01 %xmm0 -#define h23 %xmm1 -#define h44 %xmm2 -#define t1 %xmm3 -#define t2 %xmm4 -#define t3 %xmm5 -#define t4 %xmm6 -#define mask %xmm7 -#define d0 %r8 -#define d1 %r9 -#define d2 %r10 -#define d3 %r11 -#define d4 %r12 - -ENTRY(poly1305_block_sse2) - # %rdi: Accumulator h[5] - # %rsi: 16 byte input block m - # %rdx: Poly1305 key r[5] - # %rcx: Block count - - # This single block variant tries to improve performance by doing two - # multiplications in parallel using SSE instructions. There is quite - # some quardword packing involved, hence the speedup is marginal. - - push %rbx - push %r12 - sub $0x10,%rsp - - # s1..s4 = r1..r4 * 5 - mov r1,%eax - lea (%eax,%eax,4),%eax - mov %eax,s1 - mov r2,%eax - lea (%eax,%eax,4),%eax - mov %eax,s2 - mov r3,%eax - lea (%eax,%eax,4),%eax - mov %eax,s3 - mov r4,%eax - lea (%eax,%eax,4),%eax - mov %eax,s4 - - movdqa ANMASK(%rip),mask - -.Ldoblock: - # h01 = [0, h1, 0, h0] - # h23 = [0, h3, 0, h2] - # h44 = [0, h4, 0, h4] - movd h0,h01 - movd h1,t1 - movd h2,h23 - movd h3,t2 - movd h4,h44 - punpcklqdq t1,h01 - punpcklqdq t2,h23 - punpcklqdq h44,h44 - - # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ] - movd 0x00(m),t1 - movd 0x03(m),t2 - psrld $2,t2 - punpcklqdq t2,t1 - pand mask,t1 - paddd t1,h01 - # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ] - movd 0x06(m),t1 - movd 0x09(m),t2 - psrld $4,t1 - psrld $6,t2 - punpcklqdq t2,t1 - pand mask,t1 - paddd t1,h23 - # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ] - mov 0x0c(m),%eax - shr $8,%eax - or $0x01000000,%eax - movd %eax,t1 - pshufd $0xc4,t1,t1 - paddd t1,h44 - - # t1[0] = h0 * r0 + h2 * s3 - # t1[1] = h1 * s4 + h3 * s2 - movd r0,t1 - movd s4,t2 - punpcklqdq t2,t1 - pmuludq h01,t1 - movd s3,t2 - movd s2,t3 - punpcklqdq t3,t2 - pmuludq h23,t2 - paddq t2,t1 - # t2[0] = h0 * r1 + h2 * s4 - # t2[1] = h1 * r0 + h3 * s3 - movd r1,t2 - movd r0,t3 - punpcklqdq t3,t2 - pmuludq h01,t2 - movd s4,t3 - movd s3,t4 - punpcklqdq t4,t3 - pmuludq h23,t3 - paddq t3,t2 - # t3[0] = h4 * s1 - # t3[1] = h4 * s2 - movd s1,t3 - movd s2,t4 - punpcklqdq t4,t3 - pmuludq h44,t3 - # d0 = t1[0] + t1[1] + t3[0] - # d1 = t2[0] + t2[1] + t3[1] - movdqa t1,t4 - punpcklqdq t2,t4 - punpckhqdq t2,t1 - paddq t4,t1 - paddq t3,t1 - movq t1,d0 - psrldq $8,t1 - movq t1,d1 - - # t1[0] = h0 * r2 + h2 * r0 - # t1[1] = h1 * r1 + h3 * s4 - movd r2,t1 - movd r1,t2 - punpcklqdq t2,t1 - pmuludq h01,t1 - movd r0,t2 - movd s4,t3 - punpcklqdq t3,t2 - pmuludq h23,t2 - paddq t2,t1 - # t2[0] = h0 * r3 + h2 * r1 - # t2[1] = h1 * r2 + h3 * r0 - movd r3,t2 - movd r2,t3 - punpcklqdq t3,t2 - pmuludq h01,t2 - movd r1,t3 - movd r0,t4 - punpcklqdq t4,t3 - pmuludq h23,t3 - paddq t3,t2 - # t3[0] = h4 * s3 - # t3[1] = h4 * s4 - movd s3,t3 - movd s4,t4 - punpcklqdq t4,t3 - pmuludq h44,t3 - # d2 = t1[0] + t1[1] + t3[0] - # d3 = t2[0] + t2[1] + t3[1] - movdqa t1,t4 - punpcklqdq t2,t4 - punpckhqdq t2,t1 - paddq t4,t1 - paddq t3,t1 - movq t1,d2 - psrldq $8,t1 - movq t1,d3 - - # t1[0] = h0 * r4 + h2 * r2 - # t1[1] = h1 * r3 + h3 * r1 - movd r4,t1 - movd r3,t2 - punpcklqdq t2,t1 - pmuludq h01,t1 - movd r2,t2 - movd r1,t3 - punpcklqdq t3,t2 - pmuludq h23,t2 - paddq t2,t1 - # t3[0] = h4 * r0 - movd r0,t3 - pmuludq h44,t3 - # d4 = t1[0] + t1[1] + t3[0] - movdqa t1,t4 - psrldq $8,t4 - paddq t4,t1 - paddq t3,t1 - movq t1,d4 - - # d1 += d0 >> 26 - mov d0,%rax - shr $26,%rax - add %rax,d1 - # h0 = d0 & 0x3ffffff - mov d0,%rbx - and $0x3ffffff,%ebx - - # d2 += d1 >> 26 - mov d1,%rax - shr $26,%rax - add %rax,d2 - # h1 = d1 & 0x3ffffff - mov d1,%rax - and $0x3ffffff,%eax - mov %eax,h1 - - # d3 += d2 >> 26 - mov d2,%rax - shr $26,%rax - add %rax,d3 - # h2 = d2 & 0x3ffffff - mov d2,%rax - and $0x3ffffff,%eax - mov %eax,h2 - - # d4 += d3 >> 26 - mov d3,%rax - shr $26,%rax - add %rax,d4 - # h3 = d3 & 0x3ffffff - mov d3,%rax - and $0x3ffffff,%eax - mov %eax,h3 - - # h0 += (d4 >> 26) * 5 - mov d4,%rax - shr $26,%rax - lea (%rax,%rax,4),%rax - add %rax,%rbx - # h4 = d4 & 0x3ffffff - mov d4,%rax - and $0x3ffffff,%eax - mov %eax,h4 - - # h1 += h0 >> 26 - mov %rbx,%rax - shr $26,%rax - add %eax,h1 - # h0 = h0 & 0x3ffffff - andl $0x3ffffff,%ebx - mov %ebx,h0 - - add $0x10,m - dec %rcx - jnz .Ldoblock - - # Zeroing of key material - mov %rcx,0x00(%rsp) - mov %rcx,0x08(%rsp) - - add $0x10,%rsp - pop %r12 - pop %rbx - ret -ENDPROC(poly1305_block_sse2) - - -#define u0 0x00(%r8) -#define u1 0x04(%r8) -#define u2 0x08(%r8) -#define u3 0x0c(%r8) -#define u4 0x10(%r8) -#define hc0 %xmm0 -#define hc1 %xmm1 -#define hc2 %xmm2 -#define hc3 %xmm5 -#define hc4 %xmm6 -#define ru0 %xmm7 -#define ru1 %xmm8 -#define ru2 %xmm9 -#define ru3 %xmm10 -#define ru4 %xmm11 -#define sv1 %xmm12 -#define sv2 %xmm13 -#define sv3 %xmm14 -#define sv4 %xmm15 -#undef d0 -#define d0 %r13 - -ENTRY(poly1305_2block_sse2) - # %rdi: Accumulator h[5] - # %rsi: 16 byte input block m - # %rdx: Poly1305 key r[5] - # %rcx: Doubleblock count - # %r8: Poly1305 derived key r^2 u[5] - - # This two-block variant further improves performance by using loop - # unrolled block processing. This is more straight forward and does - # less byte shuffling, but requires a second Poly1305 key r^2: - # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r - - push %rbx - push %r12 - push %r13 - - # combine r0,u0 - movd u0,ru0 - movd r0,t1 - punpcklqdq t1,ru0 - - # combine r1,u1 and s1=r1*5,v1=u1*5 - movd u1,ru1 - movd r1,t1 - punpcklqdq t1,ru1 - movdqa ru1,sv1 - pslld $2,sv1 - paddd ru1,sv1 - - # combine r2,u2 and s2=r2*5,v2=u2*5 - movd u2,ru2 - movd r2,t1 - punpcklqdq t1,ru2 - movdqa ru2,sv2 - pslld $2,sv2 - paddd ru2,sv2 - - # combine r3,u3 and s3=r3*5,v3=u3*5 - movd u3,ru3 - movd r3,t1 - punpcklqdq t1,ru3 - movdqa ru3,sv3 - pslld $2,sv3 - paddd ru3,sv3 - - # combine r4,u4 and s4=r4*5,v4=u4*5 - movd u4,ru4 - movd r4,t1 - punpcklqdq t1,ru4 - movdqa ru4,sv4 - pslld $2,sv4 - paddd ru4,sv4 - -.Ldoblock2: - # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ] - movd 0x00(m),hc0 - movd 0x10(m),t1 - punpcklqdq t1,hc0 - pand ANMASK(%rip),hc0 - movd h0,t1 - paddd t1,hc0 - # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ] - movd 0x03(m),hc1 - movd 0x13(m),t1 - punpcklqdq t1,hc1 - psrld $2,hc1 - pand ANMASK(%rip),hc1 - movd h1,t1 - paddd t1,hc1 - # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ] - movd 0x06(m),hc2 - movd 0x16(m),t1 - punpcklqdq t1,hc2 - psrld $4,hc2 - pand ANMASK(%rip),hc2 - movd h2,t1 - paddd t1,hc2 - # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ] - movd 0x09(m),hc3 - movd 0x19(m),t1 - punpcklqdq t1,hc3 - psrld $6,hc3 - pand ANMASK(%rip),hc3 - movd h3,t1 - paddd t1,hc3 - # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ] - movd 0x0c(m),hc4 - movd 0x1c(m),t1 - punpcklqdq t1,hc4 - psrld $8,hc4 - por ORMASK(%rip),hc4 - movd h4,t1 - paddd t1,hc4 - - # t1 = [ hc0[1] * r0, hc0[0] * u0 ] - movdqa ru0,t1 - pmuludq hc0,t1 - # t1 += [ hc1[1] * s4, hc1[0] * v4 ] - movdqa sv4,t2 - pmuludq hc1,t2 - paddq t2,t1 - # t1 += [ hc2[1] * s3, hc2[0] * v3 ] - movdqa sv3,t2 - pmuludq hc2,t2 - paddq t2,t1 - # t1 += [ hc3[1] * s2, hc3[0] * v2 ] - movdqa sv2,t2 - pmuludq hc3,t2 - paddq t2,t1 - # t1 += [ hc4[1] * s1, hc4[0] * v1 ] - movdqa sv1,t2 - pmuludq hc4,t2 - paddq t2,t1 - # d0 = t1[0] + t1[1] - movdqa t1,t2 - psrldq $8,t2 - paddq t2,t1 - movq t1,d0 - - # t1 = [ hc0[1] * r1, hc0[0] * u1 ] - movdqa ru1,t1 - pmuludq hc0,t1 - # t1 += [ hc1[1] * r0, hc1[0] * u0 ] - movdqa ru0,t2 - pmuludq hc1,t2 - paddq t2,t1 - # t1 += [ hc2[1] * s4, hc2[0] * v4 ] - movdqa sv4,t2 - pmuludq hc2,t2 - paddq t2,t1 - # t1 += [ hc3[1] * s3, hc3[0] * v3 ] - movdqa sv3,t2 - pmuludq hc3,t2 - paddq t2,t1 - # t1 += [ hc4[1] * s2, hc4[0] * v2 ] - movdqa sv2,t2 - pmuludq hc4,t2 - paddq t2,t1 - # d1 = t1[0] + t1[1] - movdqa t1,t2 - psrldq $8,t2 - paddq t2,t1 - movq t1,d1 - - # t1 = [ hc0[1] * r2, hc0[0] * u2 ] - movdqa ru2,t1 - pmuludq hc0,t1 - # t1 += [ hc1[1] * r1, hc1[0] * u1 ] - movdqa ru1,t2 - pmuludq hc1,t2 - paddq t2,t1 - # t1 += [ hc2[1] * r0, hc2[0] * u0 ] - movdqa ru0,t2 - pmuludq hc2,t2 - paddq t2,t1 - # t1 += [ hc3[1] * s4, hc3[0] * v4 ] - movdqa sv4,t2 - pmuludq hc3,t2 - paddq t2,t1 - # t1 += [ hc4[1] * s3, hc4[0] * v3 ] - movdqa sv3,t2 - pmuludq hc4,t2 - paddq t2,t1 - # d2 = t1[0] + t1[1] - movdqa t1,t2 - psrldq $8,t2 - paddq t2,t1 - movq t1,d2 - - # t1 = [ hc0[1] * r3, hc0[0] * u3 ] - movdqa ru3,t1 - pmuludq hc0,t1 - # t1 += [ hc1[1] * r2, hc1[0] * u2 ] - movdqa ru2,t2 - pmuludq hc1,t2 - paddq t2,t1 - # t1 += [ hc2[1] * r1, hc2[0] * u1 ] - movdqa ru1,t2 - pmuludq hc2,t2 - paddq t2,t1 - # t1 += [ hc3[1] * r0, hc3[0] * u0 ] - movdqa ru0,t2 - pmuludq hc3,t2 - paddq t2,t1 - # t1 += [ hc4[1] * s4, hc4[0] * v4 ] - movdqa sv4,t2 - pmuludq hc4,t2 - paddq t2,t1 - # d3 = t1[0] + t1[1] - movdqa t1,t2 - psrldq $8,t2 - paddq t2,t1 - movq t1,d3 - - # t1 = [ hc0[1] * r4, hc0[0] * u4 ] - movdqa ru4,t1 - pmuludq hc0,t1 - # t1 += [ hc1[1] * r3, hc1[0] * u3 ] - movdqa ru3,t2 - pmuludq hc1,t2 - paddq t2,t1 - # t1 += [ hc2[1] * r2, hc2[0] * u2 ] - movdqa ru2,t2 - pmuludq hc2,t2 - paddq t2,t1 - # t1 += [ hc3[1] * r1, hc3[0] * u1 ] - movdqa ru1,t2 - pmuludq hc3,t2 - paddq t2,t1 - # t1 += [ hc4[1] * r0, hc4[0] * u0 ] - movdqa ru0,t2 - pmuludq hc4,t2 - paddq t2,t1 - # d4 = t1[0] + t1[1] - movdqa t1,t2 - psrldq $8,t2 - paddq t2,t1 - movq t1,d4 - - # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> - # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small - # amount. Careful: we must not assume the carry bits 'd0 >> 26', - # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit - # integers. It's true in a single-block implementation, but not here. - - # d1 += d0 >> 26 - mov d0,%rax - shr $26,%rax - add %rax,d1 - # h0 = d0 & 0x3ffffff - mov d0,%rbx - and $0x3ffffff,%ebx - - # d2 += d1 >> 26 - mov d1,%rax - shr $26,%rax - add %rax,d2 - # h1 = d1 & 0x3ffffff - mov d1,%rax - and $0x3ffffff,%eax - mov %eax,h1 - - # d3 += d2 >> 26 - mov d2,%rax - shr $26,%rax - add %rax,d3 - # h2 = d2 & 0x3ffffff - mov d2,%rax - and $0x3ffffff,%eax - mov %eax,h2 - - # d4 += d3 >> 26 - mov d3,%rax - shr $26,%rax - add %rax,d4 - # h3 = d3 & 0x3ffffff - mov d3,%rax - and $0x3ffffff,%eax - mov %eax,h3 - - # h0 += (d4 >> 26) * 5 - mov d4,%rax - shr $26,%rax - lea (%rax,%rax,4),%rax - add %rax,%rbx - # h4 = d4 & 0x3ffffff - mov d4,%rax - and $0x3ffffff,%eax - mov %eax,h4 - - # h1 += h0 >> 26 - mov %rbx,%rax - shr $26,%rax - add %eax,h1 - # h0 = h0 & 0x3ffffff - andl $0x3ffffff,%ebx - mov %ebx,h0 - - add $0x20,m - dec %rcx - jnz .Ldoblock2 - - pop %r13 - pop %r12 - pop %rbx - ret -ENDPROC(poly1305_2block_sse2) diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl new file mode 100644 index 000000000000..7a6b5380a46f --- /dev/null +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl @@ -0,0 +1,4265 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# +# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. +# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. +# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. +# +# This code is taken from the OpenSSL project but the author, Andy Polyakov, +# has relicensed it under the licenses specified in the SPDX header above. +# The original headers, including the original license headers, are +# included below for completeness. +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements Poly1305 hash for x86_64. +# +# March 2015 +# +# Initial release. +# +# December 2016 +# +# Add AVX512F+VL+BW code path. +# +# November 2017 +# +# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be +# executed even on Knights Landing. Trigger for modification was +# observation that AVX512 code paths can negatively affect overall +# Skylake-X system performance. Since we are likely to suppress +# AVX512F capability flag [at least on Skylake-X], conversion serves +# as kind of "investment protection". Note that next *lake processor, +# Cannonlake, has AVX512IFMA code path to execute... +# +# Numbers are cycles per processed byte with poly1305_blocks alone, +# measured with rdtsc at fixed clock frequency. +# +# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 +# P4 4.46/+120% - +# Core 2 2.41/+90% - +# Westmere 1.88/+120% - +# Sandy Bridge 1.39/+140% 1.10 +# Haswell 1.14/+175% 1.11 0.65 +# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] +# Silvermont 2.83/+95% - +# Knights L 3.60/? 1.65 1.10 0.41(***) +# Goldmont 1.70/+180% - +# VIA Nano 1.82/+150% - +# Sledgehammer 1.38/+160% - +# Bulldozer 2.30/+130% 0.97 +# Ryzen 1.15/+200% 1.08 1.18 +# +# (*) improvement coefficients relative to clang are more modest and +# are ~50% on most processors, in both cases we are comparing to +# __int128 code; +# (**) SSE2 implementation was attempted, but among non-AVX processors +# it was faster than integer-only code only on older Intel P4 and +# Core processors, 50-30%, less newer processor is, but slower on +# contemporary ones, for example almost 2x slower on Atom, and as +# former are naturally disappearing, SSE2 is deemed unnecessary; +# (***) strangely enough performance seems to vary from core to core, +# listed result is best case; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); +$kernel=0; $kernel=1 if (!$flavour && !$output); + +if (!$kernel) { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or + die "can't locate x86_64-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; + + if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); + } + + if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { + $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); + $avx += 1 if ($1==2.11 && $2>=8); + } + + if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); + } + + if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); + } +} else { + $avx = 4; # The kernel uses ifdefs for this. +} + +sub declare_function() { + my ($name, $align, $nargs) = @_; + if($kernel) { + $code .= ".align $align\n"; + $code .= "SYM_FUNC_START($name)\n"; + $code .= ".L$name:\n"; + } else { + $code .= ".globl $name\n"; + $code .= ".type $name,\@function,$nargs\n"; + $code .= ".align $align\n"; + $code .= "$name:\n"; + } +} + +sub end_function() { + my ($name) = @_; + if($kernel) { + $code .= "SYM_FUNC_END($name)\n"; + } else { + $code .= ".size $name,.-$name\n"; + } +} + +$code.=<<___ if $kernel; +#include <linux/linkage.h> +___ + +if ($avx) { +$code.=<<___ if $kernel; +.section .rodata +___ +$code.=<<___; +.align 64 +.Lconst: +.Lmask24: +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 +.L129: +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 +.Lmask26: +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 +.Lpermd_avx2: +.long 2,2,2,3,2,0,2,1 +.Lpermd_avx512: +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 + +.L2_44_inp_permd: +.long 0,1,1,2,2,3,7,7 +.L2_44_inp_shift: +.quad 0,12,24,64 +.L2_44_mask: +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff +.L2_44_shift_rgt: +.quad 44,44,42,64 +.L2_44_shift_lft: +.quad 8,8,10,64 + +.align 64 +.Lx_mask44: +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.Lx_mask42: +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +___ +} +$code.=<<___ if (!$kernel); +.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +.align 16 +___ + +my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); +my ($mac,$nonce)=($inp,$len); # *_emit arguments +my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13"); +my ($h0,$h1,$h2)=("%r14","%rbx","%r10"); + +sub poly1305_iteration { +# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 +# output: $h0-$h2 *= $r0-$r1 +$code.=<<___; + mulq $h0 # h0*r1 + mov %rax,$d2 + mov $r0,%rax + mov %rdx,$d3 + + mulq $h0 # h0*r0 + mov %rax,$h0 # future $h0 + mov $r0,%rax + mov %rdx,$d1 + + mulq $h1 # h1*r0 + add %rax,$d2 + mov $s1,%rax + adc %rdx,$d3 + + mulq $h1 # h1*s1 + mov $h2,$h1 # borrow $h1 + add %rax,$h0 + adc %rdx,$d1 + + imulq $s1,$h1 # h2*s1 + add $h1,$d2 + mov $d1,$h1 + adc \$0,$d3 + + imulq $r0,$h2 # h2*r0 + add $d2,$h1 + mov \$-4,%rax # mask value + adc $h2,$d3 + + and $d3,%rax # last reduction step + mov $d3,$h2 + shr \$2,$d3 + and \$3,$h2 + add $d3,%rax + add %rax,$h0 + adc \$0,$h1 + adc \$0,$h2 +___ +} + +######################################################################## +# Layout of opaque area is following. +# +# unsigned __int64 h[3]; # current hash value base 2^64 +# unsigned __int64 r[2]; # key value base 2^64 + +$code.=<<___; +.text +___ +$code.=<<___ if (!$kernel); +.extern OPENSSL_ia32cap_P + +.globl poly1305_init_x86_64 +.hidden poly1305_init_x86_64 +.globl poly1305_blocks_x86_64 +.hidden poly1305_blocks_x86_64 +.globl poly1305_emit_x86_64 +.hidden poly1305_emit_x86_64 +___ +&declare_function("poly1305_init_x86_64", 32, 3); +$code.=<<___; + xor %rax,%rax + mov %rax,0($ctx) # initialize hash value + mov %rax,8($ctx) + mov %rax,16($ctx) + + cmp \$0,$inp + je .Lno_key +___ +$code.=<<___ if (!$kernel); + lea poly1305_blocks_x86_64(%rip),%r10 + lea poly1305_emit_x86_64(%rip),%r11 +___ +$code.=<<___ if (!$kernel && $avx); + mov OPENSSL_ia32cap_P+4(%rip),%r9 + lea poly1305_blocks_avx(%rip),%rax + lea poly1305_emit_avx(%rip),%rcx + bt \$`60-32`,%r9 # AVX? + cmovc %rax,%r10 + cmovc %rcx,%r11 +___ +$code.=<<___ if (!$kernel && $avx>1); + lea poly1305_blocks_avx2(%rip),%rax + bt \$`5+32`,%r9 # AVX2? + cmovc %rax,%r10 +___ +$code.=<<___ if (!$kernel && $avx>3); + mov \$`(1<<31|1<<21|1<<16)`,%rax + shr \$32,%r9 + and %rax,%r9 + cmp %rax,%r9 + je .Linit_base2_44 +___ +$code.=<<___; + mov \$0x0ffffffc0fffffff,%rax + mov \$0x0ffffffc0ffffffc,%rcx + and 0($inp),%rax + and 8($inp),%rcx + mov %rax,24($ctx) + mov %rcx,32($ctx) +___ +$code.=<<___ if (!$kernel && $flavour !~ /elf32/); + mov %r10,0(%rdx) + mov %r11,8(%rdx) +___ +$code.=<<___ if (!$kernel && $flavour =~ /elf32/); + mov %r10d,0(%rdx) + mov %r11d,4(%rdx) +___ +$code.=<<___; + mov \$1,%eax +.Lno_key: + ret +___ +&end_function("poly1305_init_x86_64"); + +&declare_function("poly1305_blocks_x86_64", 32, 4); +$code.=<<___; +.cfi_startproc +.Lblocks: + shr \$4,$len + jz .Lno_data # too short + + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $ctx +.cfi_push $ctx +.Lblocks_body: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + mov 16($ctx),$h2 + + mov $s1,$r1 + shr \$2,$s1 + mov $r1,%rax + add $r1,$s1 # s1 = r1 + (r1 >> 2) + jmp .Loop + +.align 32 +.Loop: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 +___ + + &poly1305_iteration(); + +$code.=<<___; + mov $r1,%rax + dec %r15 # len-=16 + jnz .Loop + + mov 0(%rsp),$ctx +.cfi_restore $ctx + + mov $h0,0($ctx) # store hash value + mov $h1,8($ctx) + mov $h2,16($ctx) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lno_data: +.Lblocks_epilogue: + ret +.cfi_endproc +___ +&end_function("poly1305_blocks_x86_64"); + +&declare_function("poly1305_emit_x86_64", 32, 3); +$code.=<<___; +.Lemit: + mov 0($ctx),%r8 # load hash value + mov 8($ctx),%r9 + mov 16($ctx),%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? + cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + ret +___ +&end_function("poly1305_emit_x86_64"); +if ($avx) { + +if($kernel) { + $code .= "#ifdef CONFIG_AS_AVX\n"; +} + +######################################################################## +# Layout of opaque area is following. +# +# unsigned __int32 h[5]; # current hash value base 2^26 +# unsigned __int32 is_base2_26; +# unsigned __int64 r[2]; # key value base 2^64 +# unsigned __int64 pad; +# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; +# +# where r^n are base 2^26 digits of degrees of multiplier key. There are +# 5 digits, but last four are interleaved with multiples of 5, totalling +# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. + +my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = + map("%xmm$_",(0..15)); + +$code.=<<___; +.type __poly1305_block,\@abi-omnipotent +.align 32 +__poly1305_block: + push $ctx +___ + &poly1305_iteration(); +$code.=<<___; + pop $ctx + ret +.size __poly1305_block,.-__poly1305_block + +.type __poly1305_init_avx,\@abi-omnipotent +.align 32 +__poly1305_init_avx: + push %rbp + mov %rsp,%rbp + mov $r0,$h0 + mov $r1,$h1 + xor $h2,$h2 + + lea 48+64($ctx),$ctx # size optimization + + mov $r1,%rax + call __poly1305_block # r^2 + + mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 + mov \$0x3ffffff,%edx + mov $h0,$d1 + and $h0#d,%eax + mov $r0,$d2 + and $r0#d,%edx + mov %eax,`16*0+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*0+4-64`($ctx) + shr \$26,$d2 + + mov \$0x3ffffff,%eax + mov \$0x3ffffff,%edx + and $d1#d,%eax + and $d2#d,%edx + mov %eax,`16*1+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*1+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*2+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*2+4-64`($ctx) + shr \$26,$d2 + + mov $h1,%rax + mov $r1,%rdx + shl \$12,%rax + shl \$12,%rdx + or $d1,%rax + or $d2,%rdx + and \$0x3ffffff,%eax + and \$0x3ffffff,%edx + mov %eax,`16*3+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*3+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*4+0-64`($ctx) + mov $h1,$d1 + mov %edx,`16*4+4-64`($ctx) + mov $r1,$d2 + + mov \$0x3ffffff,%eax + mov \$0x3ffffff,%edx + shr \$14,$d1 + shr \$14,$d2 + and $d1#d,%eax + and $d2#d,%edx + mov %eax,`16*5+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*5+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*6+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*6+4-64`($ctx) + shr \$26,$d2 + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+0-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d2#d,`16*7+4-64`($ctx) + lea ($d2,$d2,4),$d2 # *5 + mov $d1#d,`16*8+0-64`($ctx) + mov $d2#d,`16*8+4-64`($ctx) + + mov $r1,%rax + call __poly1305_block # r^3 + + mov \$0x3ffffff,%eax # save r^3 base 2^26 + mov $h0,$d1 + and $h0#d,%eax + shr \$26,$d1 + mov %eax,`16*0+12-64`($ctx) + + mov \$0x3ffffff,%edx + and $d1#d,%edx + mov %edx,`16*1+12-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*2+12-64`($ctx) + + mov $h1,%rax + shl \$12,%rax + or $d1,%rax + and \$0x3ffffff,%eax + mov %eax,`16*3+12-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov $h1,$d1 + mov %eax,`16*4+12-64`($ctx) + + mov \$0x3ffffff,%edx + shr \$14,$d1 + and $d1#d,%edx + mov %edx,`16*5+12-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*6+12-64`($ctx) + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+12-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d1#d,`16*8+12-64`($ctx) + + mov $r1,%rax + call __poly1305_block # r^4 + + mov \$0x3ffffff,%eax # save r^4 base 2^26 + mov $h0,$d1 + and $h0#d,%eax + shr \$26,$d1 + mov %eax,`16*0+8-64`($ctx) + + mov \$0x3ffffff,%edx + and $d1#d,%edx + mov %edx,`16*1+8-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*2+8-64`($ctx) + + mov $h1,%rax + shl \$12,%rax + or $d1,%rax + and \$0x3ffffff,%eax + mov %eax,`16*3+8-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov $h1,$d1 + mov %eax,`16*4+8-64`($ctx) + + mov \$0x3ffffff,%edx + shr \$14,$d1 + and $d1#d,%edx + mov %edx,`16*5+8-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*6+8-64`($ctx) + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+8-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d1#d,`16*8+8-64`($ctx) + + lea -48-64($ctx),$ctx # size [de-]optimization + pop %rbp + ret +.size __poly1305_init_avx,.-__poly1305_init_avx +___ + +&declare_function("poly1305_blocks_avx", 32, 4); +$code.=<<___; +.cfi_startproc + mov 20($ctx),%r8d # is_base2_26 + cmp \$128,$len + jae .Lblocks_avx + test %r8d,%r8d + jz .Lblocks + +.Lblocks_avx: + and \$-16,$len + jz .Lno_data_avx + + vzeroupper + + test %r8d,%r8d + jz .Lbase2_64_avx + + test \$31,$len + jz .Leven_avx + + push %rbp +.cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lblocks_avx_body: + + mov $len,%r15 # reassign $len + + mov 0($ctx),$d1 # load hash value + mov 8($ctx),$d2 + mov 16($ctx),$h2#d + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + ################################# base 2^26 -> base 2^64 + mov $d1#d,$h0#d + and \$`-1*(1<<31)`,$d1 + mov $d2,$r1 # borrow $r1 + mov $d2#d,$h1#d + and \$`-1*(1<<31)`,$d2 + + shr \$6,$d1 + shl \$52,$r1 + add $d1,$h0 + shr \$12,$h1 + shr \$18,$d2 + add $r1,$h0 + adc $d2,$h1 + + mov $h2,$d1 + shl \$40,$d1 + shr \$24,$h2 + add $d1,$h1 + adc \$0,$h2 # can be partially reduced... + + mov \$-4,$d2 # ... so reduce + mov $h2,$d1 + and $h2,$d2 + shr \$2,$d1 + and \$3,$h2 + add $d2,$d1 # =*5 + add $d1,$h0 + adc \$0,$h1 + adc \$0,$h2 + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + + call __poly1305_block + + test $padbit,$padbit # if $padbit is zero, + jz .Lstore_base2_64_avx # store hash in base 2^64 format + + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$r0 + mov $h1,$r1 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$r0 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $r0,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$r1 + and \$0x3ffffff,$h1 # h[3] + or $r1,$h2 # h[4] + + sub \$16,%r15 + jz .Lstore_base2_26_avx + + vmovd %rax#d,$H0 + vmovd %rdx#d,$H1 + vmovd $h0#d,$H2 + vmovd $h1#d,$H3 + vmovd $h2#d,$H4 + jmp .Lproceed_avx + +.align 32 +.Lstore_base2_64_avx: + mov $h0,0($ctx) + mov $h1,8($ctx) + mov $h2,16($ctx) # note that is_base2_26 is zeroed + jmp .Ldone_avx + +.align 16 +.Lstore_base2_26_avx: + mov %rax#d,0($ctx) # store hash value base 2^26 + mov %rdx#d,4($ctx) + mov $h0#d,8($ctx) + mov $h1#d,12($ctx) + mov $h2#d,16($ctx) +.align 16 +.Ldone_avx: + pop %r15 +.cfi_restore %r15 + pop %r14 +.cfi_restore %r14 + pop %r13 +.cfi_restore %r13 + pop %r12 +.cfi_restore %r12 + pop %rbx +.cfi_restore %rbx + pop %rbp +.cfi_restore %rbp +.Lno_data_avx: +.Lblocks_avx_epilogue: + ret +.cfi_endproc + +.align 32 +.Lbase2_64_avx: +.cfi_startproc + push %rbp +.cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lbase2_64_avx_body: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + mov 16($ctx),$h2#d + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + test \$31,$len + jz .Linit_avx + + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + +.Linit_avx: + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$d1 + mov $h1,$d2 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$d1 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $d1,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$d2 + and \$0x3ffffff,$h1 # h[3] + or $d2,$h2 # h[4] + + vmovd %rax#d,$H0 + vmovd %rdx#d,$H1 + vmovd $h0#d,$H2 + vmovd $h1#d,$H3 + vmovd $h2#d,$H4 + movl \$1,20($ctx) # set is_base2_26 + + call __poly1305_init_avx + +.Lproceed_avx: + mov %r15,$len + pop %r15 +.cfi_restore %r15 + pop %r14 +.cfi_restore %r14 + pop %r13 +.cfi_restore %r13 + pop %r12 +.cfi_restore %r12 + pop %rbx +.cfi_restore %rbx + pop %rbp +.cfi_restore %rbp +.Lbase2_64_avx_epilogue: + jmp .Ldo_avx +.cfi_endproc + +.align 32 +.Leven_avx: +.cfi_startproc + vmovd 4*0($ctx),$H0 # load hash value + vmovd 4*1($ctx),$H1 + vmovd 4*2($ctx),$H2 + vmovd 4*3($ctx),$H3 + vmovd 4*4($ctx),$H4 + +.Ldo_avx: +___ +$code.=<<___ if (!$win64); + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 + and \$-32,%rsp + sub \$-8,%rsp + lea -0x58(%rsp),%r11 + sub \$0x178,%rsp +___ +$code.=<<___ if ($win64); + lea -0xf8(%rsp),%r11 + sub \$0x218,%rsp + vmovdqa %xmm6,0x50(%r11) + vmovdqa %xmm7,0x60(%r11) + vmovdqa %xmm8,0x70(%r11) + vmovdqa %xmm9,0x80(%r11) + vmovdqa %xmm10,0x90(%r11) + vmovdqa %xmm11,0xa0(%r11) + vmovdqa %xmm12,0xb0(%r11) + vmovdqa %xmm13,0xc0(%r11) + vmovdqa %xmm14,0xd0(%r11) + vmovdqa %xmm15,0xe0(%r11) +.Ldo_avx_body: +___ +$code.=<<___; + sub \$64,$len + lea -32($inp),%rax + cmovc %rax,$inp + + vmovdqu `16*3`($ctx),$D4 # preload r0^2 + lea `16*3+64`($ctx),$ctx # size optimization + lea .Lconst(%rip),%rcx + + ################################################################ + # load input + vmovdqu 16*2($inp),$T0 + vmovdqu 16*3($inp),$T1 + vmovdqa 64(%rcx),$MASK # .Lmask26 + + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + + vpsrlq \$40,$T4,$T4 # 4 + vpsrlq \$26,$T0,$T1 + vpand $MASK,$T0,$T0 # 0 + vpsrlq \$4,$T3,$T2 + vpand $MASK,$T1,$T1 # 1 + vpsrlq \$30,$T3,$T3 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + jbe .Lskip_loop_avx + + # expand and copy pre-calculated table to stack + vmovdqu `16*1-64`($ctx),$D1 + vmovdqu `16*2-64`($ctx),$D2 + vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 + vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 + vmovdqa $D3,-0x90(%r11) + vmovdqa $D0,0x00(%rsp) + vpshufd \$0xEE,$D1,$D4 + vmovdqu `16*3-64`($ctx),$D0 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D4,-0x80(%r11) + vmovdqa $D1,0x10(%rsp) + vpshufd \$0xEE,$D2,$D3 + vmovdqu `16*4-64`($ctx),$D1 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D3,-0x70(%r11) + vmovdqa $D2,0x20(%rsp) + vpshufd \$0xEE,$D0,$D4 + vmovdqu `16*5-64`($ctx),$D2 + vpshufd \$0x44,$D0,$D0 + vmovdqa $D4,-0x60(%r11) + vmovdqa $D0,0x30(%rsp) + vpshufd \$0xEE,$D1,$D3 + vmovdqu `16*6-64`($ctx),$D0 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D3,-0x50(%r11) + vmovdqa $D1,0x40(%rsp) + vpshufd \$0xEE,$D2,$D4 + vmovdqu `16*7-64`($ctx),$D1 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D4,-0x40(%r11) + vmovdqa $D2,0x50(%rsp) + vpshufd \$0xEE,$D0,$D3 + vmovdqu `16*8-64`($ctx),$D2 + vpshufd \$0x44,$D0,$D0 + vmovdqa $D3,-0x30(%r11) + vmovdqa $D0,0x60(%rsp) + vpshufd \$0xEE,$D1,$D4 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D4,-0x20(%r11) + vmovdqa $D1,0x70(%rsp) + vpshufd \$0xEE,$D2,$D3 + vmovdqa 0x00(%rsp),$D4 # preload r0^2 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D3,-0x10(%r11) + vmovdqa $D2,0x80(%rsp) + + jmp .Loop_avx + +.align 32 +.Loop_avx: + ################################################################ + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + # \___________________/ + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + # \___________________/ \____________________/ + # + # Note that we start with inp[2:3]*r^2. This is because it + # doesn't depend on reduction in previous iteration. + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # though note that $Tx and $Hx are "reversed" in this section, + # and $D4 is preloaded with r0^2... + + vpmuludq $T0,$D4,$D0 # d0 = h0*r0 + vpmuludq $T1,$D4,$D1 # d1 = h1*r0 + vmovdqa $H2,0x20(%r11) # offload hash + vpmuludq $T2,$D4,$D2 # d3 = h2*r0 + vmovdqa 0x10(%rsp),$H2 # r1^2 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0 + + vmovdqa $H0,0x00(%r11) # + vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 + vmovdqa $H1,0x10(%r11) # + vpmuludq $T3,$H2,$H1 # h3*r1 + vpaddq $H0,$D0,$D0 # d0 += h4*s1 + vpaddq $H1,$D4,$D4 # d4 += h3*r1 + vmovdqa $H3,0x30(%r11) # + vpmuludq $T2,$H2,$H0 # h2*r1 + vpmuludq $T1,$H2,$H1 # h1*r1 + vpaddq $H0,$D3,$D3 # d3 += h2*r1 + vmovdqa 0x30(%rsp),$H3 # r2^2 + vpaddq $H1,$D2,$D2 # d2 += h1*r1 + vmovdqa $H4,0x40(%r11) # + vpmuludq $T0,$H2,$H2 # h0*r1 + vpmuludq $T2,$H3,$H0 # h2*r2 + vpaddq $H2,$D1,$D1 # d1 += h0*r1 + + vmovdqa 0x40(%rsp),$H4 # s2^2 + vpaddq $H0,$D4,$D4 # d4 += h2*r2 + vpmuludq $T1,$H3,$H1 # h1*r2 + vpmuludq $T0,$H3,$H3 # h0*r2 + vpaddq $H1,$D3,$D3 # d3 += h1*r2 + vmovdqa 0x50(%rsp),$H2 # r3^2 + vpaddq $H3,$D2,$D2 # d2 += h0*r2 + vpmuludq $T4,$H4,$H0 # h4*s2 + vpmuludq $T3,$H4,$H4 # h3*s2 + vpaddq $H0,$D1,$D1 # d1 += h4*s2 + vmovdqa 0x60(%rsp),$H3 # s3^2 + vpaddq $H4,$D0,$D0 # d0 += h3*s2 + + vmovdqa 0x80(%rsp),$H4 # s4^2 + vpmuludq $T1,$H2,$H1 # h1*r3 + vpmuludq $T0,$H2,$H2 # h0*r3 + vpaddq $H1,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $T4,$H3,$H0 # h4*s3 + vpmuludq $T3,$H3,$H1 # h3*s3 + vpaddq $H0,$D2,$D2 # d2 += h4*s3 + vmovdqu 16*0($inp),$H0 # load input + vpaddq $H1,$D1,$D1 # d1 += h3*s3 + vpmuludq $T2,$H3,$H3 # h2*s3 + vpmuludq $T2,$H4,$T2 # h2*s4 + vpaddq $H3,$D0,$D0 # d0 += h2*s3 + + vmovdqu 16*1($inp),$H1 # + vpaddq $T2,$D1,$D1 # d1 += h2*s4 + vpmuludq $T3,$H4,$T3 # h3*s4 + vpmuludq $T4,$H4,$T4 # h4*s4 + vpsrldq \$6,$H0,$H2 # splat input + vpaddq $T3,$D2,$D2 # d2 += h3*s4 + vpaddq $T4,$D3,$D3 # d3 += h4*s4 + vpsrldq \$6,$H1,$H3 # + vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 + vpmuludq $T1,$H4,$T0 # h1*s4 + vpunpckhqdq $H1,$H0,$H4 # 4 + vpaddq $T4,$D4,$D4 # d4 += h0*r4 + vmovdqa -0x90(%r11),$T4 # r0^4 + vpaddq $T0,$D0,$D0 # d0 += h1*s4 + + vpunpcklqdq $H1,$H0,$H0 # 0:1 + vpunpcklqdq $H3,$H2,$H3 # 2:3 + + #vpsrlq \$40,$H4,$H4 # 4 + vpsrldq \$`40/8`,$H4,$H4 # 4 + vpsrlq \$26,$H0,$H1 + vpand $MASK,$H0,$H0 # 0 + vpsrlq \$4,$H3,$H2 + vpand $MASK,$H1,$H1 # 1 + vpand 0(%rcx),$H4,$H4 # .Lmask24 + vpsrlq \$30,$H3,$H3 + vpand $MASK,$H2,$H2 # 2 + vpand $MASK,$H3,$H3 # 3 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always + + vpaddq 0x00(%r11),$H0,$H0 # add hash value + vpaddq 0x10(%r11),$H1,$H1 + vpaddq 0x20(%r11),$H2,$H2 + vpaddq 0x30(%r11),$H3,$H3 + vpaddq 0x40(%r11),$H4,$H4 + + lea 16*2($inp),%rax + lea 16*4($inp),$inp + sub \$64,$len + cmovc %rax,$inp + + ################################################################ + # Now we accumulate (inp[0:1]+hash)*r^4 + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + vpmuludq $H0,$T4,$T0 # h0*r0 + vpmuludq $H1,$T4,$T1 # h1*r0 + vpaddq $T0,$D0,$D0 + vpaddq $T1,$D1,$D1 + vmovdqa -0x80(%r11),$T2 # r1^4 + vpmuludq $H2,$T4,$T0 # h2*r0 + vpmuludq $H3,$T4,$T1 # h3*r0 + vpaddq $T0,$D2,$D2 + vpaddq $T1,$D3,$D3 + vpmuludq $H4,$T4,$T4 # h4*r0 + vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 + vpaddq $T4,$D4,$D4 + + vpaddq $T0,$D0,$D0 # d0 += h4*s1 + vpmuludq $H2,$T2,$T1 # h2*r1 + vpmuludq $H3,$T2,$T0 # h3*r1 + vpaddq $T1,$D3,$D3 # d3 += h2*r1 + vmovdqa -0x60(%r11),$T3 # r2^4 + vpaddq $T0,$D4,$D4 # d4 += h3*r1 + vpmuludq $H1,$T2,$T1 # h1*r1 + vpmuludq $H0,$T2,$T2 # h0*r1 + vpaddq $T1,$D2,$D2 # d2 += h1*r1 + vpaddq $T2,$D1,$D1 # d1 += h0*r1 + + vmovdqa -0x50(%r11),$T4 # s2^4 + vpmuludq $H2,$T3,$T0 # h2*r2 + vpmuludq $H1,$T3,$T1 # h1*r2 + vpaddq $T0,$D4,$D4 # d4 += h2*r2 + vpaddq $T1,$D3,$D3 # d3 += h1*r2 + vmovdqa -0x40(%r11),$T2 # r3^4 + vpmuludq $H0,$T3,$T3 # h0*r2 + vpmuludq $H4,$T4,$T0 # h4*s2 + vpaddq $T3,$D2,$D2 # d2 += h0*r2 + vpaddq $T0,$D1,$D1 # d1 += h4*s2 + vmovdqa -0x30(%r11),$T3 # s3^4 + vpmuludq $H3,$T4,$T4 # h3*s2 + vpmuludq $H1,$T2,$T1 # h1*r3 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + + vmovdqa -0x10(%r11),$T4 # s4^4 + vpaddq $T1,$D4,$D4 # d4 += h1*r3 + vpmuludq $H0,$T2,$T2 # h0*r3 + vpmuludq $H4,$T3,$T0 # h4*s3 + vpaddq $T2,$D3,$D3 # d3 += h0*r3 + vpaddq $T0,$D2,$D2 # d2 += h4*s3 + vmovdqu 16*2($inp),$T0 # load input + vpmuludq $H3,$T3,$T2 # h3*s3 + vpmuludq $H2,$T3,$T3 # h2*s3 + vpaddq $T2,$D1,$D1 # d1 += h3*s3 + vmovdqu 16*3($inp),$T1 # + vpaddq $T3,$D0,$D0 # d0 += h2*s3 + + vpmuludq $H2,$T4,$H2 # h2*s4 + vpmuludq $H3,$T4,$H3 # h3*s4 + vpsrldq \$6,$T0,$T2 # splat input + vpaddq $H2,$D1,$D1 # d1 += h2*s4 + vpmuludq $H4,$T4,$H4 # h4*s4 + vpsrldq \$6,$T1,$T3 # + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 + vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 + vpmuludq $H1,$T4,$H0 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + + #vpsrlq \$40,$T4,$T4 # 4 + vpsrldq \$`40/8`,$T4,$T4 # 4 + vpsrlq \$26,$T0,$T1 + vmovdqa 0x00(%rsp),$D4 # preload r0^2 + vpand $MASK,$T0,$T0 # 0 + vpsrlq \$4,$T3,$T2 + vpand $MASK,$T1,$T1 # 1 + vpand 0(%rcx),$T4,$T4 # .Lmask24 + vpsrlq \$30,$T3,$T3 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + ################################################################ + # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + # and P. Schwabe + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D0 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D0,$H0,$H0 + vpsllq \$2,$D0,$D0 + vpaddq $D0,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + ja .Loop_avx + +.Lskip_loop_avx: + ################################################################ + # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 + add \$32,$len + jnz .Long_tail_avx + + vpaddq $H2,$T2,$T2 + vpaddq $H0,$T0,$T0 + vpaddq $H1,$T1,$T1 + vpaddq $H3,$T3,$T3 + vpaddq $H4,$T4,$T4 + +.Long_tail_avx: + vmovdqa $H2,0x20(%r11) + vmovdqa $H0,0x00(%r11) + vmovdqa $H1,0x10(%r11) + vmovdqa $H3,0x30(%r11) + vmovdqa $H4,0x40(%r11) + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + vpmuludq $T2,$D4,$D2 # d2 = h2*r0 + vpmuludq $T0,$D4,$D0 # d0 = h0*r0 + vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n + vpmuludq $T1,$D4,$D1 # d1 = h1*r0 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0 + + vpmuludq $T3,$H2,$H0 # h3*r1 + vpaddq $H0,$D4,$D4 # d4 += h3*r1 + vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n + vpmuludq $T2,$H2,$H1 # h2*r1 + vpaddq $H1,$D3,$D3 # d3 += h2*r1 + vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n + vpmuludq $T1,$H2,$H0 # h1*r1 + vpaddq $H0,$D2,$D2 # d2 += h1*r1 + vpmuludq $T0,$H2,$H2 # h0*r1 + vpaddq $H2,$D1,$D1 # d1 += h0*r1 + vpmuludq $T4,$H3,$H3 # h4*s1 + vpaddq $H3,$D0,$D0 # d0 += h4*s1 + + vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n + vpmuludq $T2,$H4,$H1 # h2*r2 + vpaddq $H1,$D4,$D4 # d4 += h2*r2 + vpmuludq $T1,$H4,$H0 # h1*r2 + vpaddq $H0,$D3,$D3 # d3 += h1*r2 + vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n + vpmuludq $T0,$H4,$H4 # h0*r2 + vpaddq $H4,$D2,$D2 # d2 += h0*r2 + vpmuludq $T4,$H2,$H1 # h4*s2 + vpaddq $H1,$D1,$D1 # d1 += h4*s2 + vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n + vpmuludq $T3,$H2,$H2 # h3*s2 + vpaddq $H2,$D0,$D0 # d0 += h3*s2 + + vpmuludq $T1,$H3,$H0 # h1*r3 + vpaddq $H0,$D4,$D4 # d4 += h1*r3 + vpmuludq $T0,$H3,$H3 # h0*r3 + vpaddq $H3,$D3,$D3 # d3 += h0*r3 + vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n + vpmuludq $T4,$H4,$H1 # h4*s3 + vpaddq $H1,$D2,$D2 # d2 += h4*s3 + vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n + vpmuludq $T3,$H4,$H0 # h3*s3 + vpaddq $H0,$D1,$D1 # d1 += h3*s3 + vpmuludq $T2,$H4,$H4 # h2*s3 + vpaddq $H4,$D0,$D0 # d0 += h2*s3 + + vpmuludq $T0,$H2,$H2 # h0*r4 + vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 + vpmuludq $T4,$H3,$H1 # h4*s4 + vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 + vpmuludq $T3,$H3,$H0 # h3*s4 + vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 + vpmuludq $T2,$H3,$H1 # h2*s4 + vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 + vpmuludq $T1,$H3,$H3 # h1*s4 + vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 + + jz .Lshort_tail_avx + + vmovdqu 16*0($inp),$H0 # load input + vmovdqu 16*1($inp),$H1 + + vpsrldq \$6,$H0,$H2 # splat input + vpsrldq \$6,$H1,$H3 + vpunpckhqdq $H1,$H0,$H4 # 4 + vpunpcklqdq $H1,$H0,$H0 # 0:1 + vpunpcklqdq $H3,$H2,$H3 # 2:3 + + vpsrlq \$40,$H4,$H4 # 4 + vpsrlq \$26,$H0,$H1 + vpand $MASK,$H0,$H0 # 0 + vpsrlq \$4,$H3,$H2 + vpand $MASK,$H1,$H1 # 1 + vpsrlq \$30,$H3,$H3 + vpand $MASK,$H2,$H2 # 2 + vpand $MASK,$H3,$H3 # 3 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always + + vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 + vpaddq 0x00(%r11),$H0,$H0 + vpaddq 0x10(%r11),$H1,$H1 + vpaddq 0x20(%r11),$H2,$H2 + vpaddq 0x30(%r11),$H3,$H3 + vpaddq 0x40(%r11),$H4,$H4 + + ################################################################ + # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate + + vpmuludq $H0,$T4,$T0 # h0*r0 + vpaddq $T0,$D0,$D0 # d0 += h0*r0 + vpmuludq $H1,$T4,$T1 # h1*r0 + vpaddq $T1,$D1,$D1 # d1 += h1*r0 + vpmuludq $H2,$T4,$T0 # h2*r0 + vpaddq $T0,$D2,$D2 # d2 += h2*r0 + vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n + vpmuludq $H3,$T4,$T1 # h3*r0 + vpaddq $T1,$D3,$D3 # d3 += h3*r0 + vpmuludq $H4,$T4,$T4 # h4*r0 + vpaddq $T4,$D4,$D4 # d4 += h4*r0 + + vpmuludq $H3,$T2,$T0 # h3*r1 + vpaddq $T0,$D4,$D4 # d4 += h3*r1 + vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 + vpmuludq $H2,$T2,$T1 # h2*r1 + vpaddq $T1,$D3,$D3 # d3 += h2*r1 + vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 + vpmuludq $H1,$T2,$T0 # h1*r1 + vpaddq $T0,$D2,$D2 # d2 += h1*r1 + vpmuludq $H0,$T2,$T2 # h0*r1 + vpaddq $T2,$D1,$D1 # d1 += h0*r1 + vpmuludq $H4,$T3,$T3 # h4*s1 + vpaddq $T3,$D0,$D0 # d0 += h4*s1 + + vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 + vpmuludq $H2,$T4,$T1 # h2*r2 + vpaddq $T1,$D4,$D4 # d4 += h2*r2 + vpmuludq $H1,$T4,$T0 # h1*r2 + vpaddq $T0,$D3,$D3 # d3 += h1*r2 + vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 + vpmuludq $H0,$T4,$T4 # h0*r2 + vpaddq $T4,$D2,$D2 # d2 += h0*r2 + vpmuludq $H4,$T2,$T1 # h4*s2 + vpaddq $T1,$D1,$D1 # d1 += h4*s2 + vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 + vpmuludq $H3,$T2,$T2 # h3*s2 + vpaddq $T2,$D0,$D0 # d0 += h3*s2 + + vpmuludq $H1,$T3,$T0 # h1*r3 + vpaddq $T0,$D4,$D4 # d4 += h1*r3 + vpmuludq $H0,$T3,$T3 # h0*r3 + vpaddq $T3,$D3,$D3 # d3 += h0*r3 + vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 + vpmuludq $H4,$T4,$T1 # h4*s3 + vpaddq $T1,$D2,$D2 # d2 += h4*s3 + vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 + vpmuludq $H3,$T4,$T0 # h3*s3 + vpaddq $T0,$D1,$D1 # d1 += h3*s3 + vpmuludq $H2,$T4,$T4 # h2*s3 + vpaddq $T4,$D0,$D0 # d0 += h2*s3 + + vpmuludq $H0,$T2,$T2 # h0*r4 + vpaddq $T2,$D4,$D4 # d4 += h0*r4 + vpmuludq $H4,$T3,$T1 # h4*s4 + vpaddq $T1,$D3,$D3 # d3 += h4*s4 + vpmuludq $H3,$T3,$T0 # h3*s4 + vpaddq $T0,$D2,$D2 # d2 += h3*s4 + vpmuludq $H2,$T3,$T1 # h2*s4 + vpaddq $T1,$D1,$D1 # d1 += h2*s4 + vpmuludq $H1,$T3,$T3 # h1*s4 + vpaddq $T3,$D0,$D0 # d0 += h1*s4 + +.Lshort_tail_avx: + ################################################################ + # horizontal addition + + vpsrldq \$8,$D4,$T4 + vpsrldq \$8,$D3,$T3 + vpsrldq \$8,$D1,$T1 + vpsrldq \$8,$D0,$T0 + vpsrldq \$8,$D2,$T2 + vpaddq $T3,$D3,$D3 + vpaddq $T4,$D4,$D4 + vpaddq $T0,$D0,$D0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$D2,$D2 + + ################################################################ + # lazy reduction + + vpsrlq \$26,$D3,$H3 + vpand $MASK,$D3,$D3 + vpaddq $H3,$D4,$D4 # h3 -> h4 + + vpsrlq \$26,$D0,$H0 + vpand $MASK,$D0,$D0 + vpaddq $H0,$D1,$D1 # h0 -> h1 + + vpsrlq \$26,$D4,$H4 + vpand $MASK,$D4,$D4 + + vpsrlq \$26,$D1,$H1 + vpand $MASK,$D1,$D1 + vpaddq $H1,$D2,$D2 # h1 -> h2 + + vpaddq $H4,$D0,$D0 + vpsllq \$2,$H4,$H4 + vpaddq $H4,$D0,$D0 # h4 -> h0 + + vpsrlq \$26,$D2,$H2 + vpand $MASK,$D2,$D2 + vpaddq $H2,$D3,$D3 # h2 -> h3 + + vpsrlq \$26,$D0,$H0 + vpand $MASK,$D0,$D0 + vpaddq $H0,$D1,$D1 # h0 -> h1 + + vpsrlq \$26,$D3,$H3 + vpand $MASK,$D3,$D3 + vpaddq $H3,$D4,$D4 # h3 -> h4 + + vmovd $D0,`4*0-48-64`($ctx) # save partially reduced + vmovd $D1,`4*1-48-64`($ctx) + vmovd $D2,`4*2-48-64`($ctx) + vmovd $D3,`4*3-48-64`($ctx) + vmovd $D4,`4*4-48-64`($ctx) +___ +$code.=<<___ if ($win64); + vmovdqa 0x50(%r11),%xmm6 + vmovdqa 0x60(%r11),%xmm7 + vmovdqa 0x70(%r11),%xmm8 + vmovdqa 0x80(%r11),%xmm9 + vmovdqa 0x90(%r11),%xmm10 + vmovdqa 0xa0(%r11),%xmm11 + vmovdqa 0xb0(%r11),%xmm12 + vmovdqa 0xc0(%r11),%xmm13 + vmovdqa 0xd0(%r11),%xmm14 + vmovdqa 0xe0(%r11),%xmm15 + lea 0xf8(%r11),%rsp +.Ldo_avx_epilogue: +___ +$code.=<<___ if (!$win64); + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp +___ +$code.=<<___; + vzeroupper + ret +.cfi_endproc +___ +&end_function("poly1305_blocks_avx"); + +&declare_function("poly1305_emit_avx", 32, 3); +$code.=<<___; + cmpl \$0,20($ctx) # is_base2_26? + je .Lemit + + mov 0($ctx),%eax # load hash value base 2^26 + mov 4($ctx),%ecx + mov 8($ctx),%r8d + mov 12($ctx),%r11d + mov 16($ctx),%r10d + + shl \$26,%rcx # base 2^26 -> base 2^64 + mov %r8,%r9 + shl \$52,%r8 + add %rcx,%rax + shr \$12,%r9 + add %rax,%r8 # h0 + adc \$0,%r9 + + shl \$14,%r11 + mov %r10,%rax + shr \$24,%r10 + add %r11,%r9 + shl \$40,%rax + add %rax,%r9 # h1 + adc \$0,%r10 # h2 + + mov %r10,%rax # could be partially reduced, so reduce + mov %r10,%rcx + and \$3,%r10 + shr \$2,%rax + and \$-4,%rcx + add %rcx,%rax + add %rax,%r8 + adc \$0,%r9 + adc \$0,%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? + cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + ret +___ +&end_function("poly1305_emit_avx"); + +if ($kernel) { + $code .= "#endif\n"; +} + +if ($avx>1) { + +if ($kernel) { + $code .= "#ifdef CONFIG_AS_AVX2\n"; +} + +my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = + map("%ymm$_",(0..15)); +my $S4=$MASK; + +sub poly1305_blocks_avxN { + my ($avx512) = @_; + my $suffix = $avx512 ? "_avx512" : ""; +$code.=<<___; +.cfi_startproc + mov 20($ctx),%r8d # is_base2_26 + cmp \$128,$len + jae .Lblocks_avx2$suffix + test %r8d,%r8d + jz .Lblocks + +.Lblocks_avx2$suffix: + and \$-16,$len + jz .Lno_data_avx2$suffix + + vzeroupper + + test %r8d,%r8d + jz .Lbase2_64_avx2$suffix + + test \$63,$len + jz .Leven_avx2$suffix + + push %rbp +.cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lblocks_avx2_body$suffix: + + mov $len,%r15 # reassign $len + + mov 0($ctx),$d1 # load hash value + mov 8($ctx),$d2 + mov 16($ctx),$h2#d + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + ################################# base 2^26 -> base 2^64 + mov $d1#d,$h0#d + and \$`-1*(1<<31)`,$d1 + mov $d2,$r1 # borrow $r1 + mov $d2#d,$h1#d + and \$`-1*(1<<31)`,$d2 + + shr \$6,$d1 + shl \$52,$r1 + add $d1,$h0 + shr \$12,$h1 + shr \$18,$d2 + add $r1,$h0 + adc $d2,$h1 + + mov $h2,$d1 + shl \$40,$d1 + shr \$24,$h2 + add $d1,$h1 + adc \$0,$h2 # can be partially reduced... + + mov \$-4,$d2 # ... so reduce + mov $h2,$d1 + and $h2,$d2 + shr \$2,$d1 + and \$3,$h2 + add $d2,$d1 # =*5 + add $d1,$h0 + adc \$0,$h1 + adc \$0,$h2 + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + +.Lbase2_26_pre_avx2$suffix: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + mov $r1,%rax + + test \$63,%r15 + jnz .Lbase2_26_pre_avx2$suffix + + test $padbit,$padbit # if $padbit is zero, + jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format + + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$r0 + mov $h1,$r1 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$r0 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $r0,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$r1 + and \$0x3ffffff,$h1 # h[3] + or $r1,$h2 # h[4] + + test %r15,%r15 + jz .Lstore_base2_26_avx2$suffix + + vmovd %rax#d,%x#$H0 + vmovd %rdx#d,%x#$H1 + vmovd $h0#d,%x#$H2 + vmovd $h1#d,%x#$H3 + vmovd $h2#d,%x#$H4 + jmp .Lproceed_avx2$suffix + +.align 32 +.Lstore_base2_64_avx2$suffix: + mov $h0,0($ctx) + mov $h1,8($ctx) + mov $h2,16($ctx) # note that is_base2_26 is zeroed + jmp .Ldone_avx2$suffix + +.align 16 +.Lstore_base2_26_avx2$suffix: + mov %rax#d,0($ctx) # store hash value base 2^26 + mov %rdx#d,4($ctx) + mov $h0#d,8($ctx) + mov $h1#d,12($ctx) + mov $h2#d,16($ctx) +.align 16 +.Ldone_avx2$suffix: + pop %r15 +.cfi_restore %r15 + pop %r14 +.cfi_restore %r14 + pop %r13 +.cfi_restore %r13 + pop %r12 +.cfi_restore %r12 + pop %rbx +.cfi_restore %rbx + pop %rbp +.cfi_restore %rbp +.Lno_data_avx2$suffix: +.Lblocks_avx2_epilogue$suffix: + ret +.cfi_endproc + +.align 32 +.Lbase2_64_avx2$suffix: +.cfi_startproc + push %rbp +.cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lbase2_64_avx2_body$suffix: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + mov 16($ctx),$h2#d + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + test \$63,$len + jz .Linit_avx2$suffix + +.Lbase2_64_pre_avx2$suffix: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + mov $r1,%rax + + test \$63,%r15 + jnz .Lbase2_64_pre_avx2$suffix + +.Linit_avx2$suffix: + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$d1 + mov $h1,$d2 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$d1 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $d1,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$d2 + and \$0x3ffffff,$h1 # h[3] + or $d2,$h2 # h[4] + + vmovd %rax#d,%x#$H0 + vmovd %rdx#d,%x#$H1 + vmovd $h0#d,%x#$H2 + vmovd $h1#d,%x#$H3 + vmovd $h2#d,%x#$H4 + movl \$1,20($ctx) # set is_base2_26 + + call __poly1305_init_avx + +.Lproceed_avx2$suffix: + mov %r15,$len # restore $len +___ +$code.=<<___ if (!$kernel); + mov OPENSSL_ia32cap_P+8(%rip),%r9d + mov \$`(1<<31|1<<30|1<<16)`,%r11d +___ +$code.=<<___; + pop %r15 +.cfi_restore %r15 + pop %r14 +.cfi_restore %r14 + pop %r13 +.cfi_restore %r13 + pop %r12 +.cfi_restore %r12 + pop %rbx +.cfi_restore %rbx + pop %rbp +.cfi_restore %rbp +.Lbase2_64_avx2_epilogue$suffix: + jmp .Ldo_avx2$suffix +.cfi_endproc + +.align 32 +.Leven_avx2$suffix: +.cfi_startproc +___ +$code.=<<___ if (!$kernel); + mov OPENSSL_ia32cap_P+8(%rip),%r9d +___ +$code.=<<___; + vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 + vmovd 4*1($ctx),%x#$H1 + vmovd 4*2($ctx),%x#$H2 + vmovd 4*3($ctx),%x#$H3 + vmovd 4*4($ctx),%x#$H4 + +.Ldo_avx2$suffix: +___ +$code.=<<___ if (!$kernel && $avx>2); + cmp \$512,$len + jb .Lskip_avx512 + and %r11d,%r9d + test \$`1<<16`,%r9d # check for AVX512F + jnz .Lblocks_avx512 +.Lskip_avx512$suffix: +___ +$code.=<<___ if ($avx > 2 && $avx512 && $kernel); + cmp \$512,$len + jae .Lblocks_avx512 +___ +$code.=<<___ if (!$win64); + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 + sub \$0x128,%rsp +___ +$code.=<<___ if ($win64); + lea 8(%rsp),%r10 + sub \$0x1c8,%rsp + vmovdqa %xmm6,-0xb0(%r10) + vmovdqa %xmm7,-0xa0(%r10) + vmovdqa %xmm8,-0x90(%r10) + vmovdqa %xmm9,-0x80(%r10) + vmovdqa %xmm10,-0x70(%r10) + vmovdqa %xmm11,-0x60(%r10) + vmovdqa %xmm12,-0x50(%r10) + vmovdqa %xmm13,-0x40(%r10) + vmovdqa %xmm14,-0x30(%r10) + vmovdqa %xmm15,-0x20(%r10) +.Ldo_avx2_body$suffix: +___ +$code.=<<___; + lea .Lconst(%rip),%rcx + lea 48+64($ctx),$ctx # size optimization + vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 + + # expand and copy pre-calculated table to stack + vmovdqu `16*0-64`($ctx),%x#$T2 + and \$-512,%rsp + vmovdqu `16*1-64`($ctx),%x#$T3 + vmovdqu `16*2-64`($ctx),%x#$T4 + vmovdqu `16*3-64`($ctx),%x#$D0 + vmovdqu `16*4-64`($ctx),%x#$D1 + vmovdqu `16*5-64`($ctx),%x#$D2 + lea 0x90(%rsp),%rax # size optimization + vmovdqu `16*6-64`($ctx),%x#$D3 + vpermd $T2,$T0,$T2 # 00003412 -> 14243444 + vmovdqu `16*7-64`($ctx),%x#$D4 + vpermd $T3,$T0,$T3 + vmovdqu `16*8-64`($ctx),%x#$MASK + vpermd $T4,$T0,$T4 + vmovdqa $T2,0x00(%rsp) + vpermd $D0,$T0,$D0 + vmovdqa $T3,0x20-0x90(%rax) + vpermd $D1,$T0,$D1 + vmovdqa $T4,0x40-0x90(%rax) + vpermd $D2,$T0,$D2 + vmovdqa $D0,0x60-0x90(%rax) + vpermd $D3,$T0,$D3 + vmovdqa $D1,0x80-0x90(%rax) + vpermd $D4,$T0,$D4 + vmovdqa $D2,0xa0-0x90(%rax) + vpermd $MASK,$T0,$MASK + vmovdqa $D3,0xc0-0x90(%rax) + vmovdqa $D4,0xe0-0x90(%rax) + vmovdqa $MASK,0x100-0x90(%rax) + vmovdqa 64(%rcx),$MASK # .Lmask26 + + ################################################################ + # load input + vmovdqu 16*0($inp),%x#$T0 + vmovdqu 16*1($inp),%x#$T1 + vinserti128 \$1,16*2($inp),$T0,$T0 + vinserti128 \$1,16*3($inp),$T1,$T1 + lea 16*4($inp),$inp + + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpunpcklqdq $T3,$T2,$T2 # 2:3 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + + vpsrlq \$30,$T2,$T3 + vpsrlq \$4,$T2,$T2 + vpsrlq \$26,$T0,$T1 + vpsrlq \$40,$T4,$T4 # 4 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T0,$T0 # 0 + vpand $MASK,$T1,$T1 # 1 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + vpaddq $H2,$T2,$H2 # accumulate input + sub \$64,$len + jz .Ltail_avx2$suffix + jmp .Loop_avx2$suffix + +.align 32 +.Loop_avx2$suffix: + ################################################################ + # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 + # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 + # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 + # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 + # \________/\__________/ + ################################################################ + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + vmovdqa `32*0`(%rsp),$T0 # r0^4 + vpaddq $H1,$T1,$H1 + vmovdqa `32*1`(%rsp),$T1 # r1^4 + vpaddq $H3,$T3,$H3 + vmovdqa `32*3`(%rsp),$T2 # r2^4 + vpaddq $H4,$T4,$H4 + vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 + vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # however, as h2 is "chronologically" first one available pull + # corresponding operations up, so it's + # + # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 + # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 + + vpmuludq $H2,$T0,$D2 # d2 = h2*r0 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + + vpmuludq $H0,$T1,$T4 # h0*r1 + vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp + vpaddq $T4,$D1,$D1 # d1 += h0*r1 + vpaddq $H2,$D2,$D2 # d2 += h1*r1 + vpmuludq $H3,$T1,$T4 # h3*r1 + vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 + vpaddq $T4,$D4,$D4 # d4 += h3*r1 + vpaddq $H2,$D0,$D0 # d0 += h4*s1 + vmovdqa `32*4-0x90`(%rax),$T1 # s2 + + vpmuludq $H0,$T0,$T4 # h0*r0 + vpmuludq $H1,$T0,$H2 # h1*r0 + vpaddq $T4,$D0,$D0 # d0 += h0*r0 + vpaddq $H2,$D1,$D1 # d1 += h1*r0 + vpmuludq $H3,$T0,$T4 # h3*r0 + vpmuludq $H4,$T0,$H2 # h4*r0 + vmovdqu 16*0($inp),%x#$T0 # load input + vpaddq $T4,$D3,$D3 # d3 += h3*r0 + vpaddq $H2,$D4,$D4 # d4 += h4*r0 + vinserti128 \$1,16*2($inp),$T0,$T0 + + vpmuludq $H3,$T1,$T4 # h3*s2 + vpmuludq $H4,$T1,$H2 # h4*s2 + vmovdqu 16*1($inp),%x#$T1 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + vpaddq $H2,$D1,$D1 # d1 += h4*s2 + vmovdqa `32*5-0x90`(%rax),$H2 # r3 + vpmuludq $H1,$T2,$T4 # h1*r2 + vpmuludq $H0,$T2,$T2 # h0*r2 + vpaddq $T4,$D3,$D3 # d3 += h1*r2 + vpaddq $T2,$D2,$D2 # d2 += h0*r2 + vinserti128 \$1,16*3($inp),$T1,$T1 + lea 16*4($inp),$inp + + vpmuludq $H1,$H2,$T4 # h1*r3 + vpmuludq $H0,$H2,$H2 # h0*r3 + vpsrldq \$6,$T0,$T2 # splat input + vpaddq $T4,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $H3,$T3,$T4 # h3*s3 + vpmuludq $H4,$T3,$H2 # h4*s3 + vpsrldq \$6,$T1,$T3 + vpaddq $T4,$D1,$D1 # d1 += h3*s3 + vpaddq $H2,$D2,$D2 # d2 += h4*s3 + vpunpckhqdq $T1,$T0,$T4 # 4 + + vpmuludq $H3,$S4,$H3 # h3*s4 + vpmuludq $H4,$S4,$H4 # h4*s4 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 + vpmuludq $H1,$S4,$H0 # h1*s4 + vmovdqa 64(%rcx),$MASK # .Lmask26 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + ################################################################ + # lazy reduction (interleaved with tail of input splat) + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$4,$T3,$T2 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpand $MASK,$T2,$T2 # 2 + vpsrlq \$26,$T0,$T1 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpaddq $T2,$H2,$H2 # modulo-scheduled + vpsrlq \$30,$T3,$T3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$40,$T4,$T4 # 4 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpand $MASK,$T0,$T0 # 0 + vpand $MASK,$T1,$T1 # 1 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + sub \$64,$len + jnz .Loop_avx2$suffix + + .byte 0x66,0x90 +.Ltail_avx2$suffix: + ################################################################ + # while above multiplications were by r^4 in all lanes, in last + # iteration we multiply least significant lane by r^4 and most + # significant one by r, so copy of above except that references + # to the precomputed table are displaced by 4... + + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + vmovdqu `32*0+4`(%rsp),$T0 # r0^4 + vpaddq $H1,$T1,$H1 + vmovdqu `32*1+4`(%rsp),$T1 # r1^4 + vpaddq $H3,$T3,$H3 + vmovdqu `32*3+4`(%rsp),$T2 # r2^4 + vpaddq $H4,$T4,$H4 + vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 + vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 + + vpmuludq $H2,$T0,$D2 # d2 = h2*r0 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + + vpmuludq $H0,$T1,$T4 # h0*r1 + vpmuludq $H1,$T1,$H2 # h1*r1 + vpaddq $T4,$D1,$D1 # d1 += h0*r1 + vpaddq $H2,$D2,$D2 # d2 += h1*r1 + vpmuludq $H3,$T1,$T4 # h3*r1 + vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 + vpaddq $T4,$D4,$D4 # d4 += h3*r1 + vpaddq $H2,$D0,$D0 # d0 += h4*s1 + + vpmuludq $H0,$T0,$T4 # h0*r0 + vpmuludq $H1,$T0,$H2 # h1*r0 + vpaddq $T4,$D0,$D0 # d0 += h0*r0 + vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 + vpaddq $H2,$D1,$D1 # d1 += h1*r0 + vpmuludq $H3,$T0,$T4 # h3*r0 + vpmuludq $H4,$T0,$H2 # h4*r0 + vpaddq $T4,$D3,$D3 # d3 += h3*r0 + vpaddq $H2,$D4,$D4 # d4 += h4*r0 + + vpmuludq $H3,$T1,$T4 # h3*s2 + vpmuludq $H4,$T1,$H2 # h4*s2 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + vpaddq $H2,$D1,$D1 # d1 += h4*s2 + vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 + vpmuludq $H1,$T2,$T4 # h1*r2 + vpmuludq $H0,$T2,$T2 # h0*r2 + vpaddq $T4,$D3,$D3 # d3 += h1*r2 + vpaddq $T2,$D2,$D2 # d2 += h0*r2 + + vpmuludq $H1,$H2,$T4 # h1*r3 + vpmuludq $H0,$H2,$H2 # h0*r3 + vpaddq $T4,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $H3,$T3,$T4 # h3*s3 + vpmuludq $H4,$T3,$H2 # h4*s3 + vpaddq $T4,$D1,$D1 # d1 += h3*s3 + vpaddq $H2,$D2,$D2 # d2 += h4*s3 + + vpmuludq $H3,$S4,$H3 # h3*s4 + vpmuludq $H4,$S4,$H4 # h4*s4 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 + vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 + vpmuludq $H1,$S4,$H0 # h1*s4 + vmovdqa 64(%rcx),$MASK # .Lmask26 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + ################################################################ + # horizontal addition + + vpsrldq \$8,$D1,$T1 + vpsrldq \$8,$H2,$T2 + vpsrldq \$8,$H3,$T3 + vpsrldq \$8,$H4,$T4 + vpsrldq \$8,$H0,$T0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$H2,$H2 + vpaddq $T3,$H3,$H3 + vpaddq $T4,$H4,$H4 + vpaddq $T0,$H0,$H0 + + vpermq \$0x2,$H3,$T3 + vpermq \$0x2,$H4,$T4 + vpermq \$0x2,$H0,$T0 + vpermq \$0x2,$D1,$T1 + vpermq \$0x2,$H2,$T2 + vpaddq $T3,$H3,$H3 + vpaddq $T4,$H4,$H4 + vpaddq $T0,$H0,$H0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$H2,$H2 + + ################################################################ + # lazy reduction + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced + vmovd %x#$H1,`4*1-48-64`($ctx) + vmovd %x#$H2,`4*2-48-64`($ctx) + vmovd %x#$H3,`4*3-48-64`($ctx) + vmovd %x#$H4,`4*4-48-64`($ctx) +___ +$code.=<<___ if ($win64); + vmovdqa -0xb0(%r10),%xmm6 + vmovdqa -0xa0(%r10),%xmm7 + vmovdqa -0x90(%r10),%xmm8 + vmovdqa -0x80(%r10),%xmm9 + vmovdqa -0x70(%r10),%xmm10 + vmovdqa -0x60(%r10),%xmm11 + vmovdqa -0x50(%r10),%xmm12 + vmovdqa -0x40(%r10),%xmm13 + vmovdqa -0x30(%r10),%xmm14 + vmovdqa -0x20(%r10),%xmm15 + lea -8(%r10),%rsp +.Ldo_avx2_epilogue$suffix: +___ +$code.=<<___ if (!$win64); + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp +___ +$code.=<<___; + vzeroupper + ret +.cfi_endproc +___ +if($avx > 2 && $avx512) { +my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); +my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); +my $PADBIT="%zmm30"; + +map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain +map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); +map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); +map(s/%y/%z/,($MASK)); + +$code.=<<___; +.cfi_startproc +.Lblocks_avx512: + mov \$15,%eax + kmovw %eax,%k2 +___ +$code.=<<___ if (!$win64); + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 + sub \$0x128,%rsp +___ +$code.=<<___ if ($win64); + lea 8(%rsp),%r10 + sub \$0x1c8,%rsp + vmovdqa %xmm6,-0xb0(%r10) + vmovdqa %xmm7,-0xa0(%r10) + vmovdqa %xmm8,-0x90(%r10) + vmovdqa %xmm9,-0x80(%r10) + vmovdqa %xmm10,-0x70(%r10) + vmovdqa %xmm11,-0x60(%r10) + vmovdqa %xmm12,-0x50(%r10) + vmovdqa %xmm13,-0x40(%r10) + vmovdqa %xmm14,-0x30(%r10) + vmovdqa %xmm15,-0x20(%r10) +.Ldo_avx512_body: +___ +$code.=<<___; + lea .Lconst(%rip),%rcx + lea 48+64($ctx),$ctx # size optimization + vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 + + # expand pre-calculated table + vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} + and \$-512,%rsp + vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} + mov \$0x20,%rax + vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} + vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} + vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} + vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} + vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} + vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} + vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4} + vpermd $D0,$T2,$R0 # 00003412 -> 14243444 + vpbroadcastq 64(%rcx),$MASK # .Lmask26 + vpermd $D1,$T2,$R1 + vpermd $T0,$T2,$S1 + vpermd $D2,$T2,$R2 + vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 + vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 + vpermd $T1,$T2,$S2 + vmovdqu64 $R1,0x00(%rsp,%rax){%k2} + vpsrlq \$32,$R1,$T1 + vpermd $D3,$T2,$R3 + vmovdqa64 $S1,0x40(%rsp){%k2} + vpermd $T3,$T2,$S3 + vpermd $D4,$T2,$R4 + vmovdqu64 $R2,0x40(%rsp,%rax){%k2} + vpermd $T4,$T2,$S4 + vmovdqa64 $S2,0x80(%rsp){%k2} + vmovdqu64 $R3,0x80(%rsp,%rax){%k2} + vmovdqa64 $S3,0xc0(%rsp){%k2} + vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} + vmovdqa64 $S4,0x100(%rsp){%k2} + + ################################################################ + # calculate 5th through 8th powers of the key + # + # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 + # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 + # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 + # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 + # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 + + vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 + vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 + vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 + vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 + vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 + vpsrlq \$32,$R2,$T2 + + vpmuludq $T1,$S4,$M0 + vpmuludq $T1,$R0,$M1 + vpmuludq $T1,$R1,$M2 + vpmuludq $T1,$R2,$M3 + vpmuludq $T1,$R3,$M4 + vpsrlq \$32,$R3,$T3 + vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 + vpaddq $M1,$D1,$D1 # d1 += r1'*r0 + vpaddq $M2,$D2,$D2 # d2 += r1'*r1 + vpaddq $M3,$D3,$D3 # d3 += r1'*r2 + vpaddq $M4,$D4,$D4 # d4 += r1'*r3 + + vpmuludq $T2,$S3,$M0 + vpmuludq $T2,$S4,$M1 + vpmuludq $T2,$R1,$M3 + vpmuludq $T2,$R2,$M4 + vpmuludq $T2,$R0,$M2 + vpsrlq \$32,$R4,$T4 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 + vpaddq $M3,$D3,$D3 # d3 += r2'*r1 + vpaddq $M4,$D4,$D4 # d4 += r2'*r2 + vpaddq $M2,$D2,$D2 # d2 += r2'*r0 + + vpmuludq $T3,$S2,$M0 + vpmuludq $T3,$R0,$M3 + vpmuludq $T3,$R1,$M4 + vpmuludq $T3,$S3,$M1 + vpmuludq $T3,$S4,$M2 + vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 + vpaddq $M3,$D3,$D3 # d3 += r3'*r0 + vpaddq $M4,$D4,$D4 # d4 += r3'*r1 + vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 + vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 + + vpmuludq $T4,$S4,$M3 + vpmuludq $T4,$R0,$M4 + vpmuludq $T4,$S1,$M0 + vpmuludq $T4,$S2,$M1 + vpmuludq $T4,$S3,$M2 + vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 + vpaddq $M4,$D4,$D4 # d4 += r2'*r0 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 + vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 + + ################################################################ + # load input + vmovdqu64 16*0($inp),%z#$T3 + vmovdqu64 16*4($inp),%z#$T4 + lea 16*8($inp),$inp + + ################################################################ + # lazy reduction + + vpsrlq \$26,$D3,$M3 + vpandq $MASK,$D3,$D3 + vpaddq $M3,$D4,$D4 # d3 -> d4 + + vpsrlq \$26,$D0,$M0 + vpandq $MASK,$D0,$D0 + vpaddq $M0,$D1,$D1 # d0 -> d1 + + vpsrlq \$26,$D4,$M4 + vpandq $MASK,$D4,$D4 + + vpsrlq \$26,$D1,$M1 + vpandq $MASK,$D1,$D1 + vpaddq $M1,$D2,$D2 # d1 -> d2 + + vpaddq $M4,$D0,$D0 + vpsllq \$2,$M4,$M4 + vpaddq $M4,$D0,$D0 # d4 -> d0 + + vpsrlq \$26,$D2,$M2 + vpandq $MASK,$D2,$D2 + vpaddq $M2,$D3,$D3 # d2 -> d3 + + vpsrlq \$26,$D0,$M0 + vpandq $MASK,$D0,$D0 + vpaddq $M0,$D1,$D1 # d0 -> d1 + + vpsrlq \$26,$D3,$M3 + vpandq $MASK,$D3,$D3 + vpaddq $M3,$D4,$D4 # d3 -> d4 + + ################################################################ + # at this point we have 14243444 in $R0-$S4 and 05060708 in + # $D0-$D4, ... + + vpunpcklqdq $T4,$T3,$T0 # transpose input + vpunpckhqdq $T4,$T3,$T4 + + # ... since input 64-bit lanes are ordered as 73625140, we could + # "vperm" it to 76543210 (here and in each loop iteration), *or* + # we could just flow along, hence the goal for $R0-$S4 is + # 1858286838784888 ... + + vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: + mov \$0x7777,%eax + kmovw %eax,%k1 + + vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- + vpermd $R1,$M0,$R1 + vpermd $R2,$M0,$R2 + vpermd $R3,$M0,$R3 + vpermd $R4,$M0,$R4 + + vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 + vpermd $D1,$M0,${R1}{%k1} + vpermd $D2,$M0,${R2}{%k1} + vpermd $D3,$M0,${R3}{%k1} + vpermd $D4,$M0,${R4}{%k1} + + vpslld \$2,$R1,$S1 # *5 + vpslld \$2,$R2,$S2 + vpslld \$2,$R3,$S3 + vpslld \$2,$R4,$S4 + vpaddd $R1,$S1,$S1 + vpaddd $R2,$S2,$S2 + vpaddd $R3,$S3,$S3 + vpaddd $R4,$S4,$S4 + + vpbroadcastq 32(%rcx),$PADBIT # .L129 + + vpsrlq \$52,$T0,$T2 # splat input + vpsllq \$12,$T4,$T3 + vporq $T3,$T2,$T2 + vpsrlq \$26,$T0,$T1 + vpsrlq \$14,$T4,$T3 + vpsrlq \$40,$T4,$T4 # 4 + vpandq $MASK,$T2,$T2 # 2 + vpandq $MASK,$T0,$T0 # 0 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always + + vpaddq $H2,$T2,$H2 # accumulate input + sub \$192,$len + jbe .Ltail_avx512 + jmp .Loop_avx512 + +.align 32 +.Loop_avx512: + ################################################################ + # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 + # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 + # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 + # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 + # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 + # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 + # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 + # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 + # \________/\___________/ + ################################################################ + #vpaddq $H2,$T2,$H2 # accumulate input + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # however, as h2 is "chronologically" first one available pull + # corresponding operations up, so it's + # + # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 + # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 + # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 + # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 + # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 + + vpmuludq $H2,$R1,$D3 # d3 = h2*r1 + vpaddq $H0,$T0,$H0 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpandq $MASK,$T1,$T1 # 1 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T3,$T3 # 3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpmuludq $H2,$R0,$D2 # d2 = h2*r0 + vpaddq $H1,$T1,$H1 # accumulate input + vpaddq $H3,$T3,$H3 + vpaddq $H4,$T4,$H4 + + vmovdqu64 16*0($inp),$T3 # load input + vmovdqu64 16*4($inp),$T4 + lea 16*8($inp),$inp + vpmuludq $H0,$R3,$M3 + vpmuludq $H0,$R4,$M4 + vpmuludq $H0,$R0,$M0 + vpmuludq $H0,$R1,$M1 + vpaddq $M3,$D3,$D3 # d3 += h0*r3 + vpaddq $M4,$D4,$D4 # d4 += h0*r4 + vpaddq $M0,$D0,$D0 # d0 += h0*r0 + vpaddq $M1,$D1,$D1 # d1 += h0*r1 + + vpmuludq $H1,$R2,$M3 + vpmuludq $H1,$R3,$M4 + vpmuludq $H1,$S4,$M0 + vpmuludq $H0,$R2,$M2 + vpaddq $M3,$D3,$D3 # d3 += h1*r2 + vpaddq $M4,$D4,$D4 # d4 += h1*r3 + vpaddq $M0,$D0,$D0 # d0 += h1*s4 + vpaddq $M2,$D2,$D2 # d2 += h0*r2 + + vpunpcklqdq $T4,$T3,$T0 # transpose input + vpunpckhqdq $T4,$T3,$T4 + + vpmuludq $H3,$R0,$M3 + vpmuludq $H3,$R1,$M4 + vpmuludq $H1,$R0,$M1 + vpmuludq $H1,$R1,$M2 + vpaddq $M3,$D3,$D3 # d3 += h3*r0 + vpaddq $M4,$D4,$D4 # d4 += h3*r1 + vpaddq $M1,$D1,$D1 # d1 += h1*r0 + vpaddq $M2,$D2,$D2 # d2 += h1*r1 + + vpmuludq $H4,$S4,$M3 + vpmuludq $H4,$R0,$M4 + vpmuludq $H3,$S2,$M0 + vpmuludq $H3,$S3,$M1 + vpaddq $M3,$D3,$D3 # d3 += h4*s4 + vpmuludq $H3,$S4,$M2 + vpaddq $M4,$D4,$D4 # d4 += h4*r0 + vpaddq $M0,$D0,$D0 # d0 += h3*s2 + vpaddq $M1,$D1,$D1 # d1 += h3*s3 + vpaddq $M2,$D2,$D2 # d2 += h3*s4 + + vpmuludq $H4,$S1,$M0 + vpmuludq $H4,$S2,$M1 + vpmuludq $H4,$S3,$M2 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 + vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 + vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 + + ################################################################ + # lazy reduction (interleaved with input splat) + + vpsrlq \$52,$T0,$T2 # splat input + vpsllq \$12,$T4,$T3 + + vpsrlq \$26,$D3,$H3 + vpandq $MASK,$D3,$D3 + vpaddq $H3,$D4,$H4 # h3 -> h4 + + vporq $T3,$T2,$T2 + + vpsrlq \$26,$H0,$D0 + vpandq $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpandq $MASK,$T2,$T2 # 2 + + vpsrlq \$26,$H4,$D4 + vpandq $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpandq $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpaddq $T2,$H2,$H2 # modulo-scheduled + vpsrlq \$26,$T0,$T1 + + vpsrlq \$26,$H2,$D2 + vpandq $MASK,$H2,$H2 + vpaddq $D2,$D3,$H3 # h2 -> h3 + + vpsrlq \$14,$T4,$T3 + + vpsrlq \$26,$H0,$D0 + vpandq $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$40,$T4,$T4 # 4 + + vpsrlq \$26,$H3,$D3 + vpandq $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpandq $MASK,$T0,$T0 # 0 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always + + sub \$128,$len + ja .Loop_avx512 + +.Ltail_avx512: + ################################################################ + # while above multiplications were by r^8 in all lanes, in last + # iteration we multiply least significant lane by r^8 and most + # significant one by r, that's why table gets shifted... + + vpsrlq \$32,$R0,$R0 # 0105020603070408 + vpsrlq \$32,$R1,$R1 + vpsrlq \$32,$R2,$R2 + vpsrlq \$32,$S3,$S3 + vpsrlq \$32,$S4,$S4 + vpsrlq \$32,$R3,$R3 + vpsrlq \$32,$R4,$R4 + vpsrlq \$32,$S1,$S1 + vpsrlq \$32,$S2,$S2 + + ################################################################ + # load either next or last 64 byte of input + lea ($inp,$len),$inp + + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + + vpmuludq $H2,$R1,$D3 # d3 = h2*r1 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T1,$T1 # 1 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vpandq $MASK,$T3,$T3 # 3 + vpmuludq $H2,$R0,$D2 # d2 = h2*r0 + vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpaddq $H1,$T1,$H1 # accumulate input + vpaddq $H3,$T3,$H3 + vpaddq $H4,$T4,$H4 + + vmovdqu 16*0($inp),%x#$T0 + vpmuludq $H0,$R3,$M3 + vpmuludq $H0,$R4,$M4 + vpmuludq $H0,$R0,$M0 + vpmuludq $H0,$R1,$M1 + vpaddq $M3,$D3,$D3 # d3 += h0*r3 + vpaddq $M4,$D4,$D4 # d4 += h0*r4 + vpaddq $M0,$D0,$D0 # d0 += h0*r0 + vpaddq $M1,$D1,$D1 # d1 += h0*r1 + + vmovdqu 16*1($inp),%x#$T1 + vpmuludq $H1,$R2,$M3 + vpmuludq $H1,$R3,$M4 + vpmuludq $H1,$S4,$M0 + vpmuludq $H0,$R2,$M2 + vpaddq $M3,$D3,$D3 # d3 += h1*r2 + vpaddq $M4,$D4,$D4 # d4 += h1*r3 + vpaddq $M0,$D0,$D0 # d0 += h1*s4 + vpaddq $M2,$D2,$D2 # d2 += h0*r2 + + vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 + vpmuludq $H3,$R0,$M3 + vpmuludq $H3,$R1,$M4 + vpmuludq $H1,$R0,$M1 + vpmuludq $H1,$R1,$M2 + vpaddq $M3,$D3,$D3 # d3 += h3*r0 + vpaddq $M4,$D4,$D4 # d4 += h3*r1 + vpaddq $M1,$D1,$D1 # d1 += h1*r0 + vpaddq $M2,$D2,$D2 # d2 += h1*r1 + + vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 + vpmuludq $H4,$S4,$M3 + vpmuludq $H4,$R0,$M4 + vpmuludq $H3,$S2,$M0 + vpmuludq $H3,$S3,$M1 + vpmuludq $H3,$S4,$M2 + vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 + vpaddq $M4,$D4,$D4 # d4 += h4*r0 + vpaddq $M0,$D0,$D0 # d0 += h3*s2 + vpaddq $M1,$D1,$D1 # d1 += h3*s3 + vpaddq $M2,$D2,$D2 # d2 += h3*s4 + + vpmuludq $H4,$S1,$M0 + vpmuludq $H4,$S2,$M1 + vpmuludq $H4,$S3,$M2 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 + vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 + vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 + + ################################################################ + # horizontal addition + + mov \$1,%eax + vpermq \$0xb1,$H3,$D3 + vpermq \$0xb1,$D4,$H4 + vpermq \$0xb1,$H0,$D0 + vpermq \$0xb1,$H1,$D1 + vpermq \$0xb1,$H2,$D2 + vpaddq $D3,$H3,$H3 + vpaddq $D4,$H4,$H4 + vpaddq $D0,$H0,$H0 + vpaddq $D1,$H1,$H1 + vpaddq $D2,$H2,$H2 + + kmovw %eax,%k3 + vpermq \$0x2,$H3,$D3 + vpermq \$0x2,$H4,$D4 + vpermq \$0x2,$H0,$D0 + vpermq \$0x2,$H1,$D1 + vpermq \$0x2,$H2,$D2 + vpaddq $D3,$H3,$H3 + vpaddq $D4,$H4,$H4 + vpaddq $D0,$H0,$H0 + vpaddq $D1,$H1,$H1 + vpaddq $D2,$H2,$H2 + + vextracti64x4 \$0x1,$H3,%y#$D3 + vextracti64x4 \$0x1,$H4,%y#$D4 + vextracti64x4 \$0x1,$H0,%y#$D0 + vextracti64x4 \$0x1,$H1,%y#$D1 + vextracti64x4 \$0x1,$H2,%y#$D2 + vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case + vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 + vpaddq $D0,$H0,${H0}{%k3}{z} + vpaddq $D1,$H1,${H1}{%k3}{z} + vpaddq $D2,$H2,${H2}{%k3}{z} +___ +map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); +map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); +$code.=<<___; + ################################################################ + # lazy reduction (interleaved with input splat) + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpunpcklqdq $T3,$T2,$T2 # 2:3 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpsrlq \$30,$T2,$T3 + vpsrlq \$4,$T2,$T2 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpsrlq \$26,$T0,$T1 + vpsrlq \$40,$T4,$T4 # 4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T0,$T0 # 0 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 + vpand $MASK,$T1,$T1 # 1 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + vpaddq $D3,$H4,$H4 # h3 -> h4 + + lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 + add \$64,$len + jnz .Ltail_avx2$suffix + + vpsubq $T2,$H2,$H2 # undo input accumulation + vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced + vmovd %x#$H1,`4*1-48-64`($ctx) + vmovd %x#$H2,`4*2-48-64`($ctx) + vmovd %x#$H3,`4*3-48-64`($ctx) + vmovd %x#$H4,`4*4-48-64`($ctx) + vzeroall +___ +$code.=<<___ if ($win64); + movdqa -0xb0(%r10),%xmm6 + movdqa -0xa0(%r10),%xmm7 + movdqa -0x90(%r10),%xmm8 + movdqa -0x80(%r10),%xmm9 + movdqa -0x70(%r10),%xmm10 + movdqa -0x60(%r10),%xmm11 + movdqa -0x50(%r10),%xmm12 + movdqa -0x40(%r10),%xmm13 + movdqa -0x30(%r10),%xmm14 + movdqa -0x20(%r10),%xmm15 + lea -8(%r10),%rsp +.Ldo_avx512_epilogue: +___ +$code.=<<___ if (!$win64); + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp +___ +$code.=<<___; + ret +.cfi_endproc +___ + +} + +} + +&declare_function("poly1305_blocks_avx2", 32, 4); +poly1305_blocks_avxN(0); +&end_function("poly1305_blocks_avx2"); + +if($kernel) { + $code .= "#endif\n"; +} + +####################################################################### +if ($avx>2) { +# On entry we have input length divisible by 64. But since inner loop +# processes 128 bytes per iteration, cases when length is not divisible +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this +# reason stack layout is kept identical to poly1305_blocks_avx2. If not +# for this tail, we wouldn't have to even allocate stack frame... + +if($kernel) { + $code .= "#ifdef CONFIG_AS_AVX512\n"; +} + +&declare_function("poly1305_blocks_avx512", 32, 4); +poly1305_blocks_avxN(1); +&end_function("poly1305_blocks_avx512"); + +if ($kernel) { + $code .= "#endif\n"; +} + +if (!$kernel && $avx>3) { +######################################################################## +# VPMADD52 version using 2^44 radix. +# +# One can argue that base 2^52 would be more natural. Well, even though +# some operations would be more natural, one has to recognize couple of +# things. Base 2^52 doesn't provide advantage over base 2^44 if you look +# at amount of multiply-n-accumulate operations. Secondly, it makes it +# impossible to pre-compute multiples of 5 [referred to as s[]/sN in +# reference implementations], which means that more such operations +# would have to be performed in inner loop, which in turn makes critical +# path longer. In other words, even though base 2^44 reduction might +# look less elegant, overall critical path is actually shorter... + +######################################################################## +# Layout of opaque area is following. +# +# unsigned __int64 h[3]; # current hash value base 2^44 +# unsigned __int64 s[2]; # key value*20 base 2^44 +# unsigned __int64 r[3]; # key value base 2^44 +# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; +# # r^n positions reflect +# # placement in register, not +# # memory, R[3] is R[1]*20 + +$code.=<<___; +.type poly1305_init_base2_44,\@function,3 +.align 32 +poly1305_init_base2_44: + xor %rax,%rax + mov %rax,0($ctx) # initialize hash value + mov %rax,8($ctx) + mov %rax,16($ctx) + +.Linit_base2_44: + lea poly1305_blocks_vpmadd52(%rip),%r10 + lea poly1305_emit_base2_44(%rip),%r11 + + mov \$0x0ffffffc0fffffff,%rax + mov \$0x0ffffffc0ffffffc,%rcx + and 0($inp),%rax + mov \$0x00000fffffffffff,%r8 + and 8($inp),%rcx + mov \$0x00000fffffffffff,%r9 + and %rax,%r8 + shrd \$44,%rcx,%rax + mov %r8,40($ctx) # r0 + and %r9,%rax + shr \$24,%rcx + mov %rax,48($ctx) # r1 + lea (%rax,%rax,4),%rax # *5 + mov %rcx,56($ctx) # r2 + shl \$2,%rax # magic <<2 + lea (%rcx,%rcx,4),%rcx # *5 + shl \$2,%rcx # magic <<2 + mov %rax,24($ctx) # s1 + mov %rcx,32($ctx) # s2 + movq \$-1,64($ctx) # write impossible value +___ +$code.=<<___ if ($flavour !~ /elf32/); + mov %r10,0(%rdx) + mov %r11,8(%rdx) +___ +$code.=<<___ if ($flavour =~ /elf32/); + mov %r10d,0(%rdx) + mov %r11d,4(%rdx) +___ +$code.=<<___; + mov \$1,%eax + ret +.size poly1305_init_base2_44,.-poly1305_init_base2_44 +___ +{ +my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); +my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); +my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52,\@function,4 +.align 32 +poly1305_blocks_vpmadd52: + shr \$4,$len + jz .Lno_data_vpmadd52 # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + + # if powers of the key are not calculated yet, process up to 3 + # blocks with this single-block subroutine, otherwise ensure that + # length is divisible by 2 blocks and pass the rest down to next + # subroutine... + + mov \$3,%rax + mov \$1,%r10 + cmp \$4,$len # is input long + cmovae %r10,%rax + test %r8,%r8 # is power value impossible? + cmovns %r10,%rax + + and $len,%rax # is input of favourable length? + jz .Lblocks_vpmadd52_4x + + sub %rax,$len + mov \$7,%r10d + mov \$1,%r11d + kmovw %r10d,%k7 + lea .L2_44_inp_permd(%rip),%r10 + kmovw %r11d,%k1 + + vmovq $padbit,%x#$PAD + vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd + vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift + vpermq \$0xcf,$PAD,$PAD + vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask + + vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value + vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys + vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} + vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} + + vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt + vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft + + jmp .Loop_vpmadd52 + +.align 32 +.Loop_vpmadd52: + vmovdqu32 0($inp),%x#$T0 # load input as ----3210 + lea 16($inp),$inp + + vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 + vpsrlvq $inp_shift,$T0,$T0 + vpandq $reduc_mask,$T0,$T0 + vporq $PAD,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo # accumulate input + + vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value + vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} + vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} + + vpxord $Dlo,$Dlo,$Dlo + vpxord $Dhi,$Dhi,$Dhi + + vpmadd52luq $r2r1r0,$H0,$Dlo + vpmadd52huq $r2r1r0,$H0,$Dhi + + vpmadd52luq $r1r0s2,$H1,$Dlo + vpmadd52huq $r1r0s2,$H1,$Dhi + + vpmadd52luq $r0s2s1,$H2,$Dlo + vpmadd52huq $r0s2s1,$H2,$Dhi + + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword + vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword + vpandq $reduc_mask,$Dlo,$Dlo + + vpaddq $T0,$Dhi,$Dhi + + vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword + + vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) + + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word + vpandq $reduc_mask,$Dlo,$Dlo + + vpermq \$0b10010011,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo + + vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} + + vpaddq $T0,$Dlo,$Dlo + vpsllq \$2,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo + + dec %rax # len-=16 + jnz .Loop_vpmadd52 + + vmovdqu64 $Dlo,0($ctx){%k7} # store hash value + + test $len,$len + jnz .Lblocks_vpmadd52_4x + +.Lno_data_vpmadd52: + ret +.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 +___ +} +{ +######################################################################## +# As implied by its name 4x subroutine processes 4 blocks in parallel +# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power +# and is handled in 256-bit %ymm registers. + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_4x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_4x: + shr \$4,$len + jz .Lno_data_vpmadd52_4x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + +.Lblocks_vpmadd52_4x: + vpbroadcastq $padbit,$PAD + + vmovdqa64 .Lx_mask44(%rip),$mask44 + mov \$5,%eax + vmovdqa64 .Lx_mask42(%rip),$mask42 + kmovw %eax,%k1 # used in 2x path + + test %r8,%r8 # is power value impossible? + js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? + jnz .Lblocks_vpmadd52_2x_do + +.Lblocks_vpmadd52_4x_do: + vpbroadcastq 64($ctx),$R0 # load 4th power of the key + vpbroadcastq 96($ctx),$R1 + vpbroadcastq 128($ctx),$R2 + vpbroadcastq 160($ctx),$S1 + +.Lblocks_vpmadd52_4x_key_loaded: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + test \$7,$len # is len 8*n? + jz .Lblocks_vpmadd52_8x + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 3-1-2-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$4,$len + jz .Ltail_vpmadd52_4x + jmp .Loop_vpmadd52_4x + ud2 + +.align 32 +.Linit_vpmadd52: + vmovq 24($ctx),%x#$S1 # load key + vmovq 56($ctx),%x#$H2 + vmovq 32($ctx),%x#$S2 + vmovq 40($ctx),%x#$R0 + vmovq 48($ctx),%x#$R1 + + vmovdqa $R0,$H0 + vmovdqa $R1,$H1 + vmovdqa $H2,$R2 + + mov \$2,%eax + +.Lmul_init_vpmadd52: + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + dec %eax + jz .Ldone_init_vpmadd52 + + vpunpcklqdq $R1,$H1,$R1 # 1,2 + vpbroadcastq %x#$H1,%x#$H1 # 2,2 + vpunpcklqdq $R2,$H2,$R2 + vpbroadcastq %x#$H2,%x#$H2 + vpunpcklqdq $R0,$H0,$R0 + vpbroadcastq %x#$H0,%x#$H0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R1,$S1,$S1 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S1,$S1 + vpsllq \$2,$S2,$S2 + + jmp .Lmul_init_vpmadd52 + ud2 + +.align 32 +.Ldone_init_vpmadd52: + vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 + vinserti128 \$1,%x#$R2,$H2,$R2 + vinserti128 \$1,%x#$R0,$H0,$R0 + + vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 + vpermq \$0b11011000,$R2,$R2 + vpermq \$0b11011000,$R0,$R0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpaddq $R1,$S1,$S1 + vpsllq \$2,$S1,$S1 + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? + jnz .Ldone_init_vpmadd52_2x + + vmovdqu64 $R0,64($ctx) # save key powers + vpbroadcastq %x#$R0,$R0 # broadcast 4th power + vmovdqu64 $R1,96($ctx) + vpbroadcastq %x#$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpbroadcastq %x#$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpbroadcastq %x#$S1,$S1 + + jmp .Lblocks_vpmadd52_4x_key_loaded + ud2 + +.align 32 +.Ldone_init_vpmadd52_2x: + vmovdqu64 $R0,64($ctx) # save key powers + vpsrldq \$8,$R0,$R0 # 0-1-0-2 + vmovdqu64 $R1,96($ctx) + vpsrldq \$8,$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpsrldq \$8,$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpsrldq \$8,$S1,$S1 + jmp .Lblocks_vpmadd52_2x_key_loaded + ud2 + +.align 32 +.Lblocks_vpmadd52_2x_do: + vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers + vmovdqu64 160+8($ctx),${S1}{%k1}{z} + vmovdqu64 64+8($ctx),${R0}{%k1}{z} + vmovdqu64 96+8($ctx),${R1}{%k1}{z} + +.Lblocks_vpmadd52_2x_key_loaded: + vmovdqu64 16*0($inp),$T2 # load data + vpxorq $T3,$T3,$T3 + lea 16*2($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as x-1-x-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + jmp .Ltail_vpmadd52_2x + ud2 + +.align 32 +.Loop_vpmadd52_4x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$4,$len # len-=64 + jnz .Loop_vpmadd52_4x + +.Ltail_vpmadd52_4x: + vmovdqu64 128($ctx),$R2 # load all key powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + +.Ltail_vpmadd52_2x: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + # at this point $len is + # either 4*n+2 or 0... + sub \$2,$len # len-=32 + ja .Lblocks_vpmadd52_4x_do + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_4x: + ret +.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x +___ +} +{ +######################################################################## +# As implied by its name 8x subroutine processes 8 blocks in parallel... +# This is intermediate version, as it's used only in cases when input +# length is either 8*n, 8*n+1 or 8*n+2... + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); +my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_8x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_8x: + shr \$4,$len + jz .Lno_data_vpmadd52_8x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + + vmovdqa64 .Lx_mask44(%rip),$mask44 + vmovdqa64 .Lx_mask42(%rip),$mask42 + + test %r8,%r8 # is power value impossible? + js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + +.Lblocks_vpmadd52_8x: + ################################################################ + # fist we calculate more key powers + + vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + vpbroadcastq %x#$R2,$RR2 # broadcast 4th power + vpbroadcastq %x#$R0,$RR0 + vpbroadcastq %x#$R1,$RR1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $RR2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $RR2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $RR2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $RR2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $RR2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $RR2,$R0,$D2hi + + vpmadd52luq $RR0,$R0,$D0lo + vpmadd52huq $RR0,$R0,$D0hi + vpmadd52luq $RR0,$R1,$D1lo + vpmadd52huq $RR0,$R1,$D1hi + vpmadd52luq $RR0,$R2,$D2lo + vpmadd52huq $RR0,$R2,$D2hi + + vpmadd52luq $RR1,$S2,$D0lo + vpmadd52huq $RR1,$S2,$D0hi + vpmadd52luq $RR1,$R0,$D1lo + vpmadd52huq $RR1,$R0,$D1hi + vpmadd52luq $RR1,$R1,$D2lo + vpmadd52huq $RR1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$RR0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$RR1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$RR2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + + vpsrlq \$44,$RR0,$tmp # additional step + vpandq $mask44,$RR0,$RR0 + + vpaddq $tmp,$RR1,$RR1 + + ################################################################ + # At this point Rx holds 1324 powers, RRx - 5768, and the goal + # is 15263748, which reflects how data is loaded... + + vpunpcklqdq $R2,$RR2,$T2 # 3748 + vpunpckhqdq $R2,$RR2,$R2 # 1526 + vpunpcklqdq $R0,$RR0,$T0 + vpunpckhqdq $R0,$RR0,$R0 + vpunpcklqdq $R1,$RR1,$T1 + vpunpckhqdq $R1,$RR1,$R1 +___ +######## switch to %zmm +map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); +map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); + +$code.=<<___; + vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 + vshufi64x2 \$0x44,$R0,$T0,$RR0 + vshufi64x2 \$0x44,$R1,$T1,$RR1 + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + + vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 + vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 + vpaddq $RR2,$SS2,$SS2 + vpaddq $RR1,$SS1,$SS1 + vpsllq \$2,$SS2,$SS2 + vpsllq \$2,$SS1,$SS1 + + vpbroadcastq $padbit,$PAD + vpbroadcastq %x#$mask44,$mask44 + vpbroadcastq %x#$mask42,$mask42 + + vpbroadcastq %x#$SS1,$S1 # broadcast 8th power + vpbroadcastq %x#$SS2,$S2 + vpbroadcastq %x#$RR0,$R0 + vpbroadcastq %x#$RR1,$R1 + vpbroadcastq %x#$RR2,$R2 + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 73625140 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$8,$len + jz .Ltail_vpmadd52_8x + jmp .Loop_vpmadd52_8x + +.align 32 +.Loop_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$8,$len # len-=128 + jnz .Loop_vpmadd52_8x + +.Ltail_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$SS1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$SS1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$SS2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$SS2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$RR0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$RR0,$D2hi + + vpmadd52luq $H0,$RR0,$D0lo + vpmadd52huq $H0,$RR0,$D0hi + vpmadd52luq $H0,$RR1,$D1lo + vpmadd52huq $H0,$RR1,$D1hi + vpmadd52luq $H0,$RR2,$D2lo + vpmadd52huq $H0,$RR2,$D2hi + + vpmadd52luq $H1,$SS2,$D0lo + vpmadd52huq $H1,$SS2,$D0hi + vpmadd52luq $H1,$RR0,$D1lo + vpmadd52huq $H1,$RR0,$D1hi + vpmadd52luq $H1,$RR1,$D2lo + vpmadd52huq $H1,$RR1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vextracti64x4 \$1,$D0lo,%y#$T0 + vextracti64x4 \$1,$D0hi,%y#$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vextracti64x4 \$1,$D1lo,%y#$T1 + vextracti64x4 \$1,$D1hi,%y#$H1 + vextracti64x4 \$1,$D2lo,%y#$T2 + vextracti64x4 \$1,$D2hi,%y#$H2 +___ +######## switch back to %ymm +map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); + +$code.=<<___; + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + ################################################################ + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_8x: + ret +.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x +___ +} +$code.=<<___; +.type poly1305_emit_base2_44,\@function,3 +.align 32 +poly1305_emit_base2_44: + mov 0($ctx),%r8 # load hash value + mov 8($ctx),%r9 + mov 16($ctx),%r10 + + mov %r9,%rax + shr \$20,%r9 + shl \$44,%rax + mov %r10,%rcx + shr \$40,%r10 + shl \$24,%rcx + + add %rax,%r8 + adc %rcx,%r9 + adc \$0,%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? + cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + ret +.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 +___ +} } } +} + +if (!$kernel) +{ # chacha20-poly1305 helpers +my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order +$code.=<<___; +.globl xor128_encrypt_n_pad +.type xor128_encrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_encrypt_n_pad: + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_enc + nop +.Loop_enc_xmm: + movdqu ($inp,$otp),%xmm0 + pxor ($otp),%xmm0 + movdqu %xmm0,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_enc_xmm + + and \$15,%r10 # len % 16 + jz .Ldone_enc + +.Ltail_enc: + mov \$16,$len + sub %r10,$len + xor %eax,%eax +.Loop_enc_byte: + mov ($inp,$otp),%al + xor ($otp),%al + mov %al,($out,$otp) + mov %al,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_enc_byte + + xor %eax,%eax +.Loop_enc_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_enc_pad + +.Ldone_enc: + mov $otp,%rax + ret +.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad + +.globl xor128_decrypt_n_pad +.type xor128_decrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_decrypt_n_pad: + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_dec + nop +.Loop_dec_xmm: + movdqu ($inp,$otp),%xmm0 + movdqa ($otp),%xmm1 + pxor %xmm0,%xmm1 + movdqu %xmm1,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_dec_xmm + + pxor %xmm1,%xmm1 + and \$15,%r10 # len % 16 + jz .Ldone_dec + +.Ltail_dec: + mov \$16,$len + sub %r10,$len + xor %eax,%eax + xor %r11,%r11 +.Loop_dec_byte: + mov ($inp,$otp),%r11b + mov ($otp),%al + xor %r11b,%al + mov %al,($out,$otp) + mov %r11b,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_dec_byte + + xor %eax,%eax +.Loop_dec_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_dec_pad + +.Ldone_dec: + mov $otp,%rax + ret +.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad +___ +} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lcommon_seh_tail + + lea 48(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R14 + + jmp .Lcommon_seh_tail +.size se_handler,.-se_handler + +.type avx_handler,\@abi-omnipotent +.align 16 +avx_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 208($context),%rax # pull context->R11 + + lea 0x50(%rax),%rsi + lea 0xf8(%rax),%rax + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size avx_handler,.-avx_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_poly1305_init_x86_64 + .rva .LSEH_end_poly1305_init_x86_64 + .rva .LSEH_info_poly1305_init_x86_64 + + .rva .LSEH_begin_poly1305_blocks_x86_64 + .rva .LSEH_end_poly1305_blocks_x86_64 + .rva .LSEH_info_poly1305_blocks_x86_64 + + .rva .LSEH_begin_poly1305_emit_x86_64 + .rva .LSEH_end_poly1305_emit_x86_64 + .rva .LSEH_info_poly1305_emit_x86_64 +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_poly1305_blocks_avx + .rva .Lbase2_64_avx + .rva .LSEH_info_poly1305_blocks_avx_1 + + .rva .Lbase2_64_avx + .rva .Leven_avx + .rva .LSEH_info_poly1305_blocks_avx_2 + + .rva .Leven_avx + .rva .LSEH_end_poly1305_blocks_avx + .rva .LSEH_info_poly1305_blocks_avx_3 + + .rva .LSEH_begin_poly1305_emit_avx + .rva .LSEH_end_poly1305_emit_avx + .rva .LSEH_info_poly1305_emit_avx +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_poly1305_blocks_avx2 + .rva .Lbase2_64_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_1 + + .rva .Lbase2_64_avx2 + .rva .Leven_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_2 + + .rva .Leven_avx2 + .rva .LSEH_end_poly1305_blocks_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_3 +___ +$code.=<<___ if ($avx>2); + .rva .LSEH_begin_poly1305_blocks_avx512 + .rva .LSEH_end_poly1305_blocks_avx512 + .rva .LSEH_info_poly1305_blocks_avx512 +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_poly1305_init_x86_64: + .byte 9,0,0,0 + .rva se_handler + .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64 + +.LSEH_info_poly1305_blocks_x86_64: + .byte 9,0,0,0 + .rva se_handler + .rva .Lblocks_body,.Lblocks_epilogue + +.LSEH_info_poly1305_emit_x86_64: + .byte 9,0,0,0 + .rva se_handler + .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 +___ +$code.=<<___ if ($avx); +.LSEH_info_poly1305_blocks_avx_1: + .byte 9,0,0,0 + .rva se_handler + .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx_2: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx_3: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_emit_avx: + .byte 9,0,0,0 + .rva se_handler + .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx +___ +$code.=<<___ if ($avx>1); +.LSEH_info_poly1305_blocks_avx2_1: + .byte 9,0,0,0 + .rva se_handler + .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx2_2: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx2_3: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] +___ +$code.=<<___ if ($avx>2); +.LSEH_info_poly1305_blocks_avx512: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] +___ +} + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach (split('\n',$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + s/%r([a-z]+)#d/%e$1/g; + s/%r([0-9]+)#d/%r$1d/g; + s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; + + if ($kernel) { + s/(^\.type.*),[0-9]+$/\1/; + s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; + next if /^\.cfi.*/; + } + + print $_,"\n"; +} +close STDOUT; diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 4a1c05dce950..79bb58737d52 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -1,131 +1,176 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-License-Identifier: GPL-2.0 OR MIT /* - * Poly1305 authenticator algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include <crypto/algapi.h> #include <crypto/internal/hash.h> +#include <crypto/internal/poly1305.h> #include <crypto/internal/simd.h> -#include <crypto/poly1305.h> #include <linux/crypto.h> +#include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> +#include <asm/intel-family.h> #include <asm/simd.h> -struct poly1305_simd_desc_ctx { - struct poly1305_desc_ctx base; - /* derived key u set? */ - bool uset; -#ifdef CONFIG_AS_AVX2 - /* derived keys r^3, r^4 set? */ - bool wset; -#endif - /* derived Poly1305 key r^2 */ - u32 u[5]; - /* ... silently appended r^3 and r^4 when using AVX2 */ +asmlinkage void poly1305_init_x86_64(void *ctx, + const u8 key[POLY1305_KEY_SIZE]); +asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, + const size_t len, const u32 padbit); +asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, + const size_t len, const u32 padbit); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); + +struct poly1305_arch_internal { + union { + struct { + u32 h[5]; + u32 is_base2_26; + }; + u64 hs[3]; + }; + u64 r[2]; + u64 pad; + struct { u32 r2, r1, r4, r3; } rn[9]; }; -asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src, - const u32 *r, unsigned int blocks); -asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r, - unsigned int blocks, const u32 *u); -#ifdef CONFIG_AS_AVX2 -asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r, - unsigned int blocks, const u32 *u); -static bool poly1305_use_avx2; -#endif - -static int poly1305_simd_init(struct shash_desc *desc) +/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit + * the unfortunate situation of using AVX and then having to go back to scalar + * -- because the user is silly and has called the update function from two + * separate contexts -- then we need to convert back to the original base before + * proceeding. It is possible to reason that the initial reduction below is + * sufficient given the implementation invariants. However, for an avoidance of + * doubt and because this is not performance critical, we do the full reduction + * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py + */ +static void convert_to_base2_64(void *ctx) { - struct poly1305_simd_desc_ctx *sctx = shash_desc_ctx(desc); + struct poly1305_arch_internal *state = ctx; + u32 cy; - sctx->uset = false; -#ifdef CONFIG_AS_AVX2 - sctx->wset = false; -#endif + if (!state->is_base2_26) + return; - return crypto_poly1305_init(desc); + cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; + cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; + cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; + cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; + state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; + state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); + state->hs[2] = state->h[4] >> 24; +#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) + cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); + state->hs[2] &= 3; + state->hs[0] += cy; + state->hs[1] += (cy = ULT(state->hs[0], cy)); + state->hs[2] += ULT(state->hs[1], cy); +#undef ULT + state->is_base2_26 = 0; } -static void poly1305_simd_mult(u32 *a, const u32 *b) +static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE]) { - u8 m[POLY1305_BLOCK_SIZE]; - - memset(m, 0, sizeof(m)); - /* The poly1305 block function adds a hi-bit to the accumulator which - * we don't need for key multiplication; compensate for it. */ - a[4] -= 1 << 24; - poly1305_block_sse2(a, m, b, 1); + poly1305_init_x86_64(ctx, key); } -static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, - const u8 *src, unsigned int srclen) +static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, + const u32 padbit) { - struct poly1305_simd_desc_ctx *sctx; - unsigned int blocks, datalen; + struct poly1305_arch_internal *state = ctx; - BUILD_BUG_ON(offsetof(struct poly1305_simd_desc_ctx, base)); - sctx = container_of(dctx, struct poly1305_simd_desc_ctx, base); + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE || + PAGE_SIZE % POLY1305_BLOCK_SIZE); - if (unlikely(!dctx->sset)) { - datalen = crypto_poly1305_setdesckey(dctx, src, srclen); - src += srclen - datalen; - srclen = datalen; + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) || + (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || + !crypto_simd_usable()) { + convert_to_base2_64(ctx); + poly1305_blocks_x86_64(ctx, inp, len, padbit); + return; } -#ifdef CONFIG_AS_AVX2 - if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) { - if (unlikely(!sctx->wset)) { - if (!sctx->uset) { - memcpy(sctx->u, dctx->r.r, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u, dctx->r.r); - sctx->uset = true; - } - memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u + 5, dctx->r.r); - memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u + 10, dctx->r.r); - sctx->wset = true; - } - blocks = srclen / (POLY1305_BLOCK_SIZE * 4); - poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks, - sctx->u); - src += POLY1305_BLOCK_SIZE * 4 * blocks; - srclen -= POLY1305_BLOCK_SIZE * 4 * blocks; - } -#endif - if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) { - if (unlikely(!sctx->uset)) { - memcpy(sctx->u, dctx->r.r, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u, dctx->r.r); - sctx->uset = true; - } - blocks = srclen / (POLY1305_BLOCK_SIZE * 2); - poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks, - sctx->u); - src += POLY1305_BLOCK_SIZE * 2 * blocks; - srclen -= POLY1305_BLOCK_SIZE * 2 * blocks; - } - if (srclen >= POLY1305_BLOCK_SIZE) { - poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1); - srclen -= POLY1305_BLOCK_SIZE; + for (;;) { + const size_t bytes = min_t(size_t, len, PAGE_SIZE); + + kernel_fpu_begin(); + if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) + poly1305_blocks_avx512(ctx, inp, bytes, padbit); + else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2)) + poly1305_blocks_avx2(ctx, inp, bytes, padbit); + else + poly1305_blocks_avx(ctx, inp, bytes, padbit); + kernel_fpu_end(); + len -= bytes; + if (!len) + break; + inp += bytes; } - return srclen; } -static int poly1305_simd_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) +static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]) { - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - unsigned int bytes; + if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx)) + poly1305_emit_x86_64(ctx, mac, nonce); + else + poly1305_emit_avx(ctx, mac, nonce); +} - /* kernel_fpu_begin/end is costly, use fallback for small updates */ - if (srclen <= 288 || !crypto_simd_usable()) - return crypto_poly1305_update(desc, src, srclen); +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) +{ + poly1305_simd_init(&dctx->h, key); + dctx->s[0] = get_unaligned_le32(&key[16]); + dctx->s[1] = get_unaligned_le32(&key[20]); + dctx->s[2] = get_unaligned_le32(&key[24]); + dctx->s[3] = get_unaligned_le32(&key[28]); + dctx->buflen = 0; + dctx->sset = true; +} +EXPORT_SYMBOL(poly1305_init_arch); + +static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, + const u8 *inp, unsigned int len) +{ + unsigned int acc = 0; + if (unlikely(!dctx->sset)) { + if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) { + poly1305_simd_init(&dctx->h, inp); + inp += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + acc += POLY1305_BLOCK_SIZE; + dctx->rset = 1; + } + if (len >= POLY1305_BLOCK_SIZE) { + dctx->s[0] = get_unaligned_le32(&inp[0]); + dctx->s[1] = get_unaligned_le32(&inp[4]); + dctx->s[2] = get_unaligned_le32(&inp[8]); + dctx->s[3] = get_unaligned_le32(&inp[12]); + inp += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + acc += POLY1305_BLOCK_SIZE; + dctx->sset = true; + } + } + return acc; +} - kernel_fpu_begin(); +void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, + unsigned int srclen) +{ + unsigned int bytes, used; if (unlikely(dctx->buflen)) { bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); @@ -135,34 +180,76 @@ static int poly1305_simd_update(struct shash_desc *desc, dctx->buflen += bytes; if (dctx->buflen == POLY1305_BLOCK_SIZE) { - poly1305_simd_blocks(dctx, dctx->buf, - POLY1305_BLOCK_SIZE); + if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE))) + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1); dctx->buflen = 0; } } if (likely(srclen >= POLY1305_BLOCK_SIZE)) { - bytes = poly1305_simd_blocks(dctx, src, srclen); - src += srclen - bytes; - srclen = bytes; + bytes = round_down(srclen, POLY1305_BLOCK_SIZE); + srclen -= bytes; + used = crypto_poly1305_setdctxkey(dctx, src, bytes); + if (likely(bytes - used)) + poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1); + src += bytes; } - kernel_fpu_end(); - if (unlikely(srclen)) { dctx->buflen = srclen; memcpy(dctx->buf, src, srclen); } +} +EXPORT_SYMBOL(poly1305_update_arch); + +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) +{ + if (unlikely(dctx->buflen)) { + dctx->buf[dctx->buflen++] = 1; + memset(dctx->buf + dctx->buflen, 0, + POLY1305_BLOCK_SIZE - dctx->buflen); + poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); + } + + poly1305_simd_emit(&dctx->h, dst, dctx->s); + *dctx = (struct poly1305_desc_ctx){}; +} +EXPORT_SYMBOL(poly1305_final_arch); + +static int crypto_poly1305_init(struct shash_desc *desc) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + *dctx = (struct poly1305_desc_ctx){}; + return 0; +} + +static int crypto_poly1305_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + poly1305_update_arch(dctx, src, srclen); + return 0; +} + +static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + if (unlikely(!dctx->sset)) + return -ENOKEY; + poly1305_final_arch(dctx, dst); return 0; } static struct shash_alg alg = { .digestsize = POLY1305_DIGEST_SIZE, - .init = poly1305_simd_init, - .update = poly1305_simd_update, + .init = crypto_poly1305_init, + .update = crypto_poly1305_update, .final = crypto_poly1305_final, - .descsize = sizeof(struct poly1305_simd_desc_ctx), + .descsize = sizeof(struct poly1305_desc_ctx), .base = { .cra_name = "poly1305", .cra_driver_name = "poly1305-simd", @@ -174,30 +261,33 @@ static struct shash_alg alg = { static int __init poly1305_simd_mod_init(void) { - if (!boot_cpu_has(X86_FEATURE_XMM2)) - return -ENODEV; - -#ifdef CONFIG_AS_AVX2 - poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); - alg.descsize = sizeof(struct poly1305_simd_desc_ctx); - if (poly1305_use_avx2) - alg.descsize += 10 * sizeof(u32); -#endif - return crypto_register_shash(&alg); + if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx); + if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx2); + if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && + /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ + boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X) + static_branch_enable(&poly1305_use_avx512); + return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0; } static void __exit poly1305_simd_mod_exit(void) { - crypto_unregister_shash(&alg); + if (IS_REACHABLE(CONFIG_CRYPTO_HASH)) + crypto_unregister_shash(&alg); } module_init(poly1305_simd_mod_init); module_exit(poly1305_simd_mod_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); +MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); MODULE_DESCRIPTION("Poly1305 authenticator"); MODULE_ALIAS_CRYPTO("poly1305"); MODULE_ALIAS_CRYPTO("poly1305-simd"); diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index ddc51dbba3af..ba9e4c1e7f5c 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -555,7 +555,7 @@ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) .align 8 -__serpent_enc_blk8_avx: +SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx) /* input: * %rdi: ctx, CTX * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks @@ -606,10 +606,10 @@ __serpent_enc_blk8_avx: write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); ret; -ENDPROC(__serpent_enc_blk8_avx) +SYM_FUNC_END(__serpent_enc_blk8_avx) .align 8 -__serpent_dec_blk8_avx: +SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx) /* input: * %rdi: ctx, CTX * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks @@ -660,9 +660,9 @@ __serpent_dec_blk8_avx: write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); ret; -ENDPROC(__serpent_dec_blk8_avx) +SYM_FUNC_END(__serpent_dec_blk8_avx) -ENTRY(serpent_ecb_enc_8way_avx) +SYM_FUNC_START(serpent_ecb_enc_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -678,9 +678,9 @@ ENTRY(serpent_ecb_enc_8way_avx) FRAME_END ret; -ENDPROC(serpent_ecb_enc_8way_avx) +SYM_FUNC_END(serpent_ecb_enc_8way_avx) -ENTRY(serpent_ecb_dec_8way_avx) +SYM_FUNC_START(serpent_ecb_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -696,9 +696,9 @@ ENTRY(serpent_ecb_dec_8way_avx) FRAME_END ret; -ENDPROC(serpent_ecb_dec_8way_avx) +SYM_FUNC_END(serpent_ecb_dec_8way_avx) -ENTRY(serpent_cbc_dec_8way_avx) +SYM_FUNC_START(serpent_cbc_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -714,9 +714,9 @@ ENTRY(serpent_cbc_dec_8way_avx) FRAME_END ret; -ENDPROC(serpent_cbc_dec_8way_avx) +SYM_FUNC_END(serpent_cbc_dec_8way_avx) -ENTRY(serpent_ctr_8way_avx) +SYM_FUNC_START(serpent_ctr_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -734,9 +734,9 @@ ENTRY(serpent_ctr_8way_avx) FRAME_END ret; -ENDPROC(serpent_ctr_8way_avx) +SYM_FUNC_END(serpent_ctr_8way_avx) -ENTRY(serpent_xts_enc_8way_avx) +SYM_FUNC_START(serpent_xts_enc_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -756,9 +756,9 @@ ENTRY(serpent_xts_enc_8way_avx) FRAME_END ret; -ENDPROC(serpent_xts_enc_8way_avx) +SYM_FUNC_END(serpent_xts_enc_8way_avx) -ENTRY(serpent_xts_dec_8way_avx) +SYM_FUNC_START(serpent_xts_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -778,4 +778,4 @@ ENTRY(serpent_xts_dec_8way_avx) FRAME_END ret; -ENDPROC(serpent_xts_dec_8way_avx) +SYM_FUNC_END(serpent_xts_dec_8way_avx) diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S index 37bc1d48106c..c9648aeae705 100644 --- a/arch/x86/crypto/serpent-avx2-asm_64.S +++ b/arch/x86/crypto/serpent-avx2-asm_64.S @@ -561,7 +561,7 @@ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) .align 8 -__serpent_enc_blk16: +SYM_FUNC_START_LOCAL(__serpent_enc_blk16) /* input: * %rdi: ctx, CTX * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext @@ -612,10 +612,10 @@ __serpent_enc_blk16: write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); ret; -ENDPROC(__serpent_enc_blk16) +SYM_FUNC_END(__serpent_enc_blk16) .align 8 -__serpent_dec_blk16: +SYM_FUNC_START_LOCAL(__serpent_dec_blk16) /* input: * %rdi: ctx, CTX * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext @@ -666,9 +666,9 @@ __serpent_dec_blk16: write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); ret; -ENDPROC(__serpent_dec_blk16) +SYM_FUNC_END(__serpent_dec_blk16) -ENTRY(serpent_ecb_enc_16way) +SYM_FUNC_START(serpent_ecb_enc_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -688,9 +688,9 @@ ENTRY(serpent_ecb_enc_16way) FRAME_END ret; -ENDPROC(serpent_ecb_enc_16way) +SYM_FUNC_END(serpent_ecb_enc_16way) -ENTRY(serpent_ecb_dec_16way) +SYM_FUNC_START(serpent_ecb_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -710,9 +710,9 @@ ENTRY(serpent_ecb_dec_16way) FRAME_END ret; -ENDPROC(serpent_ecb_dec_16way) +SYM_FUNC_END(serpent_ecb_dec_16way) -ENTRY(serpent_cbc_dec_16way) +SYM_FUNC_START(serpent_cbc_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -733,9 +733,9 @@ ENTRY(serpent_cbc_dec_16way) FRAME_END ret; -ENDPROC(serpent_cbc_dec_16way) +SYM_FUNC_END(serpent_cbc_dec_16way) -ENTRY(serpent_ctr_16way) +SYM_FUNC_START(serpent_ctr_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -758,9 +758,9 @@ ENTRY(serpent_ctr_16way) FRAME_END ret; -ENDPROC(serpent_ctr_16way) +SYM_FUNC_END(serpent_ctr_16way) -ENTRY(serpent_xts_enc_16way) +SYM_FUNC_START(serpent_xts_enc_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -784,9 +784,9 @@ ENTRY(serpent_xts_enc_16way) FRAME_END ret; -ENDPROC(serpent_xts_enc_16way) +SYM_FUNC_END(serpent_xts_enc_16way) -ENTRY(serpent_xts_dec_16way) +SYM_FUNC_START(serpent_xts_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -810,4 +810,4 @@ ENTRY(serpent_xts_dec_16way) FRAME_END ret; -ENDPROC(serpent_xts_dec_16way) +SYM_FUNC_END(serpent_xts_dec_16way) diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S index e5c4a4690ca9..6379b99cb722 100644 --- a/arch/x86/crypto/serpent-sse2-i586-asm_32.S +++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S @@ -497,7 +497,7 @@ pxor t0, x3; \ movdqu x3, (3*4*4)(out); -ENTRY(__serpent_enc_blk_4way) +SYM_FUNC_START(__serpent_enc_blk_4way) /* input: * arg_ctx(%esp): ctx, CTX * arg_dst(%esp): dst @@ -559,9 +559,9 @@ ENTRY(__serpent_enc_blk_4way) xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); ret; -ENDPROC(__serpent_enc_blk_4way) +SYM_FUNC_END(__serpent_enc_blk_4way) -ENTRY(serpent_dec_blk_4way) +SYM_FUNC_START(serpent_dec_blk_4way) /* input: * arg_ctx(%esp): ctx, CTX * arg_dst(%esp): dst @@ -613,4 +613,4 @@ ENTRY(serpent_dec_blk_4way) write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA); ret; -ENDPROC(serpent_dec_blk_4way) +SYM_FUNC_END(serpent_dec_blk_4way) diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S index 5e0b3a3e97af..efb6dc17dc90 100644 --- a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S @@ -619,7 +619,7 @@ pxor t0, x3; \ movdqu x3, (3*4*4)(out); -ENTRY(__serpent_enc_blk_8way) +SYM_FUNC_START(__serpent_enc_blk_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -682,9 +682,9 @@ ENTRY(__serpent_enc_blk_8way) xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); ret; -ENDPROC(__serpent_enc_blk_8way) +SYM_FUNC_END(__serpent_enc_blk_8way) -ENTRY(serpent_dec_blk_8way) +SYM_FUNC_START(serpent_dec_blk_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -736,4 +736,4 @@ ENTRY(serpent_dec_blk_8way) write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); ret; -ENDPROC(serpent_dec_blk_8way) +SYM_FUNC_END(serpent_dec_blk_8way) diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index b871728e0b2f..f973ace44ad3 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -19,18 +19,16 @@ #define SERPENT_AVX2_PARALLEL_BLOCKS 16 /* 16-way AVX2 parallel cipher functions */ -asmlinkage void serpent_ecb_enc_16way(struct serpent_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void serpent_ecb_dec_16way(struct serpent_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void serpent_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src); +asmlinkage void serpent_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void serpent_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void serpent_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void serpent_ctr_16way(void *ctx, u128 *dst, const u128 *src, +asmlinkage void serpent_ctr_16way(const void *ctx, u8 *dst, const u8 *src, le128 *iv); -asmlinkage void serpent_xts_enc_16way(struct serpent_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void serpent_xts_dec_16way(struct serpent_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void serpent_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); +asmlinkage void serpent_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -44,13 +42,13 @@ static const struct common_glue_ctx serpent_enc = { .funcs = { { .num_blocks = 16, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_16way) } + .fn_u = { .ecb = serpent_ecb_enc_16way } }, { .num_blocks = 8, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) } + .fn_u = { .ecb = serpent_ecb_enc_8way_avx } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } + .fn_u = { .ecb = __serpent_encrypt } } } }; @@ -60,13 +58,13 @@ static const struct common_glue_ctx serpent_ctr = { .funcs = { { .num_blocks = 16, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_16way) } + .fn_u = { .ctr = serpent_ctr_16way } }, { .num_blocks = 8, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } + .fn_u = { .ctr = serpent_ctr_8way_avx } }, { .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) } + .fn_u = { .ctr = __serpent_crypt_ctr } } } }; @@ -76,13 +74,13 @@ static const struct common_glue_ctx serpent_enc_xts = { .funcs = { { .num_blocks = 16, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_16way) } + .fn_u = { .xts = serpent_xts_enc_16way } }, { .num_blocks = 8, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) } + .fn_u = { .xts = serpent_xts_enc_8way_avx } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) } + .fn_u = { .xts = serpent_xts_enc } } } }; @@ -92,13 +90,13 @@ static const struct common_glue_ctx serpent_dec = { .funcs = { { .num_blocks = 16, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_16way) } + .fn_u = { .ecb = serpent_ecb_dec_16way } }, { .num_blocks = 8, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) } + .fn_u = { .ecb = serpent_ecb_dec_8way_avx } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } + .fn_u = { .ecb = __serpent_decrypt } } } }; @@ -108,13 +106,13 @@ static const struct common_glue_ctx serpent_dec_cbc = { .funcs = { { .num_blocks = 16, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_16way) } + .fn_u = { .cbc = serpent_cbc_dec_16way } }, { .num_blocks = 8, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) } + .fn_u = { .cbc = serpent_cbc_dec_8way_avx } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } + .fn_u = { .cbc = __serpent_decrypt } } } }; @@ -124,13 +122,13 @@ static const struct common_glue_ctx serpent_dec_xts = { .funcs = { { .num_blocks = 16, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_16way) } + .fn_u = { .xts = serpent_xts_dec_16way } }, { .num_blocks = 8, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) } + .fn_u = { .xts = serpent_xts_dec_8way_avx } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) } + .fn_u = { .xts = serpent_xts_dec } } } }; @@ -146,8 +144,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__serpent_encrypt), - req); + return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req); } static int cbc_decrypt(struct skcipher_request *req) @@ -166,8 +163,8 @@ static int xts_encrypt(struct skcipher_request *req) struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); return glue_xts_req_128bit(&serpent_enc_xts, req, - XTS_TWEAK_CAST(__serpent_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx); + __serpent_encrypt, &ctx->tweak_ctx, + &ctx->crypt_ctx, false); } static int xts_decrypt(struct skcipher_request *req) @@ -176,8 +173,8 @@ static int xts_decrypt(struct skcipher_request *req) struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); return glue_xts_req_128bit(&serpent_dec_xts, req, - XTS_TWEAK_CAST(__serpent_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx); + __serpent_encrypt, &ctx->tweak_ctx, + &ctx->crypt_ctx, true); } static struct skcipher_alg serpent_algs[] = { diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 4a9a9f2ee1d8..7806d1cbe854 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -20,33 +20,35 @@ #include <asm/crypto/serpent-avx.h> /* 8-way parallel cipher functions */ -asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(serpent_ecb_enc_8way_avx); -asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(serpent_ecb_dec_8way_avx); -asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx); -asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void serpent_ctr_8way_avx(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx); -asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_xts_enc_8way_avx(const void *ctx, u8 *dst, const u8 *src, le128 *iv); EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx); -asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_xts_dec_8way_avx(const void *ctx, u8 *dst, const u8 *src, le128 *iv); EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx); -void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) +void __serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblk; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; le128_to_be128(&ctrblk, iv); le128_inc(iv); @@ -56,17 +58,15 @@ void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) } EXPORT_SYMBOL_GPL(__serpent_crypt_ctr); -void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +void serpent_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, - GLUE_FUNC_CAST(__serpent_encrypt)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_encrypt); } EXPORT_SYMBOL_GPL(serpent_xts_enc); -void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +void serpent_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, - GLUE_FUNC_CAST(__serpent_decrypt)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_decrypt); } EXPORT_SYMBOL_GPL(serpent_xts_dec); @@ -102,10 +102,10 @@ static const struct common_glue_ctx serpent_enc = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) } + .fn_u = { .ecb = serpent_ecb_enc_8way_avx } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } + .fn_u = { .ecb = __serpent_encrypt } } } }; @@ -115,10 +115,10 @@ static const struct common_glue_ctx serpent_ctr = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } + .fn_u = { .ctr = serpent_ctr_8way_avx } }, { .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) } + .fn_u = { .ctr = __serpent_crypt_ctr } } } }; @@ -128,10 +128,10 @@ static const struct common_glue_ctx serpent_enc_xts = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) } + .fn_u = { .xts = serpent_xts_enc_8way_avx } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) } + .fn_u = { .xts = serpent_xts_enc } } } }; @@ -141,10 +141,10 @@ static const struct common_glue_ctx serpent_dec = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) } + .fn_u = { .ecb = serpent_ecb_dec_8way_avx } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } + .fn_u = { .ecb = __serpent_decrypt } } } }; @@ -154,10 +154,10 @@ static const struct common_glue_ctx serpent_dec_cbc = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) } + .fn_u = { .cbc = serpent_cbc_dec_8way_avx } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } + .fn_u = { .cbc = __serpent_decrypt } } } }; @@ -167,10 +167,10 @@ static const struct common_glue_ctx serpent_dec_xts = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) } + .fn_u = { .xts = serpent_xts_dec_8way_avx } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) } + .fn_u = { .xts = serpent_xts_dec } } } }; @@ -186,8 +186,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__serpent_encrypt), - req); + return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req); } static int cbc_decrypt(struct skcipher_request *req) @@ -206,8 +205,8 @@ static int xts_encrypt(struct skcipher_request *req) struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); return glue_xts_req_128bit(&serpent_enc_xts, req, - XTS_TWEAK_CAST(__serpent_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx); + __serpent_encrypt, &ctx->tweak_ctx, + &ctx->crypt_ctx, false); } static int xts_decrypt(struct skcipher_request *req) @@ -216,8 +215,8 @@ static int xts_decrypt(struct skcipher_request *req) struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); return glue_xts_req_128bit(&serpent_dec_xts, req, - XTS_TWEAK_CAST(__serpent_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx); + __serpent_encrypt, &ctx->tweak_ctx, + &ctx->crypt_ctx, true); } static struct skcipher_alg serpent_algs[] = { diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 5fdf1931d069..4fed8d26b91a 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -31,9 +31,11 @@ static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); } -static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) +static void serpent_decrypt_cbc_xway(const void *ctx, u8 *d, const u8 *s) { u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; unsigned int j; for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) @@ -45,9 +47,11 @@ static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); } -static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblk; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; le128_to_be128(&ctrblk, iv); le128_inc(iv); @@ -56,10 +60,12 @@ static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) u128_xor(dst, src, (u128 *)&ctrblk); } -static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, +static void serpent_crypt_ctr_xway(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; unsigned int i; for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { @@ -79,10 +85,10 @@ static const struct common_glue_ctx serpent_enc = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) } + .fn_u = { .ecb = serpent_enc_blk_xway } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } + .fn_u = { .ecb = __serpent_encrypt } } } }; @@ -92,10 +98,10 @@ static const struct common_glue_ctx serpent_ctr = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) } + .fn_u = { .ctr = serpent_crypt_ctr_xway } }, { .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } + .fn_u = { .ctr = serpent_crypt_ctr } } } }; @@ -105,10 +111,10 @@ static const struct common_glue_ctx serpent_dec = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) } + .fn_u = { .ecb = serpent_dec_blk_xway } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } + .fn_u = { .ecb = __serpent_decrypt } } } }; @@ -118,10 +124,10 @@ static const struct common_glue_ctx serpent_dec_cbc = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) } + .fn_u = { .cbc = serpent_decrypt_cbc_xway } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } + .fn_u = { .cbc = __serpent_decrypt } } } }; @@ -137,7 +143,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__serpent_encrypt), + return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req); } diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S index 9f712a7dfd79..1e594d60afa5 100644 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -62,11 +62,11 @@ *Visit http://software.intel.com/en-us/articles/ *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/ * - *Updates 20-byte SHA-1 record in 'hash' for even number of - *'num_blocks' consecutive 64-byte blocks + *Updates 20-byte SHA-1 record at start of 'state', from 'input', for + *even number of 'blocks' consecutive 64-byte blocks. * *extern "C" void sha1_transform_avx2( - * int *hash, const char* input, size_t num_blocks ); + * struct sha1_state *state, const u8* input, int blocks ); */ #include <linux/linkage.h> @@ -634,7 +634,7 @@ _loop3: * param: function's name */ .macro SHA1_VECTOR_ASM name - ENTRY(\name) + SYM_FUNC_START(\name) push %rbx push %r12 @@ -676,7 +676,7 @@ _loop3: ret - ENDPROC(\name) + SYM_FUNC_END(\name) .endm .section .rodata diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S index ebbdba72ae07..11efe3a45a1f 100644 --- a/arch/x86/crypto/sha1_ni_asm.S +++ b/arch/x86/crypto/sha1_ni_asm.S @@ -95,7 +95,7 @@ */ .text .align 32 -ENTRY(sha1_ni_transform) +SYM_FUNC_START(sha1_ni_transform) mov %rsp, RSPSAVE sub $FRAME_SIZE, %rsp and $~0xF, %rsp @@ -291,7 +291,7 @@ ENTRY(sha1_ni_transform) mov RSPSAVE, %rsp ret -ENDPROC(sha1_ni_transform) +SYM_FUNC_END(sha1_ni_transform) .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 .align 16 diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S index 99c5b8c4dc38..12e2d19d7402 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -67,7 +67,7 @@ * param: function's name */ .macro SHA1_VECTOR_ASM name - ENTRY(\name) + SYM_FUNC_START(\name) push %rbx push %r12 @@ -101,7 +101,7 @@ pop %rbx ret - ENDPROC(\name) + SYM_FUNC_END(\name) .endm /* @@ -457,9 +457,13 @@ W_PRECALC_SSSE3 movdqu \a,\b .endm -/* SSSE3 optimized implementation: - * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws, - * unsigned int rounds); +/* + * SSSE3 optimized implementation: + * + * extern "C" void sha1_transform_ssse3(struct sha1_state *state, + * const u8 *data, int blocks); + * + * Note that struct sha1_state is assumed to begin with u32 state[5]. */ SHA1_VECTOR_ASM sha1_transform_ssse3 @@ -545,8 +549,8 @@ W_PRECALC_AVX /* AVX optimized implementation: - * extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws, - * unsigned int rounds); + * extern "C" void sha1_transform_avx(struct sha1_state *state, + * const u8 *data, int blocks); */ SHA1_VECTOR_ASM sha1_transform_avx diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 639d4c2fd6a8..d70b40ad594c 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -27,11 +27,8 @@ #include <crypto/sha1_base.h> #include <asm/simd.h> -typedef void (sha1_transform_fn)(u32 *digest, const char *data, - unsigned int rounds); - static int sha1_update(struct shash_desc *desc, const u8 *data, - unsigned int len, sha1_transform_fn *sha1_xform) + unsigned int len, sha1_block_fn *sha1_xform) { struct sha1_state *sctx = shash_desc_ctx(desc); @@ -39,48 +36,47 @@ static int sha1_update(struct shash_desc *desc, const u8 *data, (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) return crypto_sha1_update(desc, data, len); - /* make sure casting to sha1_block_fn() is safe */ + /* + * Make sure struct sha1_state begins directly with the SHA1 + * 160-bit internal state, as this is what the asm functions expect. + */ BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0); kernel_fpu_begin(); - sha1_base_do_update(desc, data, len, - (sha1_block_fn *)sha1_xform); + sha1_base_do_update(desc, data, len, sha1_xform); kernel_fpu_end(); return 0; } static int sha1_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, sha1_transform_fn *sha1_xform) + unsigned int len, u8 *out, sha1_block_fn *sha1_xform) { if (!crypto_simd_usable()) return crypto_sha1_finup(desc, data, len, out); kernel_fpu_begin(); if (len) - sha1_base_do_update(desc, data, len, - (sha1_block_fn *)sha1_xform); - sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_xform); + sha1_base_do_update(desc, data, len, sha1_xform); + sha1_base_do_finalize(desc, sha1_xform); kernel_fpu_end(); return sha1_base_finish(desc, out); } -asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, - unsigned int rounds); +asmlinkage void sha1_transform_ssse3(struct sha1_state *state, + const u8 *data, int blocks); static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha1_update(desc, data, len, - (sha1_transform_fn *) sha1_transform_ssse3); + return sha1_update(desc, data, len, sha1_transform_ssse3); } static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - return sha1_finup(desc, data, len, out, - (sha1_transform_fn *) sha1_transform_ssse3); + return sha1_finup(desc, data, len, out, sha1_transform_ssse3); } /* Add padding and return the message digest. */ @@ -119,21 +115,19 @@ static void unregister_sha1_ssse3(void) } #ifdef CONFIG_AS_AVX -asmlinkage void sha1_transform_avx(u32 *digest, const char *data, - unsigned int rounds); +asmlinkage void sha1_transform_avx(struct sha1_state *state, + const u8 *data, int blocks); static int sha1_avx_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha1_update(desc, data, len, - (sha1_transform_fn *) sha1_transform_avx); + return sha1_update(desc, data, len, sha1_transform_avx); } static int sha1_avx_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - return sha1_finup(desc, data, len, out, - (sha1_transform_fn *) sha1_transform_avx); + return sha1_finup(desc, data, len, out, sha1_transform_avx); } static int sha1_avx_final(struct shash_desc *desc, u8 *out) @@ -190,8 +184,8 @@ static inline void unregister_sha1_avx(void) { } #if defined(CONFIG_AS_AVX2) && (CONFIG_AS_AVX) #define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ -asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, - unsigned int rounds); +asmlinkage void sha1_transform_avx2(struct sha1_state *state, + const u8 *data, int blocks); static bool avx2_usable(void) { @@ -203,28 +197,26 @@ static bool avx2_usable(void) return false; } -static void sha1_apply_transform_avx2(u32 *digest, const char *data, - unsigned int rounds) +static void sha1_apply_transform_avx2(struct sha1_state *state, + const u8 *data, int blocks) { /* Select the optimal transform based on data block size */ - if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE) - sha1_transform_avx2(digest, data, rounds); + if (blocks >= SHA1_AVX2_BLOCK_OPTSIZE) + sha1_transform_avx2(state, data, blocks); else - sha1_transform_avx(digest, data, rounds); + sha1_transform_avx(state, data, blocks); } static int sha1_avx2_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha1_update(desc, data, len, - (sha1_transform_fn *) sha1_apply_transform_avx2); + return sha1_update(desc, data, len, sha1_apply_transform_avx2); } static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - return sha1_finup(desc, data, len, out, - (sha1_transform_fn *) sha1_apply_transform_avx2); + return sha1_finup(desc, data, len, out, sha1_apply_transform_avx2); } static int sha1_avx2_final(struct shash_desc *desc, u8 *out) @@ -267,21 +259,19 @@ static inline void unregister_sha1_avx2(void) { } #endif #ifdef CONFIG_AS_SHA1_NI -asmlinkage void sha1_ni_transform(u32 *digest, const char *data, - unsigned int rounds); +asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data, + int rounds); static int sha1_ni_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha1_update(desc, data, len, - (sha1_transform_fn *) sha1_ni_transform); + return sha1_update(desc, data, len, sha1_ni_transform); } static int sha1_ni_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - return sha1_finup(desc, data, len, out, - (sha1_transform_fn *) sha1_ni_transform); + return sha1_finup(desc, data, len, out, sha1_ni_transform); } static int sha1_ni_final(struct shash_desc *desc, u8 *out) diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index 001bbcf93c79..fcbc30f58c38 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -341,13 +341,13 @@ a = TMP_ .endm ######################################################################## -## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) -## arg 1 : pointer to digest +## void sha256_transform_avx(state sha256_state *state, const u8 *data, int blocks) +## arg 1 : pointer to state ## arg 2 : pointer to input data ## arg 3 : Num blocks ######################################################################## .text -ENTRY(sha256_transform_avx) +SYM_FUNC_START(sha256_transform_avx) .align 32 pushq %rbx pushq %r12 @@ -460,7 +460,7 @@ done_hash: popq %r12 popq %rbx ret -ENDPROC(sha256_transform_avx) +SYM_FUNC_END(sha256_transform_avx) .section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 1420db15dcdd..499d9ec129de 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -520,13 +520,13 @@ STACK_SIZE = _RSP + _RSP_SIZE .endm ######################################################################## -## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) -## arg 1 : pointer to digest +## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks) +## arg 1 : pointer to state ## arg 2 : pointer to input data ## arg 3 : Num blocks ######################################################################## .text -ENTRY(sha256_transform_rorx) +SYM_FUNC_START(sha256_transform_rorx) .align 32 pushq %rbx pushq %r12 @@ -713,7 +713,7 @@ done_hash: popq %r12 popq %rbx ret -ENDPROC(sha256_transform_rorx) +SYM_FUNC_END(sha256_transform_rorx) .section .rodata.cst512.K256, "aM", @progbits, 512 .align 64 diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index c6c05ed2c16a..ddfa863b4ee3 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -347,13 +347,15 @@ a = TMP_ .endm ######################################################################## -## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks) -## arg 1 : pointer to digest +## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data, +## int blocks); +## arg 1 : pointer to state +## (struct sha256_state is assumed to begin with u32 state[8]) ## arg 2 : pointer to input data ## arg 3 : Num blocks ######################################################################## .text -ENTRY(sha256_transform_ssse3) +SYM_FUNC_START(sha256_transform_ssse3) .align 32 pushq %rbx pushq %r12 @@ -471,7 +473,7 @@ done_hash: popq %rbx ret -ENDPROC(sha256_transform_ssse3) +SYM_FUNC_END(sha256_transform_ssse3) .section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S index fb58f58ecfbc..7abade04a3a3 100644 --- a/arch/x86/crypto/sha256_ni_asm.S +++ b/arch/x86/crypto/sha256_ni_asm.S @@ -97,7 +97,7 @@ .text .align 32 -ENTRY(sha256_ni_transform) +SYM_FUNC_START(sha256_ni_transform) shl $6, NUM_BLKS /* convert to bytes */ jz .Ldone_hash @@ -327,7 +327,7 @@ ENTRY(sha256_ni_transform) .Ldone_hash: ret -ENDPROC(sha256_ni_transform) +SYM_FUNC_END(sha256_ni_transform) .section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 73867da3cbee..03ad657c04bd 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -41,12 +41,11 @@ #include <linux/string.h> #include <asm/simd.h> -asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, - u64 rounds); -typedef void (sha256_transform_fn)(u32 *digest, const char *data, u64 rounds); +asmlinkage void sha256_transform_ssse3(struct sha256_state *state, + const u8 *data, int blocks); -static int sha256_update(struct shash_desc *desc, const u8 *data, - unsigned int len, sha256_transform_fn *sha256_xform) +static int _sha256_update(struct shash_desc *desc, const u8 *data, + unsigned int len, sha256_block_fn *sha256_xform) { struct sha256_state *sctx = shash_desc_ctx(desc); @@ -54,28 +53,29 @@ static int sha256_update(struct shash_desc *desc, const u8 *data, (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) return crypto_sha256_update(desc, data, len); - /* make sure casting to sha256_block_fn() is safe */ + /* + * Make sure struct sha256_state begins directly with the SHA256 + * 256-bit internal state, as this is what the asm functions expect. + */ BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0); kernel_fpu_begin(); - sha256_base_do_update(desc, data, len, - (sha256_block_fn *)sha256_xform); + sha256_base_do_update(desc, data, len, sha256_xform); kernel_fpu_end(); return 0; } static int sha256_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, sha256_transform_fn *sha256_xform) + unsigned int len, u8 *out, sha256_block_fn *sha256_xform) { if (!crypto_simd_usable()) return crypto_sha256_finup(desc, data, len, out); kernel_fpu_begin(); if (len) - sha256_base_do_update(desc, data, len, - (sha256_block_fn *)sha256_xform); - sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_xform); + sha256_base_do_update(desc, data, len, sha256_xform); + sha256_base_do_finalize(desc, sha256_xform); kernel_fpu_end(); return sha256_base_finish(desc, out); @@ -84,7 +84,7 @@ static int sha256_finup(struct shash_desc *desc, const u8 *data, static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha256_update(desc, data, len, sha256_transform_ssse3); + return _sha256_update(desc, data, len, sha256_transform_ssse3); } static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, @@ -145,13 +145,13 @@ static void unregister_sha256_ssse3(void) } #ifdef CONFIG_AS_AVX -asmlinkage void sha256_transform_avx(u32 *digest, const char *data, - u64 rounds); +asmlinkage void sha256_transform_avx(struct sha256_state *state, + const u8 *data, int blocks); static int sha256_avx_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha256_update(desc, data, len, sha256_transform_avx); + return _sha256_update(desc, data, len, sha256_transform_avx); } static int sha256_avx_finup(struct shash_desc *desc, const u8 *data, @@ -227,13 +227,13 @@ static inline void unregister_sha256_avx(void) { } #endif #if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX) -asmlinkage void sha256_transform_rorx(u32 *digest, const char *data, - u64 rounds); +asmlinkage void sha256_transform_rorx(struct sha256_state *state, + const u8 *data, int blocks); static int sha256_avx2_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha256_update(desc, data, len, sha256_transform_rorx); + return _sha256_update(desc, data, len, sha256_transform_rorx); } static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data, @@ -307,13 +307,13 @@ static inline void unregister_sha256_avx2(void) { } #endif #ifdef CONFIG_AS_SHA256_NI -asmlinkage void sha256_ni_transform(u32 *digest, const char *data, - u64 rounds); /*unsigned int rounds);*/ +asmlinkage void sha256_ni_transform(struct sha256_state *digest, + const u8 *data, int rounds); static int sha256_ni_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha256_update(desc, data, len, sha256_ni_transform); + return _sha256_update(desc, data, len, sha256_ni_transform); } static int sha256_ni_finup(struct shash_desc *desc, const u8 *data, diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S index 39235fefe6f7..90ea945ba5e6 100644 --- a/arch/x86/crypto/sha512-avx-asm.S +++ b/arch/x86/crypto/sha512-avx-asm.S @@ -271,13 +271,14 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE .endm ######################################################################## -# void sha512_transform_avx(void* D, const void* M, u64 L) -# Purpose: Updates the SHA512 digest stored at D with the message stored in M. -# The size of the message pointed to by M must be an integer multiple of SHA512 -# message blocks. -# L is the message length in SHA512 blocks +# void sha512_transform_avx(sha512_state *state, const u8 *data, int blocks) +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. +# "blocks" is the message length in SHA512 blocks ######################################################################## -ENTRY(sha512_transform_avx) +SYM_FUNC_START(sha512_transform_avx) cmp $0, msglen je nowork @@ -365,7 +366,7 @@ updateblock: nowork: ret -ENDPROC(sha512_transform_avx) +SYM_FUNC_END(sha512_transform_avx) ######################################################################## ### Binary Data diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index b16d56005162..3dd886b14e7d 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -563,13 +563,14 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE .endm ######################################################################## -# void sha512_transform_rorx(void* D, const void* M, uint64_t L)# -# Purpose: Updates the SHA512 digest stored at D with the message stored in M. -# The size of the message pointed to by M must be an integer multiple of SHA512 -# message blocks. -# L is the message length in SHA512 blocks +# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks) +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. +# "blocks" is the message length in SHA512 blocks ######################################################################## -ENTRY(sha512_transform_rorx) +SYM_FUNC_START(sha512_transform_rorx) # Allocate Stack Space mov %rsp, %rax sub $frame_size, %rsp @@ -682,7 +683,7 @@ done_hash: # Restore Stack Pointer mov frame_RSPSAVE(%rsp), %rsp ret -ENDPROC(sha512_transform_rorx) +SYM_FUNC_END(sha512_transform_rorx) ######################################################################## ### Binary Data diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S index 66bbd9058a90..7946a1bee85b 100644 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ b/arch/x86/crypto/sha512-ssse3-asm.S @@ -269,13 +269,16 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE .endm ######################################################################## -# void sha512_transform_ssse3(void* D, const void* M, u64 L)# -# Purpose: Updates the SHA512 digest stored at D with the message stored in M. -# The size of the message pointed to by M must be an integer multiple of SHA512 -# message blocks. -# L is the message length in SHA512 blocks. +## void sha512_transform_ssse3(struct sha512_state *state, const u8 *data, +## int blocks); +# (struct sha512_state is assumed to begin with u64 state[8]) +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. +# "blocks" is the message length in SHA512 blocks. ######################################################################## -ENTRY(sha512_transform_ssse3) +SYM_FUNC_START(sha512_transform_ssse3) cmp $0, msglen je nowork @@ -364,7 +367,7 @@ updateblock: nowork: ret -ENDPROC(sha512_transform_ssse3) +SYM_FUNC_END(sha512_transform_ssse3) ######################################################################## ### Binary Data diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 458356a3f124..1c444f41037c 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -39,13 +39,11 @@ #include <crypto/sha512_base.h> #include <asm/simd.h> -asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data, - u64 rounds); - -typedef void (sha512_transform_fn)(u64 *digest, const char *data, u64 rounds); +asmlinkage void sha512_transform_ssse3(struct sha512_state *state, + const u8 *data, int blocks); static int sha512_update(struct shash_desc *desc, const u8 *data, - unsigned int len, sha512_transform_fn *sha512_xform) + unsigned int len, sha512_block_fn *sha512_xform) { struct sha512_state *sctx = shash_desc_ctx(desc); @@ -53,28 +51,29 @@ static int sha512_update(struct shash_desc *desc, const u8 *data, (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) return crypto_sha512_update(desc, data, len); - /* make sure casting to sha512_block_fn() is safe */ + /* + * Make sure struct sha512_state begins directly with the SHA512 + * 512-bit internal state, as this is what the asm functions expect. + */ BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0); kernel_fpu_begin(); - sha512_base_do_update(desc, data, len, - (sha512_block_fn *)sha512_xform); + sha512_base_do_update(desc, data, len, sha512_xform); kernel_fpu_end(); return 0; } static int sha512_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, sha512_transform_fn *sha512_xform) + unsigned int len, u8 *out, sha512_block_fn *sha512_xform) { if (!crypto_simd_usable()) return crypto_sha512_finup(desc, data, len, out); kernel_fpu_begin(); if (len) - sha512_base_do_update(desc, data, len, - (sha512_block_fn *)sha512_xform); - sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_xform); + sha512_base_do_update(desc, data, len, sha512_xform); + sha512_base_do_finalize(desc, sha512_xform); kernel_fpu_end(); return sha512_base_finish(desc, out); @@ -144,8 +143,8 @@ static void unregister_sha512_ssse3(void) } #ifdef CONFIG_AS_AVX -asmlinkage void sha512_transform_avx(u64 *digest, const char *data, - u64 rounds); +asmlinkage void sha512_transform_avx(struct sha512_state *state, + const u8 *data, int blocks); static bool avx_usable(void) { if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { @@ -225,8 +224,8 @@ static inline void unregister_sha512_avx(void) { } #endif #if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX) -asmlinkage void sha512_transform_rorx(u64 *digest, const char *data, - u64 rounds); +asmlinkage void sha512_transform_rorx(struct sha512_state *state, + const u8 *data, int blocks); static int sha512_avx2_update(struct shash_desc *desc, const u8 *data, unsigned int len) diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index 698b8f2a56e2..a5151393bb2f 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -234,7 +234,7 @@ vpxor x3, wkey, x3; .align 8 -__twofish_enc_blk8: +SYM_FUNC_START_LOCAL(__twofish_enc_blk8) /* input: * %rdi: ctx, CTX * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks @@ -273,10 +273,10 @@ __twofish_enc_blk8: outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); ret; -ENDPROC(__twofish_enc_blk8) +SYM_FUNC_END(__twofish_enc_blk8) .align 8 -__twofish_dec_blk8: +SYM_FUNC_START_LOCAL(__twofish_dec_blk8) /* input: * %rdi: ctx, CTX * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks @@ -313,9 +313,9 @@ __twofish_dec_blk8: outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); ret; -ENDPROC(__twofish_dec_blk8) +SYM_FUNC_END(__twofish_dec_blk8) -ENTRY(twofish_ecb_enc_8way) +SYM_FUNC_START(twofish_ecb_enc_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -333,9 +333,9 @@ ENTRY(twofish_ecb_enc_8way) FRAME_END ret; -ENDPROC(twofish_ecb_enc_8way) +SYM_FUNC_END(twofish_ecb_enc_8way) -ENTRY(twofish_ecb_dec_8way) +SYM_FUNC_START(twofish_ecb_dec_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -353,9 +353,9 @@ ENTRY(twofish_ecb_dec_8way) FRAME_END ret; -ENDPROC(twofish_ecb_dec_8way) +SYM_FUNC_END(twofish_ecb_dec_8way) -ENTRY(twofish_cbc_dec_8way) +SYM_FUNC_START(twofish_cbc_dec_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -378,9 +378,9 @@ ENTRY(twofish_cbc_dec_8way) FRAME_END ret; -ENDPROC(twofish_cbc_dec_8way) +SYM_FUNC_END(twofish_cbc_dec_8way) -ENTRY(twofish_ctr_8way) +SYM_FUNC_START(twofish_ctr_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -405,9 +405,9 @@ ENTRY(twofish_ctr_8way) FRAME_END ret; -ENDPROC(twofish_ctr_8way) +SYM_FUNC_END(twofish_ctr_8way) -ENTRY(twofish_xts_enc_8way) +SYM_FUNC_START(twofish_xts_enc_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -429,9 +429,9 @@ ENTRY(twofish_xts_enc_8way) FRAME_END ret; -ENDPROC(twofish_xts_enc_8way) +SYM_FUNC_END(twofish_xts_enc_8way) -ENTRY(twofish_xts_dec_8way) +SYM_FUNC_START(twofish_xts_dec_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -453,4 +453,4 @@ ENTRY(twofish_xts_dec_8way) FRAME_END ret; -ENDPROC(twofish_xts_dec_8way) +SYM_FUNC_END(twofish_xts_dec_8way) diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S index 290cc4e9a6fe..a6f09e4f2e46 100644 --- a/arch/x86/crypto/twofish-i586-asm_32.S +++ b/arch/x86/crypto/twofish-i586-asm_32.S @@ -207,7 +207,7 @@ xor %esi, d ## D;\ ror $1, d ## D; -ENTRY(twofish_enc_blk) +SYM_FUNC_START(twofish_enc_blk) push %ebp /* save registers according to calling convention*/ push %ebx push %esi @@ -261,9 +261,9 @@ ENTRY(twofish_enc_blk) pop %ebp mov $1, %eax ret -ENDPROC(twofish_enc_blk) +SYM_FUNC_END(twofish_enc_blk) -ENTRY(twofish_dec_blk) +SYM_FUNC_START(twofish_dec_blk) push %ebp /* save registers according to calling convention*/ push %ebx push %esi @@ -318,4 +318,4 @@ ENTRY(twofish_dec_blk) pop %ebp mov $1, %eax ret -ENDPROC(twofish_dec_blk) +SYM_FUNC_END(twofish_dec_blk) diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S index e495e07c7f1b..fc23552afe37 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -220,7 +220,7 @@ rorq $32, RAB2; \ outunpack3(mov, RIO, 2, RAB, 2); -ENTRY(__twofish_enc_blk_3way) +SYM_FUNC_START(__twofish_enc_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -267,9 +267,9 @@ ENTRY(__twofish_enc_blk_3way) popq %r12; popq %r13; ret; -ENDPROC(__twofish_enc_blk_3way) +SYM_FUNC_END(__twofish_enc_blk_3way) -ENTRY(twofish_dec_blk_3way) +SYM_FUNC_START(twofish_dec_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -302,4 +302,4 @@ ENTRY(twofish_dec_blk_3way) popq %r12; popq %r13; ret; -ENDPROC(twofish_dec_blk_3way) +SYM_FUNC_END(twofish_dec_blk_3way) diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index ecef2cb9f43f..d2e56232494a 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -202,7 +202,7 @@ xor %r8d, d ## D;\ ror $1, d ## D; -ENTRY(twofish_enc_blk) +SYM_FUNC_START(twofish_enc_blk) pushq R1 /* %rdi contains the ctx address */ @@ -253,9 +253,9 @@ ENTRY(twofish_enc_blk) popq R1 movl $1,%eax ret -ENDPROC(twofish_enc_blk) +SYM_FUNC_END(twofish_enc_blk) -ENTRY(twofish_dec_blk) +SYM_FUNC_START(twofish_dec_blk) pushq R1 /* %rdi contains the ctx address */ @@ -305,4 +305,4 @@ ENTRY(twofish_dec_blk) popq R1 movl $1,%eax ret -ENDPROC(twofish_dec_blk) +SYM_FUNC_END(twofish_dec_blk) diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 0dbf8e8b09d7..2dbc8ce3730e 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -22,20 +22,17 @@ #define TWOFISH_PARALLEL_BLOCKS 8 /* 8-way parallel cipher functions */ -asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); +asmlinkage void twofish_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void twofish_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void twofish_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void twofish_ctr_8way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); -asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); +asmlinkage void twofish_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); +asmlinkage void twofish_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src, + le128 *iv); static int twofish_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -43,22 +40,19 @@ static int twofish_setkey_skcipher(struct crypto_skcipher *tfm, return twofish_setkey(&tfm->base, key, keylen); } -static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src) +static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src) { __twofish_enc_blk_3way(ctx, dst, src, false); } -static void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void twofish_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, - GLUE_FUNC_CAST(twofish_enc_blk)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_enc_blk); } -static void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void twofish_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) { - glue_xts_crypt_128bit_one(ctx, dst, src, iv, - GLUE_FUNC_CAST(twofish_dec_blk)); + glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_dec_blk); } struct twofish_xts_ctx { @@ -70,7 +64,6 @@ static int xts_twofish_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 *flags = &tfm->base.crt_flags; int err; err = xts_verify_key(tfm, key, keylen); @@ -78,13 +71,12 @@ static int xts_twofish_setkey(struct crypto_skcipher *tfm, const u8 *key, return err; /* first half of xts-key is for crypt */ - err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); + err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2); if (err) return err; /* second half of xts-key is for tweak */ - return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, - flags); + return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); } static const struct common_glue_ctx twofish_enc = { @@ -93,13 +85,13 @@ static const struct common_glue_ctx twofish_enc = { .funcs = { { .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) } + .fn_u = { .ecb = twofish_ecb_enc_8way } }, { .num_blocks = 3, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } + .fn_u = { .ecb = twofish_enc_blk_3way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } + .fn_u = { .ecb = twofish_enc_blk } } } }; @@ -109,13 +101,13 @@ static const struct common_glue_ctx twofish_ctr = { .funcs = { { .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) } + .fn_u = { .ctr = twofish_ctr_8way } }, { .num_blocks = 3, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } + .fn_u = { .ctr = twofish_enc_blk_ctr_3way } }, { .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) } + .fn_u = { .ctr = twofish_enc_blk_ctr } } } }; @@ -125,10 +117,10 @@ static const struct common_glue_ctx twofish_enc_xts = { .funcs = { { .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) } + .fn_u = { .xts = twofish_xts_enc_8way } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) } + .fn_u = { .xts = twofish_xts_enc } } } }; @@ -138,13 +130,13 @@ static const struct common_glue_ctx twofish_dec = { .funcs = { { .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) } + .fn_u = { .ecb = twofish_ecb_dec_8way } }, { .num_blocks = 3, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } + .fn_u = { .ecb = twofish_dec_blk_3way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } + .fn_u = { .ecb = twofish_dec_blk } } } }; @@ -154,13 +146,13 @@ static const struct common_glue_ctx twofish_dec_cbc = { .funcs = { { .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) } + .fn_u = { .cbc = twofish_cbc_dec_8way } }, { .num_blocks = 3, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } + .fn_u = { .cbc = twofish_dec_blk_cbc_3way } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } + .fn_u = { .cbc = twofish_dec_blk } } } }; @@ -170,10 +162,10 @@ static const struct common_glue_ctx twofish_dec_xts = { .funcs = { { .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) } + .fn_u = { .xts = twofish_xts_dec_8way } }, { .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) } + .fn_u = { .xts = twofish_xts_dec } } } }; @@ -189,8 +181,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(twofish_enc_blk), - req); + return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req); } static int cbc_decrypt(struct skcipher_request *req) @@ -208,9 +199,8 @@ static int xts_encrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&twofish_enc_xts, req, - XTS_TWEAK_CAST(twofish_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); + return glue_xts_req_128bit(&twofish_enc_xts, req, twofish_enc_blk, + &ctx->tweak_ctx, &ctx->crypt_ctx, false); } static int xts_decrypt(struct skcipher_request *req) @@ -218,9 +208,8 @@ static int xts_decrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_req_128bit(&twofish_dec_xts, req, - XTS_TWEAK_CAST(twofish_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); + return glue_xts_req_128bit(&twofish_dec_xts, req, twofish_enc_blk, + &ctx->tweak_ctx, &ctx->crypt_ctx, true); } static struct skcipher_alg twofish_algs[] = { diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 1dc9e29f221e..768af6075479 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -25,21 +25,22 @@ static int twofish_setkey_skcipher(struct crypto_skcipher *tfm, return twofish_setkey(&tfm->base, key, keylen); } -static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src) +static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src) { __twofish_enc_blk_3way(ctx, dst, src, false); } -static inline void twofish_enc_blk_xor_3way(struct twofish_ctx *ctx, u8 *dst, +static inline void twofish_enc_blk_xor_3way(const void *ctx, u8 *dst, const u8 *src) { __twofish_enc_blk_3way(ctx, dst, src, true); } -void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src) +void twofish_dec_blk_cbc_3way(const void *ctx, u8 *d, const u8 *s) { u128 ivs[2]; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; ivs[0] = src[0]; ivs[1] = src[1]; @@ -51,9 +52,11 @@ void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src) } EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way); -void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) +void twofish_enc_blk_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblk; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; if (dst != src) *dst = *src; @@ -66,10 +69,11 @@ void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) } EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr); -void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, - le128 *iv) +void twofish_enc_blk_ctr_3way(const void *ctx, u8 *d, const u8 *s, le128 *iv) { be128 ctrblks[3]; + u128 *dst = (u128 *)d; + const u128 *src = (const u128 *)s; if (dst != src) { dst[0] = src[0]; @@ -94,10 +98,10 @@ static const struct common_glue_ctx twofish_enc = { .funcs = { { .num_blocks = 3, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } + .fn_u = { .ecb = twofish_enc_blk_3way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } + .fn_u = { .ecb = twofish_enc_blk } } } }; @@ -107,10 +111,10 @@ static const struct common_glue_ctx twofish_ctr = { .funcs = { { .num_blocks = 3, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr_3way) } + .fn_u = { .ctr = twofish_enc_blk_ctr_3way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr) } + .fn_u = { .ctr = twofish_enc_blk_ctr } } } }; @@ -120,10 +124,10 @@ static const struct common_glue_ctx twofish_dec = { .funcs = { { .num_blocks = 3, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } + .fn_u = { .ecb = twofish_dec_blk_3way } }, { .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } + .fn_u = { .ecb = twofish_dec_blk } } } }; @@ -133,10 +137,10 @@ static const struct common_glue_ctx twofish_dec_cbc = { .funcs = { { .num_blocks = 3, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } + .fn_u = { .cbc = twofish_dec_blk_cbc_3way } }, { .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } + .fn_u = { .cbc = twofish_dec_blk } } } }; @@ -152,8 +156,7 @@ static int ecb_decrypt(struct skcipher_request *req) static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(twofish_enc_blk), - req); + return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req); } static int cbc_decrypt(struct skcipher_request *req) |