Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto fix from Herbert Xu:
 "This fixes a performance regression in arm64 NEON crypto as well as a
  crash in x86 aegis/morus on unsupported CPUs"

* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6:
  crypto: x86/aegis,morus - Fix and simplify CPUID checks
  crypto: arm64 - revert NEON yield for fast AEAD implementations
author: Linus Torvalds <torvalds@linux-foundation.org> 2018-08-09 10:00:15 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2018-08-09 10:00:15 -0700
commit: 112cbae26d18e75098d95cc234cfa5059de8d479 (patch)
tree: 8e96670829cbbe668330d364265dfcee511f6182 /arch/arm64
parent: 6395ad8559f3a8f4299c027175db00cde67849f7 (diff)
parent: 877ccce7cbe8409256616f5e6bdedb08ce2e82db (diff)
download: talos-obmc-linux-112cbae26d18e75098d95cc234cfa5059de8d479.tar.gz talos-obmc-linux-112cbae26d18e75098d95cc234cfa5059de8d479.zip
2 files changed, 80 insertions, 146 deletions
diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
index 88f5aef7934c..e3a375c4cb83 100644
--- a/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -19,33 +19,24 @@
 	 *			     u32 *macp, u8 const rk[], u32 rounds);
 	 */
 ENTRY(ce_aes_ccm_auth_data)
-	frame_push	7
-
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
-
-	ldr	w25, [x22]			/* leftover from prev round? */
+	ldr	w8, [x3]			/* leftover from prev round? */
 	ld1	{v0.16b}, [x0]			/* load mac */
-	cbz	w25, 1f
-	sub	w25, w25, #16
+	cbz	w8, 1f
+	sub	w8, w8, #16
 	eor	v1.16b, v1.16b, v1.16b
-0:	ldrb	w7, [x20], #1			/* get 1 byte of input */
-	subs	w21, w21, #1
-	add	w25, w25, #1
+0:	ldrb	w7, [x1], #1			/* get 1 byte of input */
+	subs	w2, w2, #1
+	add	w8, w8, #1
 	ins	v1.b[0], w7
 	ext	v1.16b, v1.16b, v1.16b, #1	/* rotate in the input bytes */
 	beq	8f				/* out of input? */
-	cbnz	w25, 0b
+	cbnz	w8, 0b
 	eor	v0.16b, v0.16b, v1.16b
-1:	ld1	{v3.4s}, [x23]			/* load first round key */
-	prfm	pldl1strm, [x20]
-	cmp	w24, #12			/* which key size? */
-	add	x6, x23, #16
-	sub	w7, w24, #2			/* modified # of rounds */
+1:	ld1	{v3.4s}, [x4]			/* load first round key */
+	prfm	pldl1strm, [x1]
+	cmp	w5, #12				/* which key size? */
+	add	x6, x4, #16
+	sub	w7, w5, #2			/* modified # of rounds */
 	bmi	2f
 	bne	5f
 	mov	v5.16b, v3.16b
@@ -64,43 +55,33 @@ ENTRY(ce_aes_ccm_auth_data)
 	ld1	{v5.4s}, [x6], #16		/* load next round key */
 	bpl	3b
 	aese	v0.16b, v4.16b
-	subs	w21, w21, #16			/* last data? */
+	subs	w2, w2, #16			/* last data? */
 	eor	v0.16b, v0.16b, v5.16b		/* final round */
 	bmi	6f
-	ld1	{v1.16b}, [x20], #16		/* load next input block */
+	ld1	{v1.16b}, [x1], #16		/* load next input block */
 	eor	v0.16b, v0.16b, v1.16b		/* xor with mac */
-	beq	6f
-
-	if_will_cond_yield_neon
-	st1	{v0.16b}, [x19]			/* store mac */
-	do_cond_yield_neon
-	ld1	{v0.16b}, [x19]			/* reload mac */
-	endif_yield_neon
-
-	b	1b
-6:	st1	{v0.16b}, [x19]			/* store mac */
+	bne	1b
+6:	st1	{v0.16b}, [x0]			/* store mac */
 	beq	10f
-	adds	w21, w21, #16
+	adds	w2, w2, #16
 	beq	10f
-	mov	w25, w21
-7:	ldrb	w7, [x20], #1
+	mov	w8, w2
+7:	ldrb	w7, [x1], #1
 	umov	w6, v0.b[0]
 	eor	w6, w6, w7
-	strb	w6, [x19], #1
-	subs	w21, w21, #1
+	strb	w6, [x0], #1
+	subs	w2, w2, #1
 	beq	10f
 	ext	v0.16b, v0.16b, v0.16b, #1	/* rotate out the mac bytes */
 	b	7b
-8:	mov	w7, w25
-	add	w25, w25, #16
+8:	mov	w7, w8
+	add	w8, w8, #16
 9:	ext	v1.16b, v1.16b, v1.16b, #1
 	adds	w7, w7, #1
 	bne	9b
 	eor	v0.16b, v0.16b, v1.16b
-	st1	{v0.16b}, [x19]
-10:	str	w25, [x22]
-
-	frame_pop
+	st1	{v0.16b}, [x0]
+10:	str	w8, [x3]
 	ret
 ENDPROC(ce_aes_ccm_auth_data)
 
@@ -145,29 +126,19 @@ ENTRY(ce_aes_ccm_final)
 ENDPROC(ce_aes_ccm_final)
 
 	.macro	aes_ccm_do_crypt,enc
-	frame_push	8
-
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
-	mov	x25, x6
-
-	ldr	x26, [x25, #8]			/* load lower ctr */
-	ld1	{v0.16b}, [x24]			/* load mac */
-CPU_LE(	rev	x26, x26		)	/* keep swabbed ctr in reg */
+	ldr	x8, [x6, #8]			/* load lower ctr */
+	ld1	{v0.16b}, [x5]			/* load mac */
+CPU_LE(	rev	x8, x8			)	/* keep swabbed ctr in reg */
 0:	/* outer loop */
-	ld1	{v1.8b}, [x25]			/* load upper ctr */
-	prfm	pldl1strm, [x20]
-	add	x26, x26, #1
-	rev	x9, x26
-	cmp	w23, #12			/* which key size? */
-	sub	w7, w23, #2			/* get modified # of rounds */
+	ld1	{v1.8b}, [x6]			/* load upper ctr */
+	prfm	pldl1strm, [x1]
+	add	x8, x8, #1
+	rev	x9, x8
+	cmp	w4, #12				/* which key size? */
+	sub	w7, w4, #2			/* get modified # of rounds */
 	ins	v1.d[1], x9			/* no carry in lower ctr */
-	ld1	{v3.4s}, [x22]			/* load first round key */
-	add	x10, x22, #16
+	ld1	{v3.4s}, [x3]			/* load first round key */
+	add	x10, x3, #16
 	bmi	1f
 	bne	4f
 	mov	v5.16b, v3.16b
@@ -194,9 +165,9 @@ CPU_LE(	rev	x26, x26		)	/* keep swabbed ctr in reg */
 	bpl	2b
 	aese	v0.16b, v4.16b
 	aese	v1.16b, v4.16b
-	subs	w21, w21, #16
-	bmi	7f				/* partial block? */
-	ld1	{v2.16b}, [x20], #16		/* load next input block */
+	subs	w2, w2, #16
+	bmi	6f				/* partial block? */
+	ld1	{v2.16b}, [x1], #16		/* load next input block */
 	.if	\enc == 1
 	eor	v2.16b, v2.16b, v5.16b		/* final round enc+mac */
 	eor	v1.16b, v1.16b, v2.16b		/* xor with crypted ctr */
@@ -205,29 +176,18 @@ CPU_LE(	rev	x26, x26		)	/* keep swabbed ctr in reg */
 	eor	v1.16b, v2.16b, v5.16b		/* final round enc */
 	.endif
 	eor	v0.16b, v0.16b, v2.16b		/* xor mac with pt ^ rk[last] */
-	st1	{v1.16b}, [x19], #16		/* write output block */
-	beq	5f
-
-	if_will_cond_yield_neon
-	st1	{v0.16b}, [x24]			/* store mac */
-	do_cond_yield_neon
-	ld1	{v0.16b}, [x24]			/* reload mac */
-	endif_yield_neon
-
-	b	0b
-5:
-CPU_LE(	rev	x26, x26			)
-	st1	{v0.16b}, [x24]			/* store mac */
-	str	x26, [x25, #8]			/* store lsb end of ctr (BE) */
-
-6:	frame_pop
-	ret
-
-7:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
+	st1	{v1.16b}, [x0], #16		/* write output block */
+	bne	0b
+CPU_LE(	rev	x8, x8			)
+	st1	{v0.16b}, [x5]			/* store mac */
+	str	x8, [x6, #8]			/* store lsb end of ctr (BE) */
+5:	ret
+
+6:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
 	eor	v1.16b, v1.16b, v5.16b		/* final round enc */
-	st1	{v0.16b}, [x24]			/* store mac */
-	add	w21, w21, #16			/* process partial tail block */
-8:	ldrb	w9, [x20], #1			/* get 1 byte of input */
+	st1	{v0.16b}, [x5]			/* store mac */
+	add	w2, w2, #16			/* process partial tail block */
+7:	ldrb	w9, [x1], #1			/* get 1 byte of input */
 	umov	w6, v1.b[0]			/* get top crypted ctr byte */
 	umov	w7, v0.b[0]			/* get top mac byte */
 	.if	\enc == 1
@@ -237,13 +197,13 @@ CPU_LE(	rev	x26, x26			)
 	eor	w9, w9, w6
 	eor	w7, w7, w9
 	.endif
-	strb	w9, [x19], #1			/* store out byte */
-	strb	w7, [x24], #1			/* store mac byte */
-	subs	w21, w21, #1
-	beq	6b
+	strb	w9, [x0], #1			/* store out byte */
+	strb	w7, [x5], #1			/* store mac byte */
+	subs	w2, w2, #1
+	beq	5b
 	ext	v0.16b, v0.16b, v0.16b, #1	/* shift out mac byte */
 	ext	v1.16b, v1.16b, v1.16b, #1	/* shift out ctr byte */
-	b	8b
+	b	7b
 	.endm
 
 	/*
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index dcffb9e77589..c723647b37db 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -322,55 +322,41 @@ ENDPROC(pmull_ghash_update_p8)
 	.endm
 
 	.macro		pmull_gcm_do_crypt, enc
-	frame_push	10
+	ld1		{SHASH.2d}, [x4]
+	ld1		{XL.2d}, [x1]
+	ldr		x8, [x5, #8]			// load lower counter
 
-	mov		x19, x0
-	mov		x20, x1
-	mov		x21, x2
-	mov		x22, x3
-	mov		x23, x4
-	mov		x24, x5
-	mov		x25, x6
-	mov		x26, x7
-	.if		\enc == 1
-	ldr		x27, [sp, #96]			// first stacked arg
-	.endif
-
-	ldr		x28, [x24, #8]			// load lower counter
-CPU_LE(	rev		x28, x28	)
-
-0:	mov		x0, x25
-	load_round_keys	w26, x0
-	ld1		{SHASH.2d}, [x23]
-	ld1		{XL.2d}, [x20]
+	load_round_keys	w7, x6
 
 	movi		MASK.16b, #0xe1
 	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
+CPU_LE(	rev		x8, x8		)
 	shl		MASK.2d, MASK.2d, #57
 	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
 
 	.if		\enc == 1
-	ld1		{KS.16b}, [x27]
+	ldr		x10, [sp]
+	ld1		{KS.16b}, [x10]
 	.endif
 
-1:	ld1		{CTR.8b}, [x24]			// load upper counter
-	ld1		{INP.16b}, [x22], #16
-	rev		x9, x28
-	add		x28, x28, #1
-	sub		w19, w19, #1
+0:	ld1		{CTR.8b}, [x5]			// load upper counter
+	ld1		{INP.16b}, [x3], #16
+	rev		x9, x8
+	add		x8, x8, #1
+	sub		w0, w0, #1
 	ins		CTR.d[1], x9			// set lower counter
 
 	.if		\enc == 1
 	eor		INP.16b, INP.16b, KS.16b	// encrypt input
-	st1		{INP.16b}, [x21], #16
+	st1		{INP.16b}, [x2], #16
 	.endif
 
 	rev64		T1.16b, INP.16b
 
-	cmp		w26, #12
-	b.ge		4f				// AES-192/256?
+	cmp		w7, #12
+	b.ge		2f				// AES-192/256?
 
-2:	enc_round	CTR, v21
+1:	enc_round	CTR, v21
 
 	ext		T2.16b, XL.16b, XL.16b, #8
 	ext		IN1.16b, T1.16b, T1.16b, #8
@@ -425,39 +411,27 @@ CPU_LE(	rev		x28, x28	)
 
 	.if		\enc == 0
 	eor		INP.16b, INP.16b, KS.16b
-	st1		{INP.16b}, [x21], #16
+	st1		{INP.16b}, [x2], #16
 	.endif
 
-	cbz		w19, 3f
+	cbnz		w0, 0b
 
-	if_will_cond_yield_neon
-	st1		{XL.2d}, [x20]
-	.if		\enc == 1
-	st1		{KS.16b}, [x27]
-	.endif
-	do_cond_yield_neon
-	b		0b
-	endif_yield_neon
+CPU_LE(	rev		x8, x8		)
+	st1		{XL.2d}, [x1]
+	str		x8, [x5, #8]			// store lower counter
 
-	b		1b
-
-3:	st1		{XL.2d}, [x20]
 	.if		\enc == 1
-	st1		{KS.16b}, [x27]
+	st1		{KS.16b}, [x10]
 	.endif
 
-CPU_LE(	rev		x28, x28	)
-	str		x28, [x24, #8]			// store lower counter
-
-	frame_pop
 	ret
 
-4:	b.eq		5f				// AES-192?
+2:	b.eq		3f				// AES-192?
 	enc_round	CTR, v17
 	enc_round	CTR, v18
-5:	enc_round	CTR, v19
+3:	enc_round	CTR, v19
 	enc_round	CTR, v20
-	b		2b
+	b		1b
 	.endm
 
 	/*
author	Linus Torvalds <torvalds@linux-foundation.org>	2018-08-09 10:00:15 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-08-09 10:00:15 -0700
commit	112cbae26d18e75098d95cc234cfa5059de8d479 (patch)
tree	8e96670829cbbe668330d364265dfcee511f6182 /arch/arm64
parent	6395ad8559f3a8f4299c027175db00cde67849f7 (diff)
parent	877ccce7cbe8409256616f5e6bdedb08ce2e82db (diff)
download	talos-obmc-linux-112cbae26d18e75098d95cc234cfa5059de8d479.tar.gz talos-obmc-linux-112cbae26d18e75098d95cc234cfa5059de8d479.zip