diff options
Diffstat (limited to 'arch/arm')
-rw-r--r-- | arch/arm/Makefile | 20 | ||||
-rw-r--r-- | arch/arm/boot/compressed/libfdt_env.h | 2 | ||||
-rw-r--r-- | arch/arm/boot/dts/imx7d.dtsi | 5 | ||||
-rw-r--r-- | arch/arm/crypto/Kconfig | 7 | ||||
-rw-r--r-- | arch/arm/crypto/Makefile | 2 | ||||
-rw-r--r-- | arch/arm/crypto/chacha20-neon-core.S | 277 | ||||
-rw-r--r-- | arch/arm/crypto/crc32-ce-glue.c | 2 | ||||
-rw-r--r-- | arch/arm/crypto/ghash-ce-core.S | 108 | ||||
-rw-r--r-- | arch/arm/crypto/ghash-ce-glue.c | 38 | ||||
-rw-r--r-- | arch/arm/crypto/speck-neon-core.S | 434 | ||||
-rw-r--r-- | arch/arm/crypto/speck-neon-glue.c | 288 | ||||
-rw-r--r-- | arch/arm/include/asm/kvm_arm.h | 3 | ||||
-rw-r--r-- | arch/arm/include/asm/kvm_host.h | 13 | ||||
-rw-r--r-- | arch/arm/include/asm/kvm_mmu.h | 15 | ||||
-rw-r--r-- | arch/arm/include/asm/stage2_pgtable.h | 54 | ||||
-rw-r--r-- | arch/arm/include/asm/unistd.h | 4 | ||||
-rw-r--r-- | arch/arm/kernel/devtree.c | 5 | ||||
-rw-r--r-- | arch/arm/kernel/topology.c | 6 | ||||
-rw-r--r-- | arch/arm/mach-at91/pm_suspend.S | 8 | ||||
-rw-r--r-- | arch/arm/mach-mmp/devices.c | 11 | ||||
-rw-r--r-- | arch/arm/mach-shmobile/pm-rcar-gen2.c | 8 | ||||
-rw-r--r-- | arch/arm/mach-shmobile/pm-rmobile.c | 2 | ||||
-rw-r--r-- | arch/arm/mach-shmobile/timer.c | 10 |
23 files changed, 353 insertions, 969 deletions
diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 5c91e0093ee8..05a91d8b89f3 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -303,12 +303,7 @@ else KBUILD_IMAGE := $(boot)/zImage endif -# Build the DT binary blobs if we have OF configured -ifeq ($(CONFIG_USE_OF),y) -KBUILD_DTBS := dtbs -endif - -all: $(notdir $(KBUILD_IMAGE)) $(KBUILD_DTBS) +all: $(notdir $(KBUILD_IMAGE)) archheaders: @@ -335,17 +330,6 @@ $(BOOT_TARGETS): vmlinux $(INSTALL_TARGETS): $(Q)$(MAKE) $(build)=$(boot) MACHINE=$(MACHINE) $@ -%.dtb: | scripts - $(Q)$(MAKE) $(build)=$(boot)/dts MACHINE=$(MACHINE) $(boot)/dts/$@ - -PHONY += dtbs dtbs_install - -dtbs: prepare scripts - $(Q)$(MAKE) $(build)=$(boot)/dts - -dtbs_install: - $(Q)$(MAKE) $(dtbinst)=$(boot)/dts - PHONY += vdso_install vdso_install: ifeq ($(CONFIG_VDSO),y) @@ -367,8 +351,6 @@ define archhelp echo ' uImage - U-Boot wrapped zImage' echo ' bootpImage - Combined zImage and initial RAM disk' echo ' (supply initrd image via make variable INITRD=<path>)' - echo '* dtbs - Build device tree blobs for enabled boards' - echo ' dtbs_install - Install dtbs to $(INSTALL_DTBS_PATH)' echo ' install - Install uncompressed kernel' echo ' zinstall - Install compressed kernel' echo ' uinstall - Install U-Boot wrapped compressed kernel' diff --git a/arch/arm/boot/compressed/libfdt_env.h b/arch/arm/boot/compressed/libfdt_env.h index 07437816e098..b36c0289a308 100644 --- a/arch/arm/boot/compressed/libfdt_env.h +++ b/arch/arm/boot/compressed/libfdt_env.h @@ -6,6 +6,8 @@ #include <linux/string.h> #include <asm/byteorder.h> +#define INT_MAX ((int)(~0U>>1)) + typedef __be16 fdt16_t; typedef __be32 fdt32_t; typedef __be64 fdt64_t; diff --git a/arch/arm/boot/dts/imx7d.dtsi b/arch/arm/boot/dts/imx7d.dtsi index 7234e8330a57..efbdeaaa8dcd 100644 --- a/arch/arm/boot/dts/imx7d.dtsi +++ b/arch/arm/boot/dts/imx7d.dtsi @@ -146,8 +146,9 @@ fsl,max-link-speed = <2>; power-domains = <&pgc_pcie_phy>; resets = <&src IMX7_RESET_PCIEPHY>, - <&src IMX7_RESET_PCIE_CTRL_APPS_EN>; - reset-names = "pciephy", "apps"; + <&src IMX7_RESET_PCIE_CTRL_APPS_EN>, + <&src IMX7_RESET_PCIE_CTRL_APPS_TURNOFF>; + reset-names = "pciephy", "apps", "turnoff"; status = "disabled"; }; }; diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index 925d1364727a..ef0c7feea6e2 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -99,6 +99,7 @@ config CRYPTO_GHASH_ARM_CE depends on KERNEL_MODE_NEON select CRYPTO_HASH select CRYPTO_CRYPTD + select CRYPTO_GF128MUL help Use an implementation of GHASH (used by the GCM AEAD chaining mode) that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64) @@ -121,10 +122,4 @@ config CRYPTO_CHACHA20_NEON select CRYPTO_BLKCIPHER select CRYPTO_CHACHA20 -config CRYPTO_SPECK_NEON - tristate "NEON accelerated Speck cipher algorithms" - depends on KERNEL_MODE_NEON - select CRYPTO_BLKCIPHER - select CRYPTO_SPECK - endif diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index 8de542c48ade..bd5bceef0605 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -10,7 +10,6 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o -obj-$(CONFIG_CRYPTO_SPECK_NEON) += speck-neon.o ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o @@ -54,7 +53,6 @@ ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o -speck-neon-y := speck-neon-core.o speck-neon-glue.o ifdef REGENERATE_ARM_CRYPTO quiet_cmd_perl = PERL $@ diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S index 451a849ad518..50e7b9896818 100644 --- a/arch/arm/crypto/chacha20-neon-core.S +++ b/arch/arm/crypto/chacha20-neon-core.S @@ -18,6 +18,34 @@ * (at your option) any later version. */ + /* + * NEON doesn't have a rotate instruction. The alternatives are, more or less: + * + * (a) vshl.u32 + vsri.u32 (needs temporary register) + * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register) + * (c) vrev32.16 (16-bit rotations only) + * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only, + * needs index vector) + * + * ChaCha20 has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit + * rotations, the only choices are (a) and (b). We use (a) since it takes + * two-thirds the cycles of (b) on both Cortex-A7 and Cortex-A53. + * + * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest + * and doesn't need a temporary register. + * + * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence + * is twice as fast as (a), even when doing (a) on multiple registers + * simultaneously to eliminate the stall between vshl and vsri. Also, it + * parallelizes better when temporary registers are scarce. + * + * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as + * (a), so the need to load the rotation table actually makes the vtbl method + * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it + * seems to be a good compromise to get a more significant speed boost on some + * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7. + */ + #include <linux/linkage.h> .text @@ -46,7 +74,9 @@ ENTRY(chacha20_block_xor_neon) vmov q10, q2 vmov q11, q3 + adr ip, .Lrol8_table mov r3, #10 + vld1.8 {d10}, [ip, :64] .Ldoubleround: // x0 += x1, x3 = rotl32(x3 ^ x0, 16) @@ -62,9 +92,9 @@ ENTRY(chacha20_block_xor_neon) // x0 += x1, x3 = rotl32(x3 ^ x0, 8) vadd.i32 q0, q0, q1 - veor q4, q3, q0 - vshl.u32 q3, q4, #8 - vsri.u32 q3, q4, #24 + veor q3, q3, q0 + vtbl.8 d6, {d6}, d10 + vtbl.8 d7, {d7}, d10 // x2 += x3, x1 = rotl32(x1 ^ x2, 7) vadd.i32 q2, q2, q3 @@ -92,9 +122,9 @@ ENTRY(chacha20_block_xor_neon) // x0 += x1, x3 = rotl32(x3 ^ x0, 8) vadd.i32 q0, q0, q1 - veor q4, q3, q0 - vshl.u32 q3, q4, #8 - vsri.u32 q3, q4, #24 + veor q3, q3, q0 + vtbl.8 d6, {d6}, d10 + vtbl.8 d7, {d7}, d10 // x2 += x3, x1 = rotl32(x1 ^ x2, 7) vadd.i32 q2, q2, q3 @@ -139,13 +169,17 @@ ENTRY(chacha20_block_xor_neon) bx lr ENDPROC(chacha20_block_xor_neon) + .align 4 +.Lctrinc: .word 0, 1, 2, 3 +.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6 + .align 5 ENTRY(chacha20_4block_xor_neon) - push {r4-r6, lr} - mov ip, sp // preserve the stack pointer - sub r3, sp, #0x20 // allocate a 32 byte buffer - bic r3, r3, #0x1f // aligned to 32 bytes - mov sp, r3 + push {r4-r5} + mov r4, sp // preserve the stack pointer + sub ip, sp, #0x20 // allocate a 32 byte buffer + bic ip, ip, #0x1f // aligned to 32 bytes + mov sp, ip // r0: Input state matrix, s // r1: 4 data blocks output, o @@ -155,25 +189,24 @@ ENTRY(chacha20_4block_xor_neon) // This function encrypts four consecutive ChaCha20 blocks by loading // the state matrix in NEON registers four times. The algorithm performs // each operation on the corresponding word of each state matrix, hence - // requires no word shuffling. For final XORing step we transpose the - // matrix by interleaving 32- and then 64-bit words, which allows us to - // do XOR in NEON registers. + // requires no word shuffling. The words are re-interleaved before the + // final addition of the original state and the XORing step. // - // x0..15[0-3] = s0..3[0..3] - add r3, r0, #0x20 + // x0..15[0-3] = s0..15[0-3] + add ip, r0, #0x20 vld1.32 {q0-q1}, [r0] - vld1.32 {q2-q3}, [r3] + vld1.32 {q2-q3}, [ip] - adr r3, CTRINC + adr r5, .Lctrinc vdup.32 q15, d7[1] vdup.32 q14, d7[0] - vld1.32 {q11}, [r3, :128] + vld1.32 {q4}, [r5, :128] vdup.32 q13, d6[1] vdup.32 q12, d6[0] - vadd.i32 q12, q12, q11 // x12 += counter values 0-3 vdup.32 q11, d5[1] vdup.32 q10, d5[0] + vadd.u32 q12, q12, q4 // x12 += counter values 0-3 vdup.32 q9, d4[1] vdup.32 q8, d4[0] vdup.32 q7, d3[1] @@ -185,9 +218,13 @@ ENTRY(chacha20_4block_xor_neon) vdup.32 q1, d0[1] vdup.32 q0, d0[0] + adr ip, .Lrol8_table mov r3, #10 + b 1f .Ldoubleround4: + vld1.32 {q8-q9}, [sp, :256] +1: // x0 += x4, x12 = rotl32(x12 ^ x0, 16) // x1 += x5, x13 = rotl32(x13 ^ x1, 16) // x2 += x6, x14 = rotl32(x14 ^ x2, 16) @@ -236,24 +273,25 @@ ENTRY(chacha20_4block_xor_neon) // x1 += x5, x13 = rotl32(x13 ^ x1, 8) // x2 += x6, x14 = rotl32(x14 ^ x2, 8) // x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vld1.8 {d16}, [ip, :64] vadd.i32 q0, q0, q4 vadd.i32 q1, q1, q5 vadd.i32 q2, q2, q6 vadd.i32 q3, q3, q7 - veor q8, q12, q0 - veor q9, q13, q1 - vshl.u32 q12, q8, #8 - vshl.u32 q13, q9, #8 - vsri.u32 q12, q8, #24 - vsri.u32 q13, q9, #24 + veor q12, q12, q0 + veor q13, q13, q1 + veor q14, q14, q2 + veor q15, q15, q3 - veor q8, q14, q2 - veor q9, q15, q3 - vshl.u32 q14, q8, #8 - vshl.u32 q15, q9, #8 - vsri.u32 q14, q8, #24 - vsri.u32 q15, q9, #24 + vtbl.8 d24, {d24}, d16 + vtbl.8 d25, {d25}, d16 + vtbl.8 d26, {d26}, d16 + vtbl.8 d27, {d27}, d16 + vtbl.8 d28, {d28}, d16 + vtbl.8 d29, {d29}, d16 + vtbl.8 d30, {d30}, d16 + vtbl.8 d31, {d31}, d16 vld1.32 {q8-q9}, [sp, :256] @@ -332,24 +370,25 @@ ENTRY(chacha20_4block_xor_neon) // x1 += x6, x12 = rotl32(x12 ^ x1, 8) // x2 += x7, x13 = rotl32(x13 ^ x2, 8) // x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vld1.8 {d16}, [ip, :64] vadd.i32 q0, q0, q5 vadd.i32 q1, q1, q6 vadd.i32 q2, q2, q7 vadd.i32 q3, q3, q4 - veor q8, q15, q0 - veor q9, q12, q1 - vshl.u32 q15, q8, #8 - vshl.u32 q12, q9, #8 - vsri.u32 q15, q8, #24 - vsri.u32 q12, q9, #24 + veor q15, q15, q0 + veor q12, q12, q1 + veor q13, q13, q2 + veor q14, q14, q3 - veor q8, q13, q2 - veor q9, q14, q3 - vshl.u32 q13, q8, #8 - vshl.u32 q14, q9, #8 - vsri.u32 q13, q8, #24 - vsri.u32 q14, q9, #24 + vtbl.8 d30, {d30}, d16 + vtbl.8 d31, {d31}, d16 + vtbl.8 d24, {d24}, d16 + vtbl.8 d25, {d25}, d16 + vtbl.8 d26, {d26}, d16 + vtbl.8 d27, {d27}, d16 + vtbl.8 d28, {d28}, d16 + vtbl.8 d29, {d29}, d16 vld1.32 {q8-q9}, [sp, :256] @@ -379,104 +418,76 @@ ENTRY(chacha20_4block_xor_neon) vsri.u32 q6, q9, #25 subs r3, r3, #1 - beq 0f - - vld1.32 {q8-q9}, [sp, :256] - b .Ldoubleround4 - - // x0[0-3] += s0[0] - // x1[0-3] += s0[1] - // x2[0-3] += s0[2] - // x3[0-3] += s0[3] -0: ldmia r0!, {r3-r6} - vdup.32 q8, r3 - vdup.32 q9, r4 - vadd.i32 q0, q0, q8 - vadd.i32 q1, q1, q9 - vdup.32 q8, r5 - vdup.32 q9, r6 - vadd.i32 q2, q2, q8 - vadd.i32 q3, q3, q9 - - // x4[0-3] += s1[0] - // x5[0-3] += s1[1] - // x6[0-3] += s1[2] - // x7[0-3] += s1[3] - ldmia r0!, {r3-r6} - vdup.32 q8, r3 - vdup.32 q9, r4 - vadd.i32 q4, q4, q8 - vadd.i32 q5, q5, q9 - vdup.32 q8, r5 - vdup.32 q9, r6 - vadd.i32 q6, q6, q8 - vadd.i32 q7, q7, q9 - - // interleave 32-bit words in state n, n+1 - vzip.32 q0, q1 - vzip.32 q2, q3 - vzip.32 q4, q5 - vzip.32 q6, q7 - - // interleave 64-bit words in state n, n+2 + bne .Ldoubleround4 + + // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15. + // x8..9[0-3] are on the stack. + + // Re-interleave the words in the first two rows of each block (x0..7). + // Also add the counter values 0-3 to x12[0-3]. + vld1.32 {q8}, [r5, :128] // load counter values 0-3 + vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1) + vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3) + vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5) + vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7) + vadd.u32 q12, q8 // x12 += counter values 0-3 vswp d1, d4 vswp d3, d6 + vld1.32 {q8-q9}, [r0]! // load s0..7 vswp d9, d12 vswp d11, d14 - // xor with corresponding input, write to output + // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1) + // after XORing the first 32 bytes. + vswp q1, q4 + + // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7) + + // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block) + vadd.u32 q0, q0, q8 + vadd.u32 q2, q2, q8 + vadd.u32 q4, q4, q8 + vadd.u32 q3, q3, q8 + + // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block) + vadd.u32 q1, q1, q9 + vadd.u32 q6, q6, q9 + vadd.u32 q5, q5, q9 + vadd.u32 q7, q7, q9 + + // XOR first 32 bytes using keystream from first two rows of first block vld1.8 {q8-q9}, [r2]! veor q8, q8, q0 - veor q9, q9, q4 + veor q9, q9, q1 vst1.8 {q8-q9}, [r1]! + // Re-interleave the words in the last two rows of each block (x8..15). vld1.32 {q8-q9}, [sp, :256] - - // x8[0-3] += s2[0] - // x9[0-3] += s2[1] - // x10[0-3] += s2[2] - // x11[0-3] += s2[3] - ldmia r0!, {r3-r6} - vdup.32 q0, r3 - vdup.32 q4, r4 - vadd.i32 q8, q8, q0 - vadd.i32 q9, q9, q4 - vdup.32 q0, r5 - vdup.32 q4, r6 - vadd.i32 q10, q10, q0 - vadd.i32 q11, q11, q4 - - // x12[0-3] += s3[0] - // x13[0-3] += s3[1] - // x14[0-3] += s3[2] - // x15[0-3] += s3[3] - ldmia r0!, {r3-r6} - vdup.32 q0, r3 - vdup.32 q4, r4 - adr r3, CTRINC - vadd.i32 q12, q12, q0 - vld1.32 {q0}, [r3, :128] - vadd.i32 q13, q13, q4 - vadd.i32 q12, q12, q0 // x12 += counter values 0-3 - - vdup.32 q0, r5 - vdup.32 q4, r6 - vadd.i32 q14, q14, q0 - vadd.i32 q15, q15, q4 - - // interleave 32-bit words in state n, n+1 - vzip.32 q8, q9 - vzip.32 q10, q11 - vzip.32 q12, q13 - vzip.32 q14, q15 - - // interleave 64-bit words in state n, n+2 - vswp d17, d20 - vswp d19, d22 + vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13) + vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15) + vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9) + vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11) + vld1.32 {q0-q1}, [r0] // load s8..15 vswp d25, d28 vswp d27, d30 + vswp d17, d20 + vswp d19, d22 + + // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15) + + // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block) + vadd.u32 q8, q8, q0 + vadd.u32 q10, q10, q0 + vadd.u32 q9, q9, q0 + vadd.u32 q11, q11, q0 + + // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block) + vadd.u32 q12, q12, q1 + vadd.u32 q14, q14, q1 + vadd.u32 q13, q13, q1 + vadd.u32 q15, q15, q1 - vmov q4, q1 + // XOR the rest of the data with the keystream vld1.8 {q0-q1}, [r2]! veor q0, q0, q8 @@ -509,13 +520,11 @@ ENTRY(chacha20_4block_xor_neon) vst1.8 {q0-q1}, [r1]! vld1.8 {q0-q1}, [r2] + mov sp, r4 // restore original stack pointer veor q0, q0, q11 veor q1, q1, q15 vst1.8 {q0-q1}, [r1] - mov sp, ip - pop {r4-r6, pc} + pop {r4-r5} + bx lr ENDPROC(chacha20_4block_xor_neon) - - .align 4 -CTRINC: .word 0, 1, 2, 3 diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c index 96e62ec105d0..cd9e93b46c2d 100644 --- a/arch/arm/crypto/crc32-ce-glue.c +++ b/arch/arm/crypto/crc32-ce-glue.c @@ -236,7 +236,7 @@ static void __exit crc32_pmull_mod_exit(void) ARRAY_SIZE(crc32_pmull_algs)); } -static const struct cpu_feature crc32_cpu_feature[] = { +static const struct cpu_feature __maybe_unused crc32_cpu_feature[] = { { cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { } }; MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature); diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S index 2f78c10b1881..406009afa9cf 100644 --- a/arch/arm/crypto/ghash-ce-core.S +++ b/arch/arm/crypto/ghash-ce-core.S @@ -63,6 +63,33 @@ k48 .req d31 SHASH2_p64 .req d31 + HH .req q10 + HH3 .req q11 + HH4 .req q12 + HH34 .req q13 + + HH_L .req d20 + HH_H .req d21 + HH3_L .req d22 + HH3_H .req d23 + HH4_L .req d24 + HH4_H .req d25 + HH34_L .req d26 + HH34_H .req d27 + SHASH2_H .req d29 + + XL2 .req q5 + XM2 .req q6 + XH2 .req q7 + T3 .req q8 + + XL2_L .req d10 + XL2_H .req d11 + XM2_L .req d12 + XM2_H .req d13 + T3_L .req d16 + T3_H .req d17 + .text .fpu crypto-neon-fp-armv8 @@ -175,12 +202,77 @@ beq 0f vld1.64 {T1}, [ip] teq r0, #0 - b 1f + b 3f + +0: .ifc \pn, p64 + tst r0, #3 // skip until #blocks is a + bne 2f // round multiple of 4 + + vld1.8 {XL2-XM2}, [r2]! +1: vld1.8 {T3-T2}, [r2]! + vrev64.8 XL2, XL2 + vrev64.8 XM2, XM2 + + subs r0, r0, #4 + + vext.8 T1, XL2, XL2, #8 + veor XL2_H, XL2_H, XL_L + veor XL, XL, T1 + + vrev64.8 T3, T3 + vrev64.8 T1, T2 + + vmull.p64 XH, HH4_H, XL_H // a1 * b1 + veor XL2_H, XL2_H, XL_H + vmull.p64 XL, HH4_L, XL_L // a0 * b0 + vmull.p64 XM, HH34_H, XL2_H // (a1 + a0)(b1 + b0) + + vmull.p64 XH2, HH3_H, XM2_L // a1 * b1 + veor XM2_L, XM2_L, XM2_H + vmull.p64 XL2, HH3_L, XM2_H // a0 * b0 + vmull.p64 XM2, HH34_L, XM2_L // (a1 + a0)(b1 + b0) + + veor XH, XH, XH2 + veor XL, XL, XL2 + veor XM, XM, XM2 + + vmull.p64 XH2, HH_H, T3_L // a1 * b1 + veor T3_L, T3_L, T3_H + vmull.p64 XL2, HH_L, T3_H // a0 * b0 + vmull.p64 XM2, SHASH2_H, T3_L // (a1 + a0)(b1 + b0) + + veor XH, XH, XH2 + veor XL, XL, XL2 + veor XM, XM, XM2 + + vmull.p64 XH2, SHASH_H, T1_L // a1 * b1 + veor T1_L, T1_L, T1_H + vmull.p64 XL2, SHASH_L, T1_H // a0 * b0 + vmull.p64 XM2, SHASH2_p64, T1_L // (a1 + a0)(b1 + b0) + + veor XH, XH, XH2 + veor XL, XL, XL2 + veor XM, XM, XM2 -0: vld1.64 {T1}, [r2]! + beq 4f + + vld1.8 {XL2-XM2}, [r2]! + + veor T1, XL, XH + veor XM, XM, T1 + + __pmull_reduce_p64 + + veor T1, T1, XH + veor XL, XL, T1 + + b 1b + .endif + +2: vld1.64 {T1}, [r2]! subs r0, r0, #1 -1: /* multiply XL by SHASH in GF(2^128) */ +3: /* multiply XL by SHASH in GF(2^128) */ #ifndef CONFIG_CPU_BIG_ENDIAN vrev64.8 T1, T1 #endif @@ -193,7 +285,7 @@ __pmull_\pn XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0 __pmull_\pn XM, T1_L, SHASH2_\pn @ (a1+a0)(b1+b0) - veor T1, XL, XH +4: veor T1, XL, XH veor XM, XM, T1 __pmull_reduce_\pn @@ -212,8 +304,14 @@ * struct ghash_key const *k, const char *head) */ ENTRY(pmull_ghash_update_p64) - vld1.64 {SHASH}, [r3] + vld1.64 {SHASH}, [r3]! + vld1.64 {HH}, [r3]! + vld1.64 {HH3-HH4}, [r3] + veor SHASH2_p64, SHASH_L, SHASH_H + veor SHASH2_H, HH_L, HH_H + veor HH34_L, HH3_L, HH3_H + veor HH34_H, HH4_L, HH4_H vmov.i8 MASK, #0xe1 vshl.u64 MASK, MASK, #57 diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c index 8930fc4e7c22..b7d30b6cf49c 100644 --- a/arch/arm/crypto/ghash-ce-glue.c +++ b/arch/arm/crypto/ghash-ce-glue.c @@ -1,7 +1,7 @@ /* * Accelerated GHASH implementation with ARMv8 vmull.p64 instructions. * - * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2015 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -28,8 +28,10 @@ MODULE_ALIAS_CRYPTO("ghash"); #define GHASH_DIGEST_SIZE 16 struct ghash_key { - u64 a; - u64 b; + u64 h[2]; + u64 h2[2]; + u64 h3[2]; + u64 h4[2]; }; struct ghash_desc_ctx { @@ -117,26 +119,40 @@ static int ghash_final(struct shash_desc *desc, u8 *dst) return 0; } +static void ghash_reflect(u64 h[], const be128 *k) +{ + u64 carry = be64_to_cpu(k->a) >> 63; + + h[0] = (be64_to_cpu(k->b) << 1) | carry; + h[1] = (be64_to_cpu(k->a) << 1) | (be64_to_cpu(k->b) >> 63); + + if (carry) + h[1] ^= 0xc200000000000000UL; +} + static int ghash_setkey(struct crypto_shash *tfm, const u8 *inkey, unsigned int keylen) { struct ghash_key *key = crypto_shash_ctx(tfm); - u64 a, b; + be128 h, k; if (keylen != GHASH_BLOCK_SIZE) { crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } - /* perform multiplication by 'x' in GF(2^128) */ - b = get_unaligned_be64(inkey); - a = get_unaligned_be64(inkey + 8); + memcpy(&k, inkey, GHASH_BLOCK_SIZE); + ghash_reflect(key->h, &k); + + h = k; + gf128mul_lle(&h, &k); + ghash_reflect(key->h2, &h); - key->a = (a << 1) | (b >> 63); - key->b = (b << 1) | (a >> 63); + gf128mul_lle(&h, &k); + ghash_reflect(key->h3, &h); - if (b >> 63) - key->b ^= 0xc200000000000000UL; + gf128mul_lle(&h, &k); + ghash_reflect(key->h4, &h); return 0; } diff --git a/arch/arm/crypto/speck-neon-core.S b/arch/arm/crypto/speck-neon-core.S deleted file mode 100644 index 57caa742016e..000000000000 --- a/arch/arm/crypto/speck-neon-core.S +++ /dev/null @@ -1,434 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NEON-accelerated implementation of Speck128-XTS and Speck64-XTS - * - * Copyright (c) 2018 Google, Inc - * - * Author: Eric Biggers <ebiggers@google.com> - */ - -#include <linux/linkage.h> - - .text - .fpu neon - - // arguments - ROUND_KEYS .req r0 // const {u64,u32} *round_keys - NROUNDS .req r1 // int nrounds - DST .req r2 // void *dst - SRC .req r3 // const void *src - NBYTES .req r4 // unsigned int nbytes - TWEAK .req r5 // void *tweak - - // registers which hold the data being encrypted/decrypted - X0 .req q0 - X0_L .req d0 - X0_H .req d1 - Y0 .req q1 - Y0_H .req d3 - X1 .req q2 - X1_L .req d4 - X1_H .req d5 - Y1 .req q3 - Y1_H .req d7 - X2 .req q4 - X2_L .req d8 - X2_H .req d9 - Y2 .req q5 - Y2_H .req d11 - X3 .req q6 - X3_L .req d12 - X3_H .req d13 - Y3 .req q7 - Y3_H .req d15 - - // the round key, duplicated in all lanes - ROUND_KEY .req q8 - ROUND_KEY_L .req d16 - ROUND_KEY_H .req d17 - - // index vector for vtbl-based 8-bit rotates - ROTATE_TABLE .req d18 - - // multiplication table for updating XTS tweaks - GF128MUL_TABLE .req d19 - GF64MUL_TABLE .req d19 - - // current XTS tweak value(s) - TWEAKV .req q10 - TWEAKV_L .req d20 - TWEAKV_H .req d21 - - TMP0 .req q12 - TMP0_L .req d24 - TMP0_H .req d25 - TMP1 .req q13 - TMP2 .req q14 - TMP3 .req q15 - - .align 4 -.Lror64_8_table: - .byte 1, 2, 3, 4, 5, 6, 7, 0 -.Lror32_8_table: - .byte 1, 2, 3, 0, 5, 6, 7, 4 -.Lrol64_8_table: - .byte 7, 0, 1, 2, 3, 4, 5, 6 -.Lrol32_8_table: - .byte 3, 0, 1, 2, 7, 4, 5, 6 -.Lgf128mul_table: - .byte 0, 0x87 - .fill 14 -.Lgf64mul_table: - .byte 0, 0x1b, (0x1b << 1), (0x1b << 1) ^ 0x1b - .fill 12 - -/* - * _speck_round_128bytes() - Speck encryption round on 128 bytes at a time - * - * Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for - * Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes - * of ROUND_KEY. 'n' is the lane size: 64 for Speck128, or 32 for Speck64. - * - * The 8-bit rotates are implemented using vtbl instead of vshr + vsli because - * the vtbl approach is faster on some processors and the same speed on others. - */ -.macro _speck_round_128bytes n - - // x = ror(x, 8) - vtbl.8 X0_L, {X0_L}, ROTATE_TABLE - vtbl.8 X0_H, {X0_H}, ROTATE_TABLE - vtbl.8 X1_L, {X1_L}, ROTATE_TABLE - vtbl.8 X1_H, {X1_H}, ROTATE_TABLE - vtbl.8 X2_L, {X2_L}, ROTATE_TABLE - vtbl.8 X2_H, {X2_H}, ROTATE_TABLE - vtbl.8 X3_L, {X3_L}, ROTATE_TABLE - vtbl.8 X3_H, {X3_H}, ROTATE_TABLE - - // x += y - vadd.u\n X0, Y0 - vadd.u\n X1, Y1 - vadd.u\n X2, Y2 - vadd.u\n X3, Y3 - - // x ^= k - veor X0, ROUND_KEY - veor X1, ROUND_KEY - veor X2, ROUND_KEY - veor X3, ROUND_KEY - - // y = rol(y, 3) - vshl.u\n TMP0, Y0, #3 - vshl.u\n TMP1, Y1, #3 - vshl.u\n TMP2, Y2, #3 - vshl.u\n TMP3, Y3, #3 - vsri.u\n TMP0, Y0, #(\n - 3) - vsri.u\n TMP1, Y1, #(\n - 3) - vsri.u\n TMP2, Y2, #(\n - 3) - vsri.u\n TMP3, Y3, #(\n - 3) - - // y ^= x - veor Y0, TMP0, X0 - veor Y1, TMP1, X1 - veor Y2, TMP2, X2 - veor Y3, TMP3, X3 -.endm - -/* - * _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time - * - * This is the inverse of _speck_round_128bytes(). - */ -.macro _speck_unround_128bytes n - - // y ^= x - veor TMP0, Y0, X0 - veor TMP1, Y1, X1 - veor TMP2, Y2, X2 - veor TMP3, Y3, X3 - - // y = ror(y, 3) - vshr.u\n Y0, TMP0, #3 - vshr.u\n Y1, TMP1, #3 - vshr.u\n Y2, TMP2, #3 - vshr.u\n Y3, TMP3, #3 - vsli.u\n Y0, TMP0, #(\n - 3) - vsli.u\n Y1, TMP1, #(\n - 3) - vsli.u\n Y2, TMP2, #(\n - 3) - vsli.u\n Y3, TMP3, #(\n - 3) - - // x ^= k - veor X0, ROUND_KEY - veor X1, ROUND_KEY - veor X2, ROUND_KEY - veor X3, ROUND_KEY - - // x -= y - vsub.u\n X0, Y0 - vsub.u\n X1, Y1 - vsub.u\n X2, Y2 - vsub.u\n X3, Y3 - - // x = rol(x, 8); - vtbl.8 X0_L, {X0_L}, ROTATE_TABLE - vtbl.8 X0_H, {X0_H}, ROTATE_TABLE - vtbl.8 X1_L, {X1_L}, ROTATE_TABLE - vtbl.8 X1_H, {X1_H}, ROTATE_TABLE - vtbl.8 X2_L, {X2_L}, ROTATE_TABLE - vtbl.8 X2_H, {X2_H}, ROTATE_TABLE - vtbl.8 X3_L, {X3_L}, ROTATE_TABLE - vtbl.8 X3_H, {X3_H}, ROTATE_TABLE -.endm - -.macro _xts128_precrypt_one dst_reg, tweak_buf, tmp - - // Load the next source block - vld1.8 {\dst_reg}, [SRC]! - - // Save the current tweak in the tweak buffer - vst1.8 {TWEAKV}, [\tweak_buf:128]! - - // XOR the next source block with the current tweak - veor \dst_reg, TWEAKV - - /* - * Calculate the next tweak by multiplying the current one by x, - * modulo p(x) = x^128 + x^7 + x^2 + x + 1. - */ - vshr.u64 \tmp, TWEAKV, #63 - vshl.u64 TWEAKV, #1 - veor TWEAKV_H, \tmp\()_L - vtbl.8 \tmp\()_H, {GF128MUL_TABLE}, \tmp\()_H - veor TWEAKV_L, \tmp\()_H -.endm - -.macro _xts64_precrypt_two dst_reg, tweak_buf, tmp - - // Load the next two source blocks - vld1.8 {\dst_reg}, [SRC]! - - // Save the current two tweaks in the tweak buffer - vst1.8 {TWEAKV}, [\tweak_buf:128]! - - // XOR the next two source blocks with the current two tweaks - veor \dst_reg, TWEAKV - - /* - * Calculate the next two tweaks by multiplying the current ones by x^2, - * modulo p(x) = x^64 + x^4 + x^3 + x + 1. - */ - vshr.u64 \tmp, TWEAKV, #62 - vshl.u64 TWEAKV, #2 - vtbl.8 \tmp\()_L, {GF64MUL_TABLE}, \tmp\()_L - vtbl.8 \tmp\()_H, {GF64MUL_TABLE}, \tmp\()_H - veor TWEAKV, \tmp -.endm - -/* - * _speck_xts_crypt() - Speck-XTS encryption/decryption - * - * Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer - * using Speck-XTS, specifically the variant with a block size of '2n' and round - * count given by NROUNDS. The expanded round keys are given in ROUND_KEYS, and - * the current XTS tweak value is given in TWEAK. It's assumed that NBYTES is a - * nonzero multiple of 128. - */ -.macro _speck_xts_crypt n, decrypting - push {r4-r7} - mov r7, sp - - /* - * The first four parameters were passed in registers r0-r3. Load the - * additional parameters, which were passed on the stack. - */ - ldr NBYTES, [sp, #16] - ldr TWEAK, [sp, #20] - - /* - * If decrypting, modify the ROUND_KEYS parameter to point to the last - * round key rather than the first, since for decryption the round keys - * are used in reverse order. - */ -.if \decrypting -.if \n == 64 - add ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #3 - sub ROUND_KEYS, #8 -.else - add ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #2 - sub ROUND_KEYS, #4 -.endif -.endif - - // Load the index vector for vtbl-based 8-bit rotates -.if \decrypting - ldr r12, =.Lrol\n\()_8_table -.else - ldr r12, =.Lror\n\()_8_table -.endif - vld1.8 {ROTATE_TABLE}, [r12:64] - - // One-time XTS preparation - - /* - * Allocate stack space to store 128 bytes worth of tweaks. For - * performance, this space is aligned to a 16-byte boundary so that we - * can use the load/store instructions that declare 16-byte alignment. - * For Thumb2 compatibility, don't do the 'bic' directly on 'sp'. - */ - sub r12, sp, #128 - bic r12, #0xf - mov sp, r12 - -.if \n == 64 - // Load first tweak - vld1.8 {TWEAKV}, [TWEAK] - - // Load GF(2^128) multiplication table - ldr r12, =.Lgf128mul_table - vld1.8 {GF128MUL_TABLE}, [r12:64] -.else - // Load first tweak - vld1.8 {TWEAKV_L}, [TWEAK] - - // Load GF(2^64) multiplication table - ldr r12, =.Lgf64mul_table - vld1.8 {GF64MUL_TABLE}, [r12:64] - - // Calculate second tweak, packing it together with the first - vshr.u64 TMP0_L, TWEAKV_L, #63 - vtbl.u8 TMP0_L, {GF64MUL_TABLE}, TMP0_L - vshl.u64 TWEAKV_H, TWEAKV_L, #1 - veor TWEAKV_H, TMP0_L -.endif - -.Lnext_128bytes_\@: - - /* - * Load the source blocks into {X,Y}[0-3], XOR them with their XTS tweak - * values, and save the tweaks on the stack for later. Then - * de-interleave the 'x' and 'y' elements of each block, i.e. make it so - * that the X[0-3] registers contain only the second halves of blocks, - * and the Y[0-3] registers contain only the first halves of blocks. - * (Speck uses the order (y, x) rather than the more intuitive (x, y).) - */ - mov r12, sp -.if \n == 64 - _xts128_precrypt_one X0, r12, TMP0 - _xts128_precrypt_one Y0, r12, TMP0 - _xts128_precrypt_one X1, r12, TMP0 - _xts128_precrypt_one Y1, r12, TMP0 - _xts128_precrypt_one X2, r12, TMP0 - _xts128_precrypt_one Y2, r12, TMP0 - _xts128_precrypt_one X3, r12, TMP0 - _xts128_precrypt_one Y3, r12, TMP0 - vswp X0_L, Y0_H - vswp X1_L, Y1_H - vswp X2_L, Y2_H - vswp X3_L, Y3_H -.else - _xts64_precrypt_two X0, r12, TMP0 - _xts64_precrypt_two Y0, r12, TMP0 - _xts64_precrypt_two X1, r12, TMP0 - _xts64_precrypt_two Y1, r12, TMP0 - _xts64_precrypt_two X2, r12, TMP0 - _xts64_precrypt_two Y2, r12, TMP0 - _xts64_precrypt_two X3, r12, TMP0 - _xts64_precrypt_two Y3, r12, TMP0 - vuzp.32 Y0, X0 - vuzp.32 Y1, X1 - vuzp.32 Y2, X2 - vuzp.32 Y3, X3 -.endif - - // Do the cipher rounds - - mov r12, ROUND_KEYS - mov r6, NROUNDS - -.Lnext_round_\@: -.if \decrypting -.if \n == 64 - vld1.64 ROUND_KEY_L, [r12] - sub r12, #8 - vmov ROUND_KEY_H, ROUND_KEY_L -.else - vld1.32 {ROUND_KEY_L[],ROUND_KEY_H[]}, [r12] - sub r12, #4 -.endif - _speck_unround_128bytes \n -.else -.if \n == 64 - vld1.64 ROUND_KEY_L, [r12]! - vmov ROUND_KEY_H, ROUND_KEY_L -.else - vld1.32 {ROUND_KEY_L[],ROUND_KEY_H[]}, [r12]! -.endif - _speck_round_128bytes \n -.endif - subs r6, r6, #1 - bne .Lnext_round_\@ - - // Re-interleave the 'x' and 'y' elements of each block -.if \n == 64 - vswp X0_L, Y0_H - vswp X1_L, Y1_H - vswp X2_L, Y2_H - vswp X3_L, Y3_H -.else - vzip.32 Y0, X0 - vzip.32 Y1, X1 - vzip.32 Y2, X2 - vzip.32 Y3, X3 -.endif - - // XOR the encrypted/decrypted blocks with the tweaks we saved earlier - mov r12, sp - vld1.8 {TMP0, TMP1}, [r12:128]! - vld1.8 {TMP2, TMP3}, [r12:128]! - veor X0, TMP0 - veor Y0, TMP1 - veor X1, TMP2 - veor Y1, TMP3 - vld1.8 {TMP0, TMP1}, [r12:128]! - vld1.8 {TMP2, TMP3}, [r12:128]! - veor X2, TMP0 - veor Y2, TMP1 - veor X3, TMP2 - veor Y3, TMP3 - - // Store the ciphertext in the destination buffer - vst1.8 {X0, Y0}, [DST]! - vst1.8 {X1, Y1}, [DST]! - vst1.8 {X2, Y2}, [DST]! - vst1.8 {X3, Y3}, [DST]! - - // Continue if there are more 128-byte chunks remaining, else return - subs NBYTES, #128 - bne .Lnext_128bytes_\@ - - // Store the next tweak -.if \n == 64 - vst1.8 {TWEAKV}, [TWEAK] -.else - vst1.8 {TWEAKV_L}, [TWEAK] -.endif - - mov sp, r7 - pop {r4-r7} - bx lr -.endm - -ENTRY(speck128_xts_encrypt_neon) - _speck_xts_crypt n=64, decrypting=0 -ENDPROC(speck128_xts_encrypt_neon) - -ENTRY(speck128_xts_decrypt_neon) - _speck_xts_crypt n=64, decrypting=1 -ENDPROC(speck128_xts_decrypt_neon) - -ENTRY(speck64_xts_encrypt_neon) - _speck_xts_crypt n=32, decrypting=0 -ENDPROC(speck64_xts_encrypt_neon) - -ENTRY(speck64_xts_decrypt_neon) - _speck_xts_crypt n=32, decrypting=1 -ENDPROC(speck64_xts_decrypt_neon) diff --git a/arch/arm/crypto/speck-neon-glue.c b/arch/arm/crypto/speck-neon-glue.c deleted file mode 100644 index f012c3ea998f..000000000000 --- a/arch/arm/crypto/speck-neon-glue.c +++ /dev/null @@ -1,288 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NEON-accelerated implementation of Speck128-XTS and Speck64-XTS - * - * Copyright (c) 2018 Google, Inc - * - * Note: the NIST recommendation for XTS only specifies a 128-bit block size, - * but a 64-bit version (needed for Speck64) is fairly straightforward; the math - * is just done in GF(2^64) instead of GF(2^128), with the reducing polynomial - * x^64 + x^4 + x^3 + x + 1 from the original XEX paper (Rogaway, 2004: - * "Efficient Instantiations of Tweakable Blockciphers and Refinements to Modes - * OCB and PMAC"), represented as 0x1B. - */ - -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <asm/simd.h> -#include <crypto/algapi.h> -#include <crypto/gf128mul.h> -#include <crypto/internal/skcipher.h> -#include <crypto/speck.h> -#include <crypto/xts.h> -#include <linux/kernel.h> -#include <linux/module.h> - -/* The assembly functions only handle multiples of 128 bytes */ -#define SPECK_NEON_CHUNK_SIZE 128 - -/* Speck128 */ - -struct speck128_xts_tfm_ctx { - struct speck128_tfm_ctx main_key; - struct speck128_tfm_ctx tweak_key; -}; - -asmlinkage void speck128_xts_encrypt_neon(const u64 *round_keys, int nrounds, - void *dst, const void *src, - unsigned int nbytes, void *tweak); - -asmlinkage void speck128_xts_decrypt_neon(const u64 *round_keys, int nrounds, - void *dst, const void *src, - unsigned int nbytes, void *tweak); - -typedef void (*speck128_crypt_one_t)(const struct speck128_tfm_ctx *, - u8 *, const u8 *); -typedef void (*speck128_xts_crypt_many_t)(const u64 *, int, void *, - const void *, unsigned int, void *); - -static __always_inline int -__speck128_xts_crypt(struct skcipher_request *req, - speck128_crypt_one_t crypt_one, - speck128_xts_crypt_many_t crypt_many) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - const struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - le128 tweak; - int err; - - err = skcipher_walk_virt(&walk, req, true); - - crypto_speck128_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - u8 *dst = walk.dst.virt.addr; - const u8 *src = walk.src.virt.addr; - - if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) { - unsigned int count; - - count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE); - kernel_neon_begin(); - (*crypt_many)(ctx->main_key.round_keys, - ctx->main_key.nrounds, - dst, src, count, &tweak); - kernel_neon_end(); - dst += count; - src += count; - nbytes -= count; - } - - /* Handle any remainder with generic code */ - while (nbytes >= sizeof(tweak)) { - le128_xor((le128 *)dst, (const le128 *)src, &tweak); - (*crypt_one)(&ctx->main_key, dst, dst); - le128_xor((le128 *)dst, (const le128 *)dst, &tweak); - gf128mul_x_ble(&tweak, &tweak); - - dst += sizeof(tweak); - src += sizeof(tweak); - nbytes -= sizeof(tweak); - } - err = skcipher_walk_done(&walk, nbytes); - } - - return err; -} - -static int speck128_xts_encrypt(struct skcipher_request *req) -{ - return __speck128_xts_crypt(req, crypto_speck128_encrypt, - speck128_xts_encrypt_neon); -} - -static int speck128_xts_decrypt(struct skcipher_request *req) -{ - return __speck128_xts_crypt(req, crypto_speck128_decrypt, - speck128_xts_decrypt_neon); -} - -static int speck128_xts_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen) -{ - struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = xts_verify_key(tfm, key, keylen); - if (err) - return err; - - keylen /= 2; - - err = crypto_speck128_setkey(&ctx->main_key, key, keylen); - if (err) - return err; - - return crypto_speck128_setkey(&ctx->tweak_key, key + keylen, keylen); -} - -/* Speck64 */ - -struct speck64_xts_tfm_ctx { - struct speck64_tfm_ctx main_key; - struct speck64_tfm_ctx tweak_key; -}; - -asmlinkage void speck64_xts_encrypt_neon(const u32 *round_keys, int nrounds, - void *dst, const void *src, - unsigned int nbytes, void *tweak); - -asmlinkage void speck64_xts_decrypt_neon(const u32 *round_keys, int nrounds, - void *dst, const void *src, - unsigned int nbytes, void *tweak); - -typedef void (*speck64_crypt_one_t)(const struct speck64_tfm_ctx *, - u8 *, const u8 *); -typedef void (*speck64_xts_crypt_many_t)(const u32 *, int, void *, - const void *, unsigned int, void *); - -static __always_inline int -__speck64_xts_crypt(struct skcipher_request *req, speck64_crypt_one_t crypt_one, - speck64_xts_crypt_many_t crypt_many) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - const struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - __le64 tweak; - int err; - - err = skcipher_walk_virt(&walk, req, true); - - crypto_speck64_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - u8 *dst = walk.dst.virt.addr; - const u8 *src = walk.src.virt.addr; - - if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) { - unsigned int count; - - count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE); - kernel_neon_begin(); - (*crypt_many)(ctx->main_key.round_keys, - ctx->main_key.nrounds, - dst, src, count, &tweak); - kernel_neon_end(); - dst += count; - src += count; - nbytes -= count; - } - - /* Handle any remainder with generic code */ - while (nbytes >= sizeof(tweak)) { - *(__le64 *)dst = *(__le64 *)src ^ tweak; - (*crypt_one)(&ctx->main_key, dst, dst); - *(__le64 *)dst ^= tweak; - tweak = cpu_to_le64((le64_to_cpu(tweak) << 1) ^ - ((tweak & cpu_to_le64(1ULL << 63)) ? - 0x1B : 0)); - dst += sizeof(tweak); - src += sizeof(tweak); - nbytes -= sizeof(tweak); - } - err = skcipher_walk_done(&walk, nbytes); - } - - return err; -} - -static int speck64_xts_encrypt(struct skcipher_request *req) -{ - return __speck64_xts_crypt(req, crypto_speck64_encrypt, - speck64_xts_encrypt_neon); -} - -static int speck64_xts_decrypt(struct skcipher_request *req) -{ - return __speck64_xts_crypt(req, crypto_speck64_decrypt, - speck64_xts_decrypt_neon); -} - -static int speck64_xts_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen) -{ - struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = xts_verify_key(tfm, key, keylen); - if (err) - return err; - - keylen /= 2; - - err = crypto_speck64_setkey(&ctx->main_key, key, keylen); - if (err) - return err; - - return crypto_speck64_setkey(&ctx->tweak_key, key + keylen, keylen); -} - -static struct skcipher_alg speck_algs[] = { - { - .base.cra_name = "xts(speck128)", - .base.cra_driver_name = "xts-speck128-neon", - .base.cra_priority = 300, - .base.cra_blocksize = SPECK128_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct speck128_xts_tfm_ctx), - .base.cra_alignmask = 7, - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * SPECK128_128_KEY_SIZE, - .max_keysize = 2 * SPECK128_256_KEY_SIZE, - .ivsize = SPECK128_BLOCK_SIZE, - .walksize = SPECK_NEON_CHUNK_SIZE, - .setkey = speck128_xts_setkey, - .encrypt = speck128_xts_encrypt, - .decrypt = speck128_xts_decrypt, - }, { - .base.cra_name = "xts(speck64)", - .base.cra_driver_name = "xts-speck64-neon", - .base.cra_priority = 300, - .base.cra_blocksize = SPECK64_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct speck64_xts_tfm_ctx), - .base.cra_alignmask = 7, - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * SPECK64_96_KEY_SIZE, - .max_keysize = 2 * SPECK64_128_KEY_SIZE, - .ivsize = SPECK64_BLOCK_SIZE, - .walksize = SPECK_NEON_CHUNK_SIZE, - .setkey = speck64_xts_setkey, - .encrypt = speck64_xts_encrypt, - .decrypt = speck64_xts_decrypt, - } -}; - -static int __init speck_neon_module_init(void) -{ - if (!(elf_hwcap & HWCAP_NEON)) - return -ENODEV; - return crypto_register_skciphers(speck_algs, ARRAY_SIZE(speck_algs)); -} - -static void __exit speck_neon_module_exit(void) -{ - crypto_unregister_skciphers(speck_algs, ARRAY_SIZE(speck_algs)); -} - -module_init(speck_neon_module_init); -module_exit(speck_neon_module_exit); - -MODULE_DESCRIPTION("Speck block cipher (NEON-accelerated)"); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>"); -MODULE_ALIAS_CRYPTO("xts(speck128)"); -MODULE_ALIAS_CRYPTO("xts-speck128-neon"); -MODULE_ALIAS_CRYPTO("xts(speck64)"); -MODULE_ALIAS_CRYPTO("xts-speck64-neon"); diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index 2d43dca29c72..b95f8d0d9f17 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -133,8 +133,7 @@ * space. */ #define KVM_PHYS_SHIFT (40) -#define KVM_PHYS_SIZE (_AC(1, ULL) << KVM_PHYS_SHIFT) -#define KVM_PHYS_MASK (KVM_PHYS_SIZE - _AC(1, ULL)) + #define PTRS_PER_S2_PGD (_AC(1, ULL) << (KVM_PHYS_SHIFT - 30)) /* Virtualization Translation Control Register (VTCR) bits */ diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 3ad482d2f1eb..5ca5d9af0c26 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -273,7 +273,7 @@ static inline void __cpu_init_stage2(void) kvm_call_hyp(__init_stage2_translation); } -static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) +static inline int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) { return 0; } @@ -354,4 +354,15 @@ static inline void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) {} struct kvm *kvm_arch_alloc_vm(void); void kvm_arch_free_vm(struct kvm *kvm); +static inline int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) +{ + /* + * On 32bit ARM, VMs get a static 40bit IPA stage2 setup, + * so any non-zero value used as type is illegal. + */ + if (type) + return -EINVAL; + return 0; +} + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 847f01fa429d..1098ffc3d54b 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -35,16 +35,12 @@ addr; \ }) -/* - * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels. - */ -#define KVM_MMU_CACHE_MIN_PAGES 2 - #ifndef __ASSEMBLY__ #include <linux/highmem.h> #include <asm/cacheflush.h> #include <asm/cputype.h> +#include <asm/kvm_arm.h> #include <asm/kvm_hyp.h> #include <asm/pgalloc.h> #include <asm/stage2_pgtable.h> @@ -52,6 +48,13 @@ /* Ensure compatibility with arm64 */ #define VA_BITS 32 +#define kvm_phys_shift(kvm) KVM_PHYS_SHIFT +#define kvm_phys_size(kvm) (1ULL << kvm_phys_shift(kvm)) +#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - 1ULL) +#define kvm_vttbr_baddr_mask(kvm) VTTBR_BADDR_MASK + +#define stage2_pgd_size(kvm) (PTRS_PER_S2_PGD * sizeof(pgd_t)) + int create_hyp_mappings(void *from, void *to, pgprot_t prot); int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, void __iomem **kaddr, @@ -355,6 +358,8 @@ static inline int hyp_map_aux_data(void) #define kvm_phys_to_vttbr(addr) (addr) +static inline void kvm_set_ipa_limit(void) {} + static inline bool kvm_cpu_has_cnp(void) { return false; diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h index 460d616bb2d6..f6a7ea805232 100644 --- a/arch/arm/include/asm/stage2_pgtable.h +++ b/arch/arm/include/asm/stage2_pgtable.h @@ -19,43 +19,53 @@ #ifndef __ARM_S2_PGTABLE_H_ #define __ARM_S2_PGTABLE_H_ -#define stage2_pgd_none(pgd) pgd_none(pgd) -#define stage2_pgd_clear(pgd) pgd_clear(pgd) -#define stage2_pgd_present(pgd) pgd_present(pgd) -#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) -#define stage2_pud_offset(pgd, address) pud_offset(pgd, address) -#define stage2_pud_free(pud) pud_free(NULL, pud) - -#define stage2_pud_none(pud) pud_none(pud) -#define stage2_pud_clear(pud) pud_clear(pud) -#define stage2_pud_present(pud) pud_present(pud) -#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd) -#define stage2_pmd_offset(pud, address) pmd_offset(pud, address) -#define stage2_pmd_free(pmd) pmd_free(NULL, pmd) - -#define stage2_pud_huge(pud) pud_huge(pud) +/* + * kvm_mmu_cache_min_pages() is the number of pages required + * to install a stage-2 translation. We pre-allocate the entry + * level table at VM creation. Since we have a 3 level page-table, + * we need only two pages to add a new mapping. + */ +#define kvm_mmu_cache_min_pages(kvm) 2 + +#define stage2_pgd_none(kvm, pgd) pgd_none(pgd) +#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd) +#define stage2_pgd_present(kvm, pgd) pgd_present(pgd) +#define stage2_pgd_populate(kvm, pgd, pud) pgd_populate(NULL, pgd, pud) +#define stage2_pud_offset(kvm, pgd, address) pud_offset(pgd, address) +#define stage2_pud_free(kvm, pud) pud_free(NULL, pud) + +#define stage2_pud_none(kvm, pud) pud_none(pud) +#define stage2_pud_clear(kvm, pud) pud_clear(pud) +#define stage2_pud_present(kvm, pud) pud_present(pud) +#define stage2_pud_populate(kvm, pud, pmd) pud_populate(NULL, pud, pmd) +#define stage2_pmd_offset(kvm, pud, address) pmd_offset(pud, address) +#define stage2_pmd_free(kvm, pmd) pmd_free(NULL, pmd) + +#define stage2_pud_huge(kvm, pud) pud_huge(pud) /* Open coded p*d_addr_end that can deal with 64bit addresses */ -static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline phys_addr_t +stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { phys_addr_t boundary = (addr + PGDIR_SIZE) & PGDIR_MASK; return (boundary - 1 < end - 1) ? boundary : end; } -#define stage2_pud_addr_end(addr, end) (end) +#define stage2_pud_addr_end(kvm, addr, end) (end) -static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline phys_addr_t +stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { phys_addr_t boundary = (addr + PMD_SIZE) & PMD_MASK; return (boundary - 1 < end - 1) ? boundary : end; } -#define stage2_pgd_index(addr) pgd_index(addr) +#define stage2_pgd_index(kvm, addr) pgd_index(addr) -#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) -#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) -#define stage2_pud_table_empty(pudp) false +#define stage2_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) +#define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) +#define stage2_pud_table_empty(kvm, pudp) false #endif /* __ARM_S2_PGTABLE_H_ */ diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index 076090d2dbf5..88ef2ce1f69a 100644 --- a/arch/arm/include/asm/unistd.h +++ b/arch/arm/include/asm/unistd.h @@ -16,23 +16,23 @@ #include <uapi/asm/unistd.h> #include <asm/unistd-nr.h> +#define __ARCH_WANT_NEW_STAT #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE #define __ARCH_WANT_SYS_GETPGRP -#define __ARCH_WANT_SYS_LLSEEK #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_OLD_MMAP #define __ARCH_WANT_SYS_OLD_SELECT +#define __ARCH_WANT_SYS_UTIME #if !defined(CONFIG_AEABI) || defined(CONFIG_OABI_COMPAT) #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_ALARM -#define __ARCH_WANT_SYS_UTIME #define __ARCH_WANT_SYS_OLD_GETRLIMIT #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_SYS_SOCKETCALL diff --git a/arch/arm/kernel/devtree.c b/arch/arm/kernel/devtree.c index ecaa68dd1af5..13bcd3b867cb 100644 --- a/arch/arm/kernel/devtree.c +++ b/arch/arm/kernel/devtree.c @@ -87,14 +87,11 @@ void __init arm_dt_init_cpu_maps(void) if (!cpus) return; - for_each_child_of_node(cpus, cpu) { + for_each_of_cpu_node(cpu) { const __be32 *cell; int prop_bytes; u32 hwid; - if (of_node_cmp(cpu->type, "cpu")) - continue; - pr_debug(" * %pOF...\n", cpu); /* * A device tree containing CPU nodes with missing "reg" diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 24ac3cab411d..60e375ce1ab2 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -94,12 +94,6 @@ static void __init parse_dt_topology(void) __cpu_capacity = kcalloc(nr_cpu_ids, sizeof(*__cpu_capacity), GFP_NOWAIT); - cn = of_find_node_by_path("/cpus"); - if (!cn) { - pr_err("No CPU information found in DT\n"); - return; - } - for_each_possible_cpu(cpu) { const u32 *rate; int len; diff --git a/arch/arm/mach-at91/pm_suspend.S b/arch/arm/mach-at91/pm_suspend.S index a7c6ae13c945..bfe1c4d06901 100644 --- a/arch/arm/mach-at91/pm_suspend.S +++ b/arch/arm/mach-at91/pm_suspend.S @@ -149,6 +149,14 @@ exit_suspend: ENDPROC(at91_pm_suspend_in_sram) ENTRY(at91_backup_mode) + /* Switch the master clock source to slow clock. */ + ldr pmc, .pmc_base + ldr tmp1, [pmc, #AT91_PMC_MCKR] + bic tmp1, tmp1, #AT91_PMC_CSS + str tmp1, [pmc, #AT91_PMC_MCKR] + + wait_mckrdy + /*BUMEN*/ ldr r0, .sfr mov tmp1, #0x1 diff --git a/arch/arm/mach-mmp/devices.c b/arch/arm/mach-mmp/devices.c index 671c7a09ab3d..0fca63c80e1a 100644 --- a/arch/arm/mach-mmp/devices.c +++ b/arch/arm/mach-mmp/devices.c @@ -277,21 +277,12 @@ struct platform_device pxa168_device_u2o = { #if IS_ENABLED(CONFIG_USB_EHCI_MV_U2O) struct resource pxa168_u2oehci_resources[] = { - /* regbase */ [0] = { - .start = PXA168_U2O_REGBASE + U2x_CAPREGS_OFFSET, + .start = PXA168_U2O_REGBASE, .end = PXA168_U2O_REGBASE + USB_REG_RANGE, .flags = IORESOURCE_MEM, - .name = "capregs", }, - /* phybase */ [1] = { - .start = PXA168_U2O_PHYBASE, - .end = PXA168_U2O_PHYBASE + USB_PHY_RANGE, - .flags = IORESOURCE_MEM, - .name = "phyregs", - }, - [2] = { .start = IRQ_PXA168_USB1, .end = IRQ_PXA168_USB1, .flags = IORESOURCE_IRQ, diff --git a/arch/arm/mach-shmobile/pm-rcar-gen2.c b/arch/arm/mach-shmobile/pm-rcar-gen2.c index 345af3ebcc3a..7efe95bd584f 100644 --- a/arch/arm/mach-shmobile/pm-rcar-gen2.c +++ b/arch/arm/mach-shmobile/pm-rcar-gen2.c @@ -50,7 +50,7 @@ void __init rcar_gen2_pm_init(void) void __iomem *p; u32 bar; static int once; - struct device_node *np, *cpus; + struct device_node *np; bool has_a7 = false; bool has_a15 = false; struct resource res; @@ -59,11 +59,7 @@ void __init rcar_gen2_pm_init(void) if (once++) return; - cpus = of_find_node_by_path("/cpus"); - if (!cpus) - return; - - for_each_child_of_node(cpus, np) { + for_each_of_cpu_node(np) { if (of_device_is_compatible(np, "arm,cortex-a15")) has_a15 = true; else if (of_device_is_compatible(np, "arm,cortex-a7")) diff --git a/arch/arm/mach-shmobile/pm-rmobile.c b/arch/arm/mach-shmobile/pm-rmobile.c index e348bcfe389d..94fdeef11b81 100644 --- a/arch/arm/mach-shmobile/pm-rmobile.c +++ b/arch/arm/mach-shmobile/pm-rmobile.c @@ -202,7 +202,7 @@ static void __init get_special_pds(void) const struct of_device_id *id; /* PM domains containing CPUs */ - for_each_node_by_type(np, "cpu") + for_each_of_cpu_node(np) add_special_pd(np, PD_CPU); /* PM domain containing console */ diff --git a/arch/arm/mach-shmobile/timer.c b/arch/arm/mach-shmobile/timer.c index 828e8aea037e..e48b0939693f 100644 --- a/arch/arm/mach-shmobile/timer.c +++ b/arch/arm/mach-shmobile/timer.c @@ -22,22 +22,16 @@ void __init shmobile_init_delay(void) { - struct device_node *np, *cpus; + struct device_node *np; u32 max_freq = 0; - cpus = of_find_node_by_path("/cpus"); - if (!cpus) - return; - - for_each_child_of_node(cpus, np) { + for_each_of_cpu_node(np) { u32 freq; if (!of_property_read_u32(np, "clock-frequency", &freq)) max_freq = max(max_freq, freq); } - of_node_put(cpus); - if (!max_freq) return; |