diff options
Diffstat (limited to 'arch/x86')
39 files changed, 3316 insertions, 662 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 5a4a089e8b1f..9a2838cf0591 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o +obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o @@ -30,6 +31,7 @@ obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o +obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) @@ -60,6 +62,7 @@ blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o +chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o ifeq ($(avx_supported),yes) @@ -75,6 +78,7 @@ endif ifeq ($(avx2_supported),yes) camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o + chacha20-x86_64-y += chacha20-avx2-x86_64.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o endif @@ -82,8 +86,10 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o +poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o ifeq ($(avx2_supported),yes) sha1-ssse3-y += sha1_avx2_x86_64_asm.o +poly1305-x86_64-y += poly1305-avx2-x86_64.o endif crc32c-intel-y := crc32c-intel_glue.o crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index dccad38b59a8..3633ad6145c5 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -803,10 +803,7 @@ static int rfc4106_init(struct crypto_aead *aead) return PTR_ERR(cryptd_tfm); *ctx = cryptd_tfm; - crypto_aead_set_reqsize( - aead, - sizeof(struct aead_request) + - crypto_aead_reqsize(&cryptd_tfm->base)); + crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); return 0; } @@ -955,8 +952,8 @@ static int helper_rfc4106_encrypt(struct aead_request *req) /* Assuming we are supporting rfc4106 64-bit extended */ /* sequence numbers We need to have the AAD length equal */ - /* to 8 or 12 bytes */ - if (unlikely(req->assoclen != 8 && req->assoclen != 12)) + /* to 16 or 20 bytes */ + if (unlikely(req->assoclen != 16 && req->assoclen != 20)) return -EINVAL; /* IV below built */ @@ -992,9 +989,9 @@ static int helper_rfc4106_encrypt(struct aead_request *req) } kernel_fpu_begin(); - aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, - ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst - + ((unsigned long)req->cryptlen), auth_tag_len); + aesni_gcm_enc_tfm(aes_ctx, dst, src, req->cryptlen, iv, + ctx->hash_subkey, assoc, req->assoclen - 8, + dst + req->cryptlen, auth_tag_len); kernel_fpu_end(); /* The authTag (aka the Integrity Check Value) needs to be written @@ -1033,12 +1030,12 @@ static int helper_rfc4106_decrypt(struct aead_request *req) struct scatter_walk dst_sg_walk; unsigned int i; - if (unlikely(req->assoclen != 8 && req->assoclen != 12)) + if (unlikely(req->assoclen != 16 && req->assoclen != 20)) return -EINVAL; /* Assuming we are supporting rfc4106 64-bit extended */ /* sequence numbers We need to have the AAD length */ - /* equal to 8 or 12 bytes */ + /* equal to 16 or 20 bytes */ tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len); /* IV below built */ @@ -1075,8 +1072,8 @@ static int helper_rfc4106_decrypt(struct aead_request *req) kernel_fpu_begin(); aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv, - ctx->hash_subkey, assoc, (unsigned long)req->assoclen, - authTag, auth_tag_len); + ctx->hash_subkey, assoc, req->assoclen - 8, + authTag, auth_tag_len); kernel_fpu_end(); /* Compare generated tag with passed in tag. */ @@ -1105,19 +1102,12 @@ static int rfc4106_encrypt(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cryptd_aead **ctx = crypto_aead_ctx(tfm); struct cryptd_aead *cryptd_tfm = *ctx; - struct aead_request *subreq = aead_request_ctx(req); - aead_request_set_tfm(subreq, irq_fpu_usable() ? - cryptd_aead_child(cryptd_tfm) : - &cryptd_tfm->base); + aead_request_set_tfm(req, irq_fpu_usable() ? + cryptd_aead_child(cryptd_tfm) : + &cryptd_tfm->base); - aead_request_set_callback(subreq, req->base.flags, - req->base.complete, req->base.data); - aead_request_set_crypt(subreq, req->src, req->dst, - req->cryptlen, req->iv); - aead_request_set_ad(subreq, req->assoclen); - - return crypto_aead_encrypt(subreq); + return crypto_aead_encrypt(req); } static int rfc4106_decrypt(struct aead_request *req) @@ -1125,19 +1115,12 @@ static int rfc4106_decrypt(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cryptd_aead **ctx = crypto_aead_ctx(tfm); struct cryptd_aead *cryptd_tfm = *ctx; - struct aead_request *subreq = aead_request_ctx(req); - - aead_request_set_tfm(subreq, irq_fpu_usable() ? - cryptd_aead_child(cryptd_tfm) : - &cryptd_tfm->base); - aead_request_set_callback(subreq, req->base.flags, - req->base.complete, req->base.data); - aead_request_set_crypt(subreq, req->src, req->dst, - req->cryptlen, req->iv); - aead_request_set_ad(subreq, req->assoclen); + aead_request_set_tfm(req, irq_fpu_usable() ? + cryptd_aead_child(cryptd_tfm) : + &cryptd_tfm->base); - return crypto_aead_decrypt(subreq); + return crypto_aead_decrypt(req); } #endif diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S new file mode 100644 index 000000000000..16694e625f77 --- /dev/null +++ b/arch/x86/crypto/chacha20-avx2-x86_64.S @@ -0,0 +1,443 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> + +.data +.align 32 + +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 + .octa 0x0e0d0c0f0a09080b0605040702010003 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 + .octa 0x0d0c0f0e09080b0a0504070601000302 +CTRINC: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.text + +ENTRY(chacha20_8block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: 8 data blocks output, o + # %rdx: 8 data blocks input, i + + # This function encrypts eight consecutive ChaCha20 blocks by loading + # the state matrix in AVX registers eight times. As we need some + # scratch registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32-, 64- and then 128-bit + # words, which allows us to do XOR in AVX registers. 8/16-bit word + # rotation is done with the slightly better performing byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. + + vzeroupper + # 4 * 32 byte stack, 32-byte aligned + mov %rsp, %r8 + and $~31, %rsp + sub $0x80, %rsp + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + # x0..3 on stack + vmovdqa %ymm0,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm3,0x60(%rsp) + + vmovdqa CTRINC(%rip),%ymm1 + vmovdqa ROT8(%rip),%ymm2 + vmovdqa ROT16(%rip),%ymm3 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + + mov $10,%ecx + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0 + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + dec %ecx + jnz .Ldoubleround8 + + # x0..15[0-3] += s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpaddd 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpbroadcastd 0x04(%rdi),%ymm0 + vpaddd 0x20(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpbroadcastd 0x08(%rdi),%ymm0 + vpaddd 0x40(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpbroadcastd 0x0c(%rdi),%ymm0 + vpaddd 0x60(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpbroadcastd 0x10(%rdi),%ymm0 + vpaddd %ymm0,%ymm4,%ymm4 + vpbroadcastd 0x14(%rdi),%ymm0 + vpaddd %ymm0,%ymm5,%ymm5 + vpbroadcastd 0x18(%rdi),%ymm0 + vpaddd %ymm0,%ymm6,%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm0 + vpaddd %ymm0,%ymm7,%ymm7 + vpbroadcastd 0x20(%rdi),%ymm0 + vpaddd %ymm0,%ymm8,%ymm8 + vpbroadcastd 0x24(%rdi),%ymm0 + vpaddd %ymm0,%ymm9,%ymm9 + vpbroadcastd 0x28(%rdi),%ymm0 + vpaddd %ymm0,%ymm10,%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm0 + vpaddd %ymm0,%ymm11,%ymm11 + vpbroadcastd 0x30(%rdi),%ymm0 + vpaddd %ymm0,%ymm12,%ymm12 + vpbroadcastd 0x34(%rdi),%ymm0 + vpaddd %ymm0,%ymm13,%ymm13 + vpbroadcastd 0x38(%rdi),%ymm0 + vpaddd %ymm0,%ymm14,%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm0 + vpaddd %ymm0,%ymm15,%ymm15 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + + # interleave 32-bit words in state n, n+1 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x20(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa 0x40(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm1,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpckldq %ymm5,%ymm0,%ymm4 + vpunpckhdq %ymm5,%ymm0,%ymm5 + vmovdqa %ymm6,%ymm0 + vpunpckldq %ymm7,%ymm0,%ymm6 + vpunpckhdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpckldq %ymm9,%ymm0,%ymm8 + vpunpckhdq %ymm9,%ymm0,%ymm9 + vmovdqa %ymm10,%ymm0 + vpunpckldq %ymm11,%ymm0,%ymm10 + vpunpckhdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpckldq %ymm13,%ymm0,%ymm12 + vpunpckhdq %ymm13,%ymm0,%ymm13 + vmovdqa %ymm14,%ymm0 + vpunpckldq %ymm15,%ymm0,%ymm14 + vpunpckhdq %ymm15,%ymm0,%ymm15 + + # interleave 64-bit words in state n, n+2 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x40(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x00(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa 0x20(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpcklqdq %ymm6,%ymm0,%ymm4 + vpunpckhqdq %ymm6,%ymm0,%ymm6 + vmovdqa %ymm5,%ymm0 + vpunpcklqdq %ymm7,%ymm0,%ymm5 + vpunpckhqdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpcklqdq %ymm10,%ymm0,%ymm8 + vpunpckhqdq %ymm10,%ymm0,%ymm10 + vmovdqa %ymm9,%ymm0 + vpunpcklqdq %ymm11,%ymm0,%ymm9 + vpunpckhqdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpcklqdq %ymm14,%ymm0,%ymm12 + vpunpckhqdq %ymm14,%ymm0,%ymm14 + vmovdqa %ymm13,%ymm0 + vpunpcklqdq %ymm15,%ymm0,%ymm13 + vpunpckhqdq %ymm15,%ymm0,%ymm15 + + # interleave 128-bit words in state n, n+4 + vmovdqa 0x00(%rsp),%ymm0 + vperm2i128 $0x20,%ymm4,%ymm0,%ymm1 + vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 + vmovdqa %ymm1,0x00(%rsp) + vmovdqa 0x20(%rsp),%ymm0 + vperm2i128 $0x20,%ymm5,%ymm0,%ymm1 + vperm2i128 $0x31,%ymm5,%ymm0,%ymm5 + vmovdqa %ymm1,0x20(%rsp) + vmovdqa 0x40(%rsp),%ymm0 + vperm2i128 $0x20,%ymm6,%ymm0,%ymm1 + vperm2i128 $0x31,%ymm6,%ymm0,%ymm6 + vmovdqa %ymm1,0x40(%rsp) + vmovdqa 0x60(%rsp),%ymm0 + vperm2i128 $0x20,%ymm7,%ymm0,%ymm1 + vperm2i128 $0x31,%ymm7,%ymm0,%ymm7 + vmovdqa %ymm1,0x60(%rsp) + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + vmovdqa %ymm0,%ymm8 + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + vmovdqa %ymm0,%ymm9 + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + vmovdqa %ymm0,%ymm10 + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + vmovdqa %ymm0,%ymm11 + + # xor with corresponding input, write to output + vmovdqa 0x00(%rsp),%ymm0 + vpxor 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0000(%rsi) + vmovdqa 0x20(%rsp),%ymm0 + vpxor 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0080(%rsi) + vmovdqa 0x40(%rsp),%ymm0 + vpxor 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0040(%rsi) + vmovdqa 0x60(%rsp),%ymm0 + vpxor 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00c0(%rsi) + vpxor 0x0100(%rdx),%ymm4,%ymm4 + vmovdqu %ymm4,0x0100(%rsi) + vpxor 0x0180(%rdx),%ymm5,%ymm5 + vmovdqu %ymm5,0x00180(%rsi) + vpxor 0x0140(%rdx),%ymm6,%ymm6 + vmovdqu %ymm6,0x0140(%rsi) + vpxor 0x01c0(%rdx),%ymm7,%ymm7 + vmovdqu %ymm7,0x01c0(%rsi) + vpxor 0x0020(%rdx),%ymm8,%ymm8 + vmovdqu %ymm8,0x0020(%rsi) + vpxor 0x00a0(%rdx),%ymm9,%ymm9 + vmovdqu %ymm9,0x00a0(%rsi) + vpxor 0x0060(%rdx),%ymm10,%ymm10 + vmovdqu %ymm10,0x0060(%rsi) + vpxor 0x00e0(%rdx),%ymm11,%ymm11 + vmovdqu %ymm11,0x00e0(%rsi) + vpxor 0x0120(%rdx),%ymm12,%ymm12 + vmovdqu %ymm12,0x0120(%rsi) + vpxor 0x01a0(%rdx),%ymm13,%ymm13 + vmovdqu %ymm13,0x01a0(%rsi) + vpxor 0x0160(%rdx),%ymm14,%ymm14 + vmovdqu %ymm14,0x0160(%rsi) + vpxor 0x01e0(%rdx),%ymm15,%ymm15 + vmovdqu %ymm15,0x01e0(%rsi) + + vzeroupper + mov %r8,%rsp + ret +ENDPROC(chacha20_8block_xor_avx2) diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S new file mode 100644 index 000000000000..712b13047b41 --- /dev/null +++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -0,0 +1,625 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> + +.data +.align 16 + +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 +CTRINC: .octa 0x00000003000000020000000100000000 + +.text + +ENTRY(chacha20_block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: 1 data block output, o + # %rdx: 1 data block input, i + + # This function encrypts one ChaCha20 block by loading the state matrix + # in four SSE registers. It performs matrix operation on four words in + # parallel, but requireds shuffling to rearrange the words after each + # round. 8/16-bit word rotation is done with the slightly better + # performing SSSE3 byte shuffling, 7/12-bit word rotation uses + # traditional shift+OR. + + # x0..3 = s0..3 + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + movdqa %xmm0,%xmm8 + movdqa %xmm1,%xmm9 + movdqa %xmm2,%xmm10 + movdqa %xmm3,%xmm11 + + movdqa ROT8(%rip),%xmm4 + movdqa ROT16(%rip),%xmm5 + + mov $10,%ecx + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm3,%xmm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm3,%xmm3 + + dec %ecx + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + movdqu 0x00(%rdx),%xmm4 + paddd %xmm8,%xmm0 + pxor %xmm4,%xmm0 + movdqu %xmm0,0x00(%rsi) + # o1 = i1 ^ (x1 + s1) + movdqu 0x10(%rdx),%xmm5 + paddd %xmm9,%xmm1 + pxor %xmm5,%xmm1 + movdqu %xmm1,0x10(%rsi) + # o2 = i2 ^ (x2 + s2) + movdqu 0x20(%rdx),%xmm6 + paddd %xmm10,%xmm2 + pxor %xmm6,%xmm2 + movdqu %xmm2,0x20(%rsi) + # o3 = i3 ^ (x3 + s3) + movdqu 0x30(%rdx),%xmm7 + paddd %xmm11,%xmm3 + pxor %xmm7,%xmm3 + movdqu %xmm3,0x30(%rsi) + + ret +ENDPROC(chacha20_block_xor_ssse3) + +ENTRY(chacha20_4block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: 4 data blocks output, o + # %rdx: 4 data blocks input, i + + # This function encrypts four consecutive ChaCha20 blocks by loading the + # the state matrix in SSE registers four times. As we need some scratch + # registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32- and then 64-bit words, + # which allows us to do XOR in SSE registers. 8/16-bit word rotation is + # done with the slightly better performing SSSE3 byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. + + sub $0x40,%rsp + + # x0..15[0-3] = s0..3[0..3] + movq 0x00(%rdi),%xmm1 + pshufd $0x00,%xmm1,%xmm0 + pshufd $0x55,%xmm1,%xmm1 + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + movq 0x10(%rdi),%xmm5 + pshufd $0x00,%xmm5,%xmm4 + pshufd $0x55,%xmm5,%xmm5 + movq 0x18(%rdi),%xmm7 + pshufd $0x00,%xmm7,%xmm6 + pshufd $0x55,%xmm7,%xmm7 + movq 0x20(%rdi),%xmm9 + pshufd $0x00,%xmm9,%xmm8 + pshufd $0x55,%xmm9,%xmm9 + movq 0x28(%rdi),%xmm11 + pshufd $0x00,%xmm11,%xmm10 + pshufd $0x55,%xmm11,%xmm11 + movq 0x30(%rdi),%xmm13 + pshufd $0x00,%xmm13,%xmm12 + pshufd $0x55,%xmm13,%xmm13 + movq 0x38(%rdi),%xmm15 + pshufd $0x00,%xmm15,%xmm14 + pshufd $0x55,%xmm15,%xmm15 + # x0..3 on stack + movdqa %xmm0,0x00(%rsp) + movdqa %xmm1,0x10(%rsp) + movdqa %xmm2,0x20(%rsp) + movdqa %xmm3,0x30(%rsp) + + movdqa CTRINC(%rip),%xmm1 + movdqa ROT8(%rip),%xmm2 + movdqa ROT16(%rip),%xmm3 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + + mov $10,%ecx + +.Ldoubleround4: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm7 + por %xmm0,%xmm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + + dec %ecx + jnz .Ldoubleround4 + + # x0[0-3] += s0[0] + # x1[0-3] += s0[1] + movq 0x00(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x00(%rsp),%xmm2 + movdqa %xmm2,0x00(%rsp) + paddd 0x10(%rsp),%xmm3 + movdqa %xmm3,0x10(%rsp) + # x2[0-3] += s0[2] + # x3[0-3] += s0[3] + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x20(%rsp),%xmm2 + movdqa %xmm2,0x20(%rsp) + paddd 0x30(%rsp),%xmm3 + movdqa %xmm3,0x30(%rsp) + + # x4[0-3] += s1[0] + # x5[0-3] += s1[1] + movq 0x10(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + # x6[0-3] += s1[2] + # x7[0-3] += s1[3] + movq 0x18(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + + # x8[0-3] += s2[0] + # x9[0-3] += s2[1] + movq 0x20(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm8 + paddd %xmm3,%xmm9 + # x10[0-3] += s2[2] + # x11[0-3] += s2[3] + movq 0x28(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm10 + paddd %xmm3,%xmm11 + + # x12[0-3] += s3[0] + # x13[0-3] += s3[1] + movq 0x30(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm12 + paddd %xmm3,%xmm13 + # x14[0-3] += s3[2] + # x15[0-3] += s3[3] + movq 0x38(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm14 + paddd %xmm3,%xmm15 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + + # interleave 32-bit words in state n, n+1 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x10(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x10(%rsp) + movdqa 0x20(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x20(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpckldq %xmm5,%xmm4 + punpckhdq %xmm5,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm6,%xmm0 + punpckldq %xmm7,%xmm6 + punpckhdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm9,%xmm0 + movdqa %xmm0,%xmm9 + movdqa %xmm10,%xmm0 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpckldq %xmm13,%xmm12 + punpckhdq %xmm13,%xmm0 + movdqa %xmm0,%xmm13 + movdqa %xmm14,%xmm0 + punpckldq %xmm15,%xmm14 + punpckhdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # interleave 64-bit words in state n, n+2 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x20(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x20(%rsp) + movdqa 0x10(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x10(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpcklqdq %xmm6,%xmm4 + punpckhqdq %xmm6,%xmm0 + movdqa %xmm0,%xmm6 + movdqa %xmm5,%xmm0 + punpcklqdq %xmm7,%xmm5 + punpckhqdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpcklqdq %xmm10,%xmm8 + punpckhqdq %xmm10,%xmm0 + movdqa %xmm0,%xmm10 + movdqa %xmm9,%xmm0 + punpcklqdq %xmm11,%xmm9 + punpckhqdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpcklqdq %xmm14,%xmm12 + punpckhqdq %xmm14,%xmm0 + movdqa %xmm0,%xmm14 + movdqa %xmm13,%xmm0 + punpcklqdq %xmm15,%xmm13 + punpckhqdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # xor with corresponding input, write to output + movdqa 0x00(%rsp),%xmm0 + movdqu 0x00(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x00(%rsi) + movdqa 0x10(%rsp),%xmm0 + movdqu 0x80(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x80(%rsi) + movdqa 0x20(%rsp),%xmm0 + movdqu 0x40(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x40(%rsi) + movdqa 0x30(%rsp),%xmm0 + movdqu 0xc0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xc0(%rsi) + movdqu 0x10(%rdx),%xmm1 + pxor %xmm1,%xmm4 + movdqu %xmm4,0x10(%rsi) + movdqu 0x90(%rdx),%xmm1 + pxor %xmm1,%xmm5 + movdqu %xmm5,0x90(%rsi) + movdqu 0x50(%rdx),%xmm1 + pxor %xmm1,%xmm6 + movdqu %xmm6,0x50(%rsi) + movdqu 0xd0(%rdx),%xmm1 + pxor %xmm1,%xmm7 + movdqu %xmm7,0xd0(%rsi) + movdqu 0x20(%rdx),%xmm1 + pxor %xmm1,%xmm8 + movdqu %xmm8,0x20(%rsi) + movdqu 0xa0(%rdx),%xmm1 + pxor %xmm1,%xmm9 + movdqu %xmm9,0xa0(%rsi) + movdqu 0x60(%rdx),%xmm1 + pxor %xmm1,%xmm10 + movdqu %xmm10,0x60(%rsi) + movdqu 0xe0(%rdx),%xmm1 + pxor %xmm1,%xmm11 + movdqu %xmm11,0xe0(%rsi) + movdqu 0x30(%rdx),%xmm1 + pxor %xmm1,%xmm12 + movdqu %xmm12,0x30(%rsi) + movdqu 0xb0(%rdx),%xmm1 + pxor %xmm1,%xmm13 + movdqu %xmm13,0xb0(%rsi) + movdqu 0x70(%rdx),%xmm1 + pxor %xmm1,%xmm14 + movdqu %xmm14,0x70(%rsi) + movdqu 0xf0(%rdx),%xmm1 + pxor %xmm1,%xmm15 + movdqu %xmm15,0xf0(%rsi) + + add $0x40,%rsp + ret +ENDPROC(chacha20_4block_xor_ssse3) diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c new file mode 100644 index 000000000000..effe2160b7c5 --- /dev/null +++ b/arch/x86/crypto/chacha20_glue.c @@ -0,0 +1,150 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <crypto/algapi.h> +#include <crypto/chacha20.h> +#include <linux/crypto.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/simd.h> + +#define CHACHA20_STATE_ALIGN 16 + +asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); +asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); +#ifdef CONFIG_AS_AVX2 +asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src); +static bool chacha20_use_avx2; +#endif + +static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes) +{ + u8 buf[CHACHA20_BLOCK_SIZE]; + +#ifdef CONFIG_AS_AVX2 + if (chacha20_use_avx2) { + while (bytes >= CHACHA20_BLOCK_SIZE * 8) { + chacha20_8block_xor_avx2(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE * 8; + src += CHACHA20_BLOCK_SIZE * 8; + dst += CHACHA20_BLOCK_SIZE * 8; + state[12] += 8; + } + } +#endif + while (bytes >= CHACHA20_BLOCK_SIZE * 4) { + chacha20_4block_xor_ssse3(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE * 4; + src += CHACHA20_BLOCK_SIZE * 4; + dst += CHACHA20_BLOCK_SIZE * 4; + state[12] += 4; + } + while (bytes >= CHACHA20_BLOCK_SIZE) { + chacha20_block_xor_ssse3(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE; + src += CHACHA20_BLOCK_SIZE; + dst += CHACHA20_BLOCK_SIZE; + state[12]++; + } + if (bytes) { + memcpy(buf, src, bytes); + chacha20_block_xor_ssse3(state, buf, buf); + memcpy(dst, buf, bytes); + } +} + +static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + u32 *state, state_buf[16 + (CHACHA20_STATE_ALIGN / sizeof(u32)) - 1]; + struct blkcipher_walk walk; + int err; + + if (!may_use_simd()) + return crypto_chacha20_crypt(desc, dst, src, nbytes); + + state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN); + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE); + + crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv); + + kernel_fpu_begin(); + + while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { + chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, + rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); + err = blkcipher_walk_done(desc, &walk, + walk.nbytes % CHACHA20_BLOCK_SIZE); + } + + if (walk.nbytes) { + chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes); + err = blkcipher_walk_done(desc, &walk, 0); + } + + kernel_fpu_end(); + + return err; +} + +static struct crypto_alg alg = { + .cra_name = "chacha20", + .cra_driver_name = "chacha20-simd", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_type = &crypto_blkcipher_type, + .cra_ctxsize = sizeof(struct chacha20_ctx), + .cra_alignmask = sizeof(u32) - 1, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CHACHA20_KEY_SIZE, + .max_keysize = CHACHA20_KEY_SIZE, + .ivsize = CHACHA20_IV_SIZE, + .geniv = "seqiv", + .setkey = crypto_chacha20_setkey, + .encrypt = chacha20_simd, + .decrypt = chacha20_simd, + }, + }, +}; + +static int __init chacha20_simd_mod_init(void) +{ + if (!cpu_has_ssse3) + return -ENODEV; + +#ifdef CONFIG_AS_AVX2 + chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 && + cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL); +#endif + return crypto_register_alg(&alg); +} + +static void __exit chacha20_simd_mod_fini(void) +{ + crypto_unregister_alg(&alg); +} + +module_init(chacha20_simd_mod_init); +module_exit(chacha20_simd_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); +MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated"); +MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-simd"); diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S new file mode 100644 index 000000000000..eff2f414e22b --- /dev/null +++ b/arch/x86/crypto/poly1305-avx2-x86_64.S @@ -0,0 +1,386 @@ +/* + * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> + +.data +.align 32 + +ANMASK: .octa 0x0000000003ffffff0000000003ffffff + .octa 0x0000000003ffffff0000000003ffffff +ORMASK: .octa 0x00000000010000000000000001000000 + .octa 0x00000000010000000000000001000000 + +.text + +#define h0 0x00(%rdi) +#define h1 0x04(%rdi) +#define h2 0x08(%rdi) +#define h3 0x0c(%rdi) +#define h4 0x10(%rdi) +#define r0 0x00(%rdx) +#define r1 0x04(%rdx) +#define r2 0x08(%rdx) +#define r3 0x0c(%rdx) +#define r4 0x10(%rdx) +#define u0 0x00(%r8) +#define u1 0x04(%r8) +#define u2 0x08(%r8) +#define u3 0x0c(%r8) +#define u4 0x10(%r8) +#define w0 0x14(%r8) +#define w1 0x18(%r8) +#define w2 0x1c(%r8) +#define w3 0x20(%r8) +#define w4 0x24(%r8) +#define y0 0x28(%r8) +#define y1 0x2c(%r8) +#define y2 0x30(%r8) +#define y3 0x34(%r8) +#define y4 0x38(%r8) +#define m %rsi +#define hc0 %ymm0 +#define hc1 %ymm1 +#define hc2 %ymm2 +#define hc3 %ymm3 +#define hc4 %ymm4 +#define hc0x %xmm0 +#define hc1x %xmm1 +#define hc2x %xmm2 +#define hc3x %xmm3 +#define hc4x %xmm4 +#define t1 %ymm5 +#define t2 %ymm6 +#define t1x %xmm5 +#define t2x %xmm6 +#define ruwy0 %ymm7 +#define ruwy1 %ymm8 +#define ruwy2 %ymm9 +#define ruwy3 %ymm10 +#define ruwy4 %ymm11 +#define ruwy0x %xmm7 +#define ruwy1x %xmm8 +#define ruwy2x %xmm9 +#define ruwy3x %xmm10 +#define ruwy4x %xmm11 +#define svxz1 %ymm12 +#define svxz2 %ymm13 +#define svxz3 %ymm14 +#define svxz4 %ymm15 +#define d0 %r9 +#define d1 %r10 +#define d2 %r11 +#define d3 %r12 +#define d4 %r13 + +ENTRY(poly1305_4block_avx2) + # %rdi: Accumulator h[5] + # %rsi: 64 byte input block m + # %rdx: Poly1305 key r[5] + # %rcx: Quadblock count + # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5], + + # This four-block variant uses loop unrolled block processing. It + # requires 4 Poly1305 keys: r, r^2, r^3 and r^4: + # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r + + vzeroupper + push %rbx + push %r12 + push %r13 + + # combine r0,u0,w0,y0 + vmovd y0,ruwy0x + vmovd w0,t1x + vpunpcklqdq t1,ruwy0,ruwy0 + vmovd u0,t1x + vmovd r0,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy0,ruwy0 + + # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5 + vmovd y1,ruwy1x + vmovd w1,t1x + vpunpcklqdq t1,ruwy1,ruwy1 + vmovd u1,t1x + vmovd r1,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy1,ruwy1 + vpslld $2,ruwy1,svxz1 + vpaddd ruwy1,svxz1,svxz1 + + # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5 + vmovd y2,ruwy2x + vmovd w2,t1x + vpunpcklqdq t1,ruwy2,ruwy2 + vmovd u2,t1x + vmovd r2,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy2,ruwy2 + vpslld $2,ruwy2,svxz2 + vpaddd ruwy2,svxz2,svxz2 + + # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5 + vmovd y3,ruwy3x + vmovd w3,t1x + vpunpcklqdq t1,ruwy3,ruwy3 + vmovd u3,t1x + vmovd r3,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy3,ruwy3 + vpslld $2,ruwy3,svxz3 + vpaddd ruwy3,svxz3,svxz3 + + # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5 + vmovd y4,ruwy4x + vmovd w4,t1x + vpunpcklqdq t1,ruwy4,ruwy4 + vmovd u4,t1x + vmovd r4,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy4,ruwy4 + vpslld $2,ruwy4,svxz4 + vpaddd ruwy4,svxz4,svxz4 + +.Ldoblock4: + # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff, + # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0] + vmovd 0x00(m),hc0x + vmovd 0x10(m),t1x + vpunpcklqdq t1,hc0,hc0 + vmovd 0x20(m),t1x + vmovd 0x30(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc0,hc0 + vpand ANMASK(%rip),hc0,hc0 + vmovd h0,t1x + vpaddd t1,hc0,hc0 + # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff, + # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1] + vmovd 0x03(m),hc1x + vmovd 0x13(m),t1x + vpunpcklqdq t1,hc1,hc1 + vmovd 0x23(m),t1x + vmovd 0x33(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc1,hc1 + vpsrld $2,hc1,hc1 + vpand ANMASK(%rip),hc1,hc1 + vmovd h1,t1x + vpaddd t1,hc1,hc1 + # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff, + # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2] + vmovd 0x06(m),hc2x + vmovd 0x16(m),t1x + vpunpcklqdq t1,hc2,hc2 + vmovd 0x26(m),t1x + vmovd 0x36(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc2,hc2 + vpsrld $4,hc2,hc2 + vpand ANMASK(%rip),hc2,hc2 + vmovd h2,t1x + vpaddd t1,hc2,hc2 + # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff, + # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3] + vmovd 0x09(m),hc3x + vmovd 0x19(m),t1x + vpunpcklqdq t1,hc3,hc3 + vmovd 0x29(m),t1x + vmovd 0x39(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc3,hc3 + vpsrld $6,hc3,hc3 + vpand ANMASK(%rip),hc3,hc3 + vmovd h3,t1x + vpaddd t1,hc3,hc3 + # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24), + # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4] + vmovd 0x0c(m),hc4x + vmovd 0x1c(m),t1x + vpunpcklqdq t1,hc4,hc4 + vmovd 0x2c(m),t1x + vmovd 0x3c(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc4,hc4 + vpsrld $8,hc4,hc4 + vpor ORMASK(%rip),hc4,hc4 + vmovd h4,t1x + vpaddd t1,hc4,hc4 + + # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ] + vpmuludq hc0,ruwy0,t1 + # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ] + vpmuludq hc1,svxz4,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ] + vpmuludq hc2,svxz3,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ] + vpmuludq hc3,svxz2,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ] + vpmuludq hc4,svxz1,t2 + vpaddq t2,t1,t1 + # d0 = t1[0] + t1[1] + t[2] + t[3] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d0 + + # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ] + vpmuludq hc0,ruwy1,t1 + # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ] + vpmuludq hc1,ruwy0,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ] + vpmuludq hc2,svxz4,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ] + vpmuludq hc3,svxz3,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ] + vpmuludq hc4,svxz2,t2 + vpaddq t2,t1,t1 + # d1 = t1[0] + t1[1] + t1[3] + t1[4] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d1 + + # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ] + vpmuludq hc0,ruwy2,t1 + # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ] + vpmuludq hc1,ruwy1,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ] + vpmuludq hc2,ruwy0,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ] + vpmuludq hc3,svxz4,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ] + vpmuludq hc4,svxz3,t2 + vpaddq t2,t1,t1 + # d2 = t1[0] + t1[1] + t1[2] + t1[3] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d2 + + # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ] + vpmuludq hc0,ruwy3,t1 + # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ] + vpmuludq hc1,ruwy2,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ] + vpmuludq hc2,ruwy1,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ] + vpmuludq hc3,ruwy0,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ] + vpmuludq hc4,svxz4,t2 + vpaddq t2,t1,t1 + # d3 = t1[0] + t1[1] + t1[2] + t1[3] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d3 + + # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ] + vpmuludq hc0,ruwy4,t1 + # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ] + vpmuludq hc1,ruwy3,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ] + vpmuludq hc2,ruwy2,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ] + vpmuludq hc3,ruwy1,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ] + vpmuludq hc4,ruwy0,t2 + vpaddq t2,t1,t1 + # d4 = t1[0] + t1[1] + t1[2] + t1[3] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d4 + + # d1 += d0 >> 26 + mov d0,%rax + shr $26,%rax + add %rax,d1 + # h0 = d0 & 0x3ffffff + mov d0,%rbx + and $0x3ffffff,%ebx + + # d2 += d1 >> 26 + mov d1,%rax + shr $26,%rax + add %rax,d2 + # h1 = d1 & 0x3ffffff + mov d1,%rax + and $0x3ffffff,%eax + mov %eax,h1 + + # d3 += d2 >> 26 + mov d2,%rax + shr $26,%rax + add %rax,d3 + # h2 = d2 & 0x3ffffff + mov d2,%rax + and $0x3ffffff,%eax + mov %eax,h2 + + # d4 += d3 >> 26 + mov d3,%rax + shr $26,%rax + add %rax,d4 + # h3 = d3 & 0x3ffffff + mov d3,%rax + and $0x3ffffff,%eax + mov %eax,h3 + + # h0 += (d4 >> 26) * 5 + mov d4,%rax + shr $26,%rax + lea (%eax,%eax,4),%eax + add %eax,%ebx + # h4 = d4 & 0x3ffffff + mov d4,%rax + and $0x3ffffff,%eax + mov %eax,h4 + + # h1 += h0 >> 26 + mov %ebx,%eax + shr $26,%eax + add %eax,h1 + # h0 = h0 & 0x3ffffff + andl $0x3ffffff,%ebx + mov %ebx,h0 + + add $0x40,m + dec %rcx + jnz .Ldoblock4 + + vzeroupper + pop %r13 + pop %r12 + pop %rbx + ret +ENDPROC(poly1305_4block_avx2) diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S new file mode 100644 index 000000000000..338c748054ed --- /dev/null +++ b/arch/x86/crypto/poly1305-sse2-x86_64.S @@ -0,0 +1,582 @@ +/* + * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> + +.data +.align 16 + +ANMASK: .octa 0x0000000003ffffff0000000003ffffff +ORMASK: .octa 0x00000000010000000000000001000000 + +.text + +#define h0 0x00(%rdi) +#define h1 0x04(%rdi) +#define h2 0x08(%rdi) +#define h3 0x0c(%rdi) +#define h4 0x10(%rdi) +#define r0 0x00(%rdx) +#define r1 0x04(%rdx) +#define r2 0x08(%rdx) +#define r3 0x0c(%rdx) +#define r4 0x10(%rdx) +#define s1 0x00(%rsp) +#define s2 0x04(%rsp) +#define s3 0x08(%rsp) +#define s4 0x0c(%rsp) +#define m %rsi +#define h01 %xmm0 +#define h23 %xmm1 +#define h44 %xmm2 +#define t1 %xmm3 +#define t2 %xmm4 +#define t3 %xmm5 +#define t4 %xmm6 +#define mask %xmm7 +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 +#define d4 %r12 + +ENTRY(poly1305_block_sse2) + # %rdi: Accumulator h[5] + # %rsi: 16 byte input block m + # %rdx: Poly1305 key r[5] + # %rcx: Block count + + # This single block variant tries to improve performance by doing two + # multiplications in parallel using SSE instructions. There is quite + # some quardword packing involved, hence the speedup is marginal. + + push %rbx + push %r12 + sub $0x10,%rsp + + # s1..s4 = r1..r4 * 5 + mov r1,%eax + lea (%eax,%eax,4),%eax + mov %eax,s1 + mov r2,%eax + lea (%eax,%eax,4),%eax + mov %eax,s2 + mov r3,%eax + lea (%eax,%eax,4),%eax + mov %eax,s3 + mov r4,%eax + lea (%eax,%eax,4),%eax + mov %eax,s4 + + movdqa ANMASK(%rip),mask + +.Ldoblock: + # h01 = [0, h1, 0, h0] + # h23 = [0, h3, 0, h2] + # h44 = [0, h4, 0, h4] + movd h0,h01 + movd h1,t1 + movd h2,h23 + movd h3,t2 + movd h4,h44 + punpcklqdq t1,h01 + punpcklqdq t2,h23 + punpcklqdq h44,h44 + + # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ] + movd 0x00(m),t1 + movd 0x03(m),t2 + psrld $2,t2 + punpcklqdq t2,t1 + pand mask,t1 + paddd t1,h01 + # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ] + movd 0x06(m),t1 + movd 0x09(m),t2 + psrld $4,t1 + psrld $6,t2 + punpcklqdq t2,t1 + pand mask,t1 + paddd t1,h23 + # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ] + mov 0x0c(m),%eax + shr $8,%eax + or $0x01000000,%eax + movd %eax,t1 + pshufd $0xc4,t1,t1 + paddd t1,h44 + + # t1[0] = h0 * r0 + h2 * s3 + # t1[1] = h1 * s4 + h3 * s2 + movd r0,t1 + movd s4,t2 + punpcklqdq t2,t1 + pmuludq h01,t1 + movd s3,t2 + movd s2,t3 + punpcklqdq t3,t2 + pmuludq h23,t2 + paddq t2,t1 + # t2[0] = h0 * r1 + h2 * s4 + # t2[1] = h1 * r0 + h3 * s3 + movd r1,t2 + movd r0,t3 + punpcklqdq t3,t2 + pmuludq h01,t2 + movd s4,t3 + movd s3,t4 + punpcklqdq t4,t3 + pmuludq h23,t3 + paddq t3,t2 + # t3[0] = h4 * s1 + # t3[1] = h4 * s2 + movd s1,t3 + movd s2,t4 + punpcklqdq t4,t3 + pmuludq h44,t3 + # d0 = t1[0] + t1[1] + t3[0] + # d1 = t2[0] + t2[1] + t3[1] + movdqa t1,t4 + punpcklqdq t2,t4 + punpckhqdq t2,t1 + paddq t4,t1 + paddq t3,t1 + movq t1,d0 + psrldq $8,t1 + movq t1,d1 + + # t1[0] = h0 * r2 + h2 * r0 + # t1[1] = h1 * r1 + h3 * s4 + movd r2,t1 + movd r1,t2 + punpcklqdq t2,t1 + pmuludq h01,t1 + movd r0,t2 + movd s4,t3 + punpcklqdq t3,t2 + pmuludq h23,t2 + paddq t2,t1 + # t2[0] = h0 * r3 + h2 * r1 + # t2[1] = h1 * r2 + h3 * r0 + movd r3,t2 + movd r2,t3 + punpcklqdq t3,t2 + pmuludq h01,t2 + movd r1,t3 + movd r0,t4 + punpcklqdq t4,t3 + pmuludq h23,t3 + paddq t3,t2 + # t3[0] = h4 * s3 + # t3[1] = h4 * s4 + movd s3,t3 + movd s4,t4 + punpcklqdq t4,t3 + pmuludq h44,t3 + # d2 = t1[0] + t1[1] + t3[0] + # d3 = t2[0] + t2[1] + t3[1] + movdqa t1,t4 + punpcklqdq t2,t4 + punpckhqdq t2,t1 + paddq t4,t1 + paddq t3,t1 + movq t1,d2 + psrldq $8,t1 + movq t1,d3 + + # t1[0] = h0 * r4 + h2 * r2 + # t1[1] = h1 * r3 + h3 * r1 + movd r4,t1 + movd r3,t2 + punpcklqdq t2,t1 + pmuludq h01,t1 + movd r2,t2 + movd r1,t3 + punpcklqdq t3,t2 + pmuludq h23,t2 + paddq t2,t1 + # t3[0] = h4 * r0 + movd r0,t3 + pmuludq h44,t3 + # d4 = t1[0] + t1[1] + t3[0] + movdqa t1,t4 + psrldq $8,t4 + paddq t4,t1 + paddq t3,t1 + movq t1,d4 + + # d1 += d0 >> 26 + mov d0,%rax + shr $26,%rax + add %rax,d1 + # h0 = d0 & 0x3ffffff + mov d0,%rbx + and $0x3ffffff,%ebx + + # d2 += d1 >> 26 + mov d1,%rax + shr $26,%rax + add %rax,d2 + # h1 = d1 & 0x3ffffff + mov d1,%rax + and $0x3ffffff,%eax + mov %eax,h1 + + # d3 += d2 >> 26 + mov d2,%rax + shr $26,%rax + add %rax,d3 + # h2 = d2 & 0x3ffffff + mov d2,%rax + and $0x3ffffff,%eax + mov %eax,h2 + + # d4 += d3 >> 26 + mov d3,%rax + shr $26,%rax + add %rax,d4 + # h3 = d3 & 0x3ffffff + mov d3,%rax + and $0x3ffffff,%eax + mov %eax,h3 + + # h0 += (d4 >> 26) * 5 + mov d4,%rax + shr $26,%rax + lea (%eax,%eax,4),%eax + add %eax,%ebx + # h4 = d4 & 0x3ffffff + mov d4,%rax + and $0x3ffffff,%eax + mov %eax,h4 + + # h1 += h0 >> 26 + mov %ebx,%eax + shr $26,%eax + add %eax,h1 + # h0 = h0 & 0x3ffffff + andl $0x3ffffff,%ebx + mov %ebx,h0 + + add $0x10,m + dec %rcx + jnz .Ldoblock + + add $0x10,%rsp + pop %r12 + pop %rbx + ret +ENDPROC(poly1305_block_sse2) + + +#define u0 0x00(%r8) +#define u1 0x04(%r8) +#define u2 0x08(%r8) +#define u3 0x0c(%r8) +#define u4 0x10(%r8) +#define hc0 %xmm0 +#define hc1 %xmm1 +#define hc2 %xmm2 +#define hc3 %xmm5 +#define hc4 %xmm6 +#define ru0 %xmm7 +#define ru1 %xmm8 +#define ru2 %xmm9 +#define ru3 %xmm10 +#define ru4 %xmm11 +#define sv1 %xmm12 +#define sv2 %xmm13 +#define sv3 %xmm14 +#define sv4 %xmm15 +#undef d0 +#define d0 %r13 + +ENTRY(poly1305_2block_sse2) + # %rdi: Accumulator h[5] + # %rsi: 16 byte input block m + # %rdx: Poly1305 key r[5] + # %rcx: Doubleblock count + # %r8: Poly1305 derived key r^2 u[5] + + # This two-block variant further improves performance by using loop + # unrolled block processing. This is more straight forward and does + # less byte shuffling, but requires a second Poly1305 key r^2: + # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r + + push %rbx + push %r12 + push %r13 + + # combine r0,u0 + movd u0,ru0 + movd r0,t1 + punpcklqdq t1,ru0 + + # combine r1,u1 and s1=r1*5,v1=u1*5 + movd u1,ru1 + movd r1,t1 + punpcklqdq t1,ru1 + movdqa ru1,sv1 + pslld $2,sv1 + paddd ru1,sv1 + + # combine r2,u2 and s2=r2*5,v2=u2*5 + movd u2,ru2 + movd r2,t1 + punpcklqdq t1,ru2 + movdqa ru2,sv2 + pslld $2,sv2 + paddd ru2,sv2 + + # combine r3,u3 and s3=r3*5,v3=u3*5 + movd u3,ru3 + movd r3,t1 + punpcklqdq t1,ru3 + movdqa ru3,sv3 + pslld $2,sv3 + paddd ru3,sv3 + + # combine r4,u4 and s4=r4*5,v4=u4*5 + movd u4,ru4 + movd r4,t1 + punpcklqdq t1,ru4 + movdqa ru4,sv4 + pslld $2,sv4 + paddd ru4,sv4 + +.Ldoblock2: + # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ] + movd 0x00(m),hc0 + movd 0x10(m),t1 + punpcklqdq t1,hc0 + pand ANMASK(%rip),hc0 + movd h0,t1 + paddd t1,hc0 + # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ] + movd 0x03(m),hc1 + movd 0x13(m),t1 + punpcklqdq t1,hc1 + psrld $2,hc1 + pand ANMASK(%rip),hc1 + movd h1,t1 + paddd t1,hc1 + # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ] + movd 0x06(m),hc2 + movd 0x16(m),t1 + punpcklqdq t1,hc2 + psrld $4,hc2 + pand ANMASK(%rip),hc2 + movd h2,t1 + paddd t1,hc2 + # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ] + movd 0x09(m),hc3 + movd 0x19(m),t1 + punpcklqdq t1,hc3 + psrld $6,hc3 + pand ANMASK(%rip),hc3 + movd h3,t1 + paddd t1,hc3 + # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ] + movd 0x0c(m),hc4 + movd 0x1c(m),t1 + punpcklqdq t1,hc4 + psrld $8,hc4 + por ORMASK(%rip),hc4 + movd h4,t1 + paddd t1,hc4 + + # t1 = [ hc0[1] * r0, hc0[0] * u0 ] + movdqa ru0,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * s4, hc1[0] * v4 ] + movdqa sv4,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * s3, hc2[0] * v3 ] + movdqa sv3,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * s2, hc3[0] * v2 ] + movdqa sv2,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * s1, hc4[0] * v1 ] + movdqa sv1,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d0 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d0 + + # t1 = [ hc0[1] * r1, hc0[0] * u1 ] + movdqa ru1,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * r0, hc1[0] * u0 ] + movdqa ru0,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * s4, hc2[0] * v4 ] + movdqa sv4,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * s3, hc3[0] * v3 ] + movdqa sv3,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * s2, hc4[0] * v2 ] + movdqa sv2,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d1 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d1 + + # t1 = [ hc0[1] * r2, hc0[0] * u2 ] + movdqa ru2,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * r1, hc1[0] * u1 ] + movdqa ru1,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * r0, hc2[0] * u0 ] + movdqa ru0,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * s4, hc3[0] * v4 ] + movdqa sv4,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * s3, hc4[0] * v3 ] + movdqa sv3,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d2 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d2 + + # t1 = [ hc0[1] * r3, hc0[0] * u3 ] + movdqa ru3,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * r2, hc1[0] * u2 ] + movdqa ru2,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * r1, hc2[0] * u1 ] + movdqa ru1,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * r0, hc3[0] * u0 ] + movdqa ru0,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * s4, hc4[0] * v4 ] + movdqa sv4,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d3 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d3 + + # t1 = [ hc0[1] * r4, hc0[0] * u4 ] + movdqa ru4,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * r3, hc1[0] * u3 ] + movdqa ru3,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * r2, hc2[0] * u2 ] + movdqa ru2,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * r1, hc3[0] * u1 ] + movdqa ru1,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * r0, hc4[0] * u0 ] + movdqa ru0,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d4 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d4 + + # d1 += d0 >> 26 + mov d0,%rax + shr $26,%rax + add %rax,d1 + # h0 = d0 & 0x3ffffff + mov d0,%rbx + and $0x3ffffff,%ebx + + # d2 += d1 >> 26 + mov d1,%rax + shr $26,%rax + add %rax,d2 + # h1 = d1 & 0x3ffffff + mov d1,%rax + and $0x3ffffff,%eax + mov %eax,h1 + + # d3 += d2 >> 26 + mov d2,%rax + shr $26,%rax + add %rax,d3 + # h2 = d2 & 0x3ffffff + mov d2,%rax + and $0x3ffffff,%eax + mov %eax,h2 + + # d4 += d3 >> 26 + mov d3,%rax + shr $26,%rax + add %rax,d4 + # h3 = d3 & 0x3ffffff + mov d3,%rax + and $0x3ffffff,%eax + mov %eax,h3 + + # h0 += (d4 >> 26) * 5 + mov d4,%rax + shr $26,%rax + lea (%eax,%eax,4),%eax + add %eax,%ebx + # h4 = d4 & 0x3ffffff + mov d4,%rax + and $0x3ffffff,%eax + mov %eax,h4 + + # h1 += h0 >> 26 + mov %ebx,%eax + shr $26,%eax + add %eax,h1 + # h0 = h0 & 0x3ffffff + andl $0x3ffffff,%ebx + mov %ebx,h0 + + add $0x20,m + dec %rcx + jnz .Ldoblock2 + + pop %r13 + pop %r12 + pop %rbx + ret +ENDPROC(poly1305_2block_sse2) diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c new file mode 100644 index 000000000000..f7170d764f32 --- /dev/null +++ b/arch/x86/crypto/poly1305_glue.c @@ -0,0 +1,207 @@ +/* + * Poly1305 authenticator algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <crypto/algapi.h> +#include <crypto/internal/hash.h> +#include <crypto/poly1305.h> +#include <linux/crypto.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/simd.h> + +struct poly1305_simd_desc_ctx { + struct poly1305_desc_ctx base; + /* derived key u set? */ + bool uset; +#ifdef CONFIG_AS_AVX2 + /* derived keys r^3, r^4 set? */ + bool wset; +#endif + /* derived Poly1305 key r^2 */ + u32 u[5]; + /* ... silently appended r^3 and r^4 when using AVX2 */ +}; + +asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src, + const u32 *r, unsigned int blocks); +asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r, + unsigned int blocks, const u32 *u); +#ifdef CONFIG_AS_AVX2 +asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r, + unsigned int blocks, const u32 *u); +static bool poly1305_use_avx2; +#endif + +static int poly1305_simd_init(struct shash_desc *desc) +{ + struct poly1305_simd_desc_ctx *sctx = shash_desc_ctx(desc); + + sctx->uset = false; +#ifdef CONFIG_AS_AVX2 + sctx->wset = false; +#endif + + return crypto_poly1305_init(desc); +} + +static void poly1305_simd_mult(u32 *a, const u32 *b) +{ + u8 m[POLY1305_BLOCK_SIZE]; + + memset(m, 0, sizeof(m)); + /* The poly1305 block function adds a hi-bit to the accumulator which + * we don't need for key multiplication; compensate for it. */ + a[4] -= 1 << 24; + poly1305_block_sse2(a, m, b, 1); +} + +static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, + const u8 *src, unsigned int srclen) +{ + struct poly1305_simd_desc_ctx *sctx; + unsigned int blocks, datalen; + + BUILD_BUG_ON(offsetof(struct poly1305_simd_desc_ctx, base)); + sctx = container_of(dctx, struct poly1305_simd_desc_ctx, base); + + if (unlikely(!dctx->sset)) { + datalen = crypto_poly1305_setdesckey(dctx, src, srclen); + src += srclen - datalen; + srclen = datalen; + } + +#ifdef CONFIG_AS_AVX2 + if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) { + if (unlikely(!sctx->wset)) { + if (!sctx->uset) { + memcpy(sctx->u, dctx->r, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u, dctx->r); + sctx->uset = true; + } + memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u + 5, dctx->r); + memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u + 10, dctx->r); + sctx->wset = true; + } + blocks = srclen / (POLY1305_BLOCK_SIZE * 4); + poly1305_4block_avx2(dctx->h, src, dctx->r, blocks, sctx->u); + src += POLY1305_BLOCK_SIZE * 4 * blocks; + srclen -= POLY1305_BLOCK_SIZE * 4 * blocks; + } +#endif + if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) { + if (unlikely(!sctx->uset)) { + memcpy(sctx->u, dctx->r, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u, dctx->r); + sctx->uset = true; + } + blocks = srclen / (POLY1305_BLOCK_SIZE * 2); + poly1305_2block_sse2(dctx->h, src, dctx->r, blocks, sctx->u); + src += POLY1305_BLOCK_SIZE * 2 * blocks; + srclen -= POLY1305_BLOCK_SIZE * 2 * blocks; + } + if (srclen >= POLY1305_BLOCK_SIZE) { + poly1305_block_sse2(dctx->h, src, dctx->r, 1); + srclen -= POLY1305_BLOCK_SIZE; + } + return srclen; +} + +static int poly1305_simd_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + unsigned int bytes; + + /* kernel_fpu_begin/end is costly, use fallback for small updates */ + if (srclen <= 288 || !may_use_simd()) + return crypto_poly1305_update(desc, src, srclen); + + kernel_fpu_begin(); + + if (unlikely(dctx->buflen)) { + bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); + memcpy(dctx->buf + dctx->buflen, src, bytes); + src += bytes; + srclen -= bytes; + dctx->buflen += bytes; + + if (dctx->buflen == POLY1305_BLOCK_SIZE) { + poly1305_simd_blocks(dctx, dctx->buf, + POLY1305_BLOCK_SIZE); + dctx->buflen = 0; + } + } + + if (likely(srclen >= POLY1305_BLOCK_SIZE)) { + bytes = poly1305_simd_blocks(dctx, src, srclen); + src += srclen - bytes; + srclen = bytes; + } + + kernel_fpu_end(); + + if (unlikely(srclen)) { + dctx->buflen = srclen; + memcpy(dctx->buf, src, srclen); + } + + return 0; +} + +static struct shash_alg alg = { + .digestsize = POLY1305_DIGEST_SIZE, + .init = poly1305_simd_init, + .update = poly1305_simd_update, + .final = crypto_poly1305_final, + .setkey = crypto_poly1305_setkey, + .descsize = sizeof(struct poly1305_simd_desc_ctx), + .base = { + .cra_name = "poly1305", + .cra_driver_name = "poly1305-simd", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_alignmask = sizeof(u32) - 1, + .cra_blocksize = POLY1305_BLOCK_SIZE, + .cra_module = THIS_MODULE, + }, +}; + +static int __init poly1305_simd_mod_init(void) +{ + if (!cpu_has_xmm2) + return -ENODEV; + +#ifdef CONFIG_AS_AVX2 + poly1305_use_avx2 = cpu_has_avx && cpu_has_avx2 && + cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL); + alg.descsize = sizeof(struct poly1305_simd_desc_ctx); + if (poly1305_use_avx2) + alg.descsize += 10 * sizeof(u32); +#endif + return crypto_register_shash(&alg); +} + +static void __exit poly1305_simd_mod_exit(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(poly1305_simd_mod_init); +module_exit(poly1305_simd_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); +MODULE_DESCRIPTION("Poly1305 authenticator"); +MODULE_ALIAS_CRYPTO("poly1305"); +MODULE_ALIAS_CRYPTO("poly1305-simd"); diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index 9686c3d9ff73..259a7c1ef709 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -21,7 +21,7 @@ * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective * compiler switches. */ -static inline unsigned int __arch_hweight32(unsigned int w) +static __always_inline unsigned int __arch_hweight32(unsigned int w) { unsigned int res = 0; @@ -42,20 +42,23 @@ static inline unsigned int __arch_hweight8(unsigned int w) return __arch_hweight32(w & 0xff); } +#ifdef CONFIG_X86_32 static inline unsigned long __arch_hweight64(__u64 w) { - unsigned long res = 0; - -#ifdef CONFIG_X86_32 return __arch_hweight32((u32)w) + __arch_hweight32((u32)(w >> 32)); +} #else +static __always_inline unsigned long __arch_hweight64(__u64 w) +{ + unsigned long res = 0; + asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT) : "="REG_OUT (res) : REG_IN (w)); -#endif /* CONFIG_X86_32 */ return res; } +#endif /* CONFIG_X86_32 */ #endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 49ec9038ec14..c12e845f59e6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -252,6 +252,11 @@ struct kvm_pio_request { int size; }; +struct rsvd_bits_validate { + u64 rsvd_bits_mask[2][4]; + u64 bad_mt_xwr; +}; + /* * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level * 32-bit). The kvm_mmu structure abstracts the details of the current mmu @@ -289,8 +294,15 @@ struct kvm_mmu { u64 *pae_root; u64 *lm_root; - u64 rsvd_bits_mask[2][4]; - u64 bad_mt_xwr; + + /* + * check zero bits on shadow page table entries, these + * bits include not only hardware reserved bits but also + * the bits spte never used. + */ + struct rsvd_bits_validate shadow_zero_check; + + struct rsvd_bits_validate guest_rsvd_check; /* * Bitmap: bit set = last pte in walk @@ -358,6 +370,11 @@ struct kvm_mtrr { struct list_head head; }; +/* Hyper-V per vcpu emulation context */ +struct kvm_vcpu_hv { + u64 hv_vapic; +}; + struct kvm_vcpu_arch { /* * rip and regs accesses must go through @@ -514,8 +531,7 @@ struct kvm_vcpu_arch { /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; - /* fields used by HYPER-V emulation */ - u64 hv_vapic; + struct kvm_vcpu_hv hyperv; cpumask_var_t wbinvd_dirty_mask; @@ -586,6 +602,17 @@ struct kvm_apic_map { struct kvm_lapic *logical_map[16][16]; }; +/* Hyper-V emulation context */ +struct kvm_hv { + u64 hv_guest_os_id; + u64 hv_hypercall; + u64 hv_tsc_page; + + /* Hyper-v based guest crash (NT kernel bugcheck) parameters */ + u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; + u64 hv_crash_ctl; +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; @@ -645,16 +672,14 @@ struct kvm_arch { /* reads protected by irq_srcu, writes by irq_lock */ struct hlist_head mask_notifier_list; - /* fields used by HYPER-V emulation */ - u64 hv_guest_os_id; - u64 hv_hypercall; - u64 hv_tsc_page; + struct kvm_hv hyperv; #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; #endif bool boot_vcpu_runs_old_kvmclock; + u32 bsp_vcpu_id; u64 disabled_quirks; }; @@ -1203,5 +1228,7 @@ int __x86_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem); int x86_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem); +bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); +bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index c163215abb9a..aaf59b7da98a 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -7,6 +7,7 @@ struct ms_hyperv_info { u32 features; + u32 misc_features; u32 hints; }; @@ -20,4 +21,8 @@ void hyperv_vector_handler(struct pt_regs *regs); void hv_setup_vmbus_irq(void (*handler)(void)); void hv_remove_vmbus_irq(void); +void hv_setup_kexec_handler(void (*handler)(void)); +void hv_remove_kexec_handler(void); +void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)); +void hv_remove_crash_handler(void); #endif diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 164e3f8d3c3d..fa1195dae425 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -93,8 +93,6 @@ extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); -extern bool mp_should_keep_irq(struct device *dev); - struct pci_raw_ops { int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index da772edd19ab..448b7ca61aee 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -47,6 +47,7 @@ #define CPU_BASED_MOV_DR_EXITING 0x00800000 #define CPU_BASED_UNCOND_IO_EXITING 0x01000000 #define CPU_BASED_USE_IO_BITMAPS 0x02000000 +#define CPU_BASED_MONITOR_TRAP_FLAG 0x08000000 #define CPU_BASED_USE_MSR_BITMAPS 0x10000000 #define CPU_BASED_MONITOR_EXITING 0x20000000 #define CPU_BASED_PAUSE_EXITING 0x40000000 @@ -367,29 +368,29 @@ enum vmcs_field { #define TYPE_PHYSICAL_APIC_EVENT (10 << 12) #define TYPE_PHYSICAL_APIC_INST (15 << 12) -/* segment AR */ -#define SEGMENT_AR_L_MASK (1 << 13) - -#define AR_TYPE_ACCESSES_MASK 1 -#define AR_TYPE_READABLE_MASK (1 << 1) -#define AR_TYPE_WRITEABLE_MASK (1 << 2) -#define AR_TYPE_CODE_MASK (1 << 3) -#define AR_TYPE_MASK 0x0f -#define AR_TYPE_BUSY_64_TSS 11 -#define AR_TYPE_BUSY_32_TSS 11 -#define AR_TYPE_BUSY_16_TSS 3 -#define AR_TYPE_LDT 2 - -#define AR_UNUSABLE_MASK (1 << 16) -#define AR_S_MASK (1 << 4) -#define AR_P_MASK (1 << 7) -#define AR_L_MASK (1 << 13) -#define AR_DB_MASK (1 << 14) -#define AR_G_MASK (1 << 15) -#define AR_DPL_SHIFT 5 -#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3) - -#define AR_RESERVD_MASK 0xfffe0f00 +/* segment AR in VMCS -- these are different from what LAR reports */ +#define VMX_SEGMENT_AR_L_MASK (1 << 13) + +#define VMX_AR_TYPE_ACCESSES_MASK 1 +#define VMX_AR_TYPE_READABLE_MASK (1 << 1) +#define VMX_AR_TYPE_WRITEABLE_MASK (1 << 2) +#define VMX_AR_TYPE_CODE_MASK (1 << 3) +#define VMX_AR_TYPE_MASK 0x0f +#define VMX_AR_TYPE_BUSY_64_TSS 11 +#define VMX_AR_TYPE_BUSY_32_TSS 11 +#define VMX_AR_TYPE_BUSY_16_TSS 3 +#define VMX_AR_TYPE_LDT 2 + +#define VMX_AR_UNUSABLE_MASK (1 << 16) +#define VMX_AR_S_MASK (1 << 4) +#define VMX_AR_P_MASK (1 << 7) +#define VMX_AR_L_MASK (1 << 13) +#define VMX_AR_DB_MASK (1 << 14) +#define VMX_AR_G_MASK (1 << 15) +#define VMX_AR_DPL_SHIFT 5 +#define VMX_AR_DPL(ar) (((ar) >> VMX_AR_DPL_SHIFT) & 3) + +#define VMX_AR_RESERVD_MASK 0xfffe0f00 #define TSS_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 0) #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 1) diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index f36d56bd7632..f0412c50c47b 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -27,6 +27,8 @@ #define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) /* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ #define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) +/* Partition reference TSC MSR is available */ +#define HV_X64_MSR_REFERENCE_TSC_AVAILABLE (1 << 9) /* A partition's reference time stamp counter (TSC) page */ #define HV_X64_MSR_REFERENCE_TSC 0x40000021 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 1fe92181ee9e..37fee272618f 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -58,6 +58,7 @@ #define EXIT_REASON_INVALID_STATE 33 #define EXIT_REASON_MSR_LOAD_FAIL 34 #define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MONITOR_TRAP_FLAG 37 #define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 #define EXIT_REASON_MCE_DURING_VMENTRY 41 @@ -106,6 +107,7 @@ { EXIT_REASON_MSR_READ, "MSR_READ" }, \ { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ + { EXIT_REASON_MONITOR_TRAP_FLAG, "MONITOR_TRAP_FLAG" }, \ { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index dcb52850a28f..cde732c1b495 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1424,7 +1424,7 @@ static inline void __x2apic_disable(void) { u64 msr; - if (cpu_has_apic) + if (!cpu_has_apic) return; rdmsrl(MSR_IA32_APICBASE, msr); @@ -1483,10 +1483,13 @@ void x2apic_setup(void) static __init void x2apic_disable(void) { - u32 x2apic_id; + u32 x2apic_id, state = x2apic_state; - if (x2apic_state != X2APIC_ON) - goto out; + x2apic_mode = 0; + x2apic_state = X2APIC_DISABLED; + + if (state != X2APIC_ON) + return; x2apic_id = read_apic_id(); if (x2apic_id >= 255) @@ -1494,9 +1497,6 @@ static __init void x2apic_disable(void) __x2apic_disable(); register_lapic_address(mp_lapic_addr); -out: - x2apic_state = X2APIC_DISABLED; - x2apic_mode = 0; } static __init void x2apic_enable(void) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index df919ff103c3..3d6b5269fb2e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -54,9 +54,9 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); #define rcu_dereference_check_mce(p) \ ({ \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&mce_chrdev_read_mutex), \ - "suspicious rcu_dereference_check_mce() usage"); \ + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&mce_chrdev_read_mutex), \ + "suspicious rcu_dereference_check_mce() usage"); \ smp_load_acquire(&(p)); \ }) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 6236a54a63f4..3c986390058a 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -377,17 +377,16 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif) return err; } -static int mc_device_remove(struct device *dev, struct subsys_interface *sif) +static void mc_device_remove(struct device *dev, struct subsys_interface *sif) { int cpu = dev->id; if (!cpu_online(cpu)) - return 0; + return; pr_debug("CPU%d removed\n", cpu); microcode_fini_cpu(cpu); sysfs_remove_group(&dev->kobj, &mc_attr_group); - return 0; } static struct subsys_interface mc_cpu_interface = { diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index aad4bd84b475..f794bfa3c138 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -18,6 +18,7 @@ #include <linux/efi.h> #include <linux/interrupt.h> #include <linux/irq.h> +#include <linux/kexec.h> #include <asm/processor.h> #include <asm/hypervisor.h> #include <asm/hyperv.h> @@ -28,10 +29,14 @@ #include <asm/i8259.h> #include <asm/apic.h> #include <asm/timer.h> +#include <asm/reboot.h> struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); +static void (*hv_kexec_handler)(void); +static void (*hv_crash_handler)(struct pt_regs *regs); + #if IS_ENABLED(CONFIG_HYPERV) static void (*vmbus_handler)(void); @@ -67,8 +72,47 @@ void hv_remove_vmbus_irq(void) } EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq); EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); + +void hv_setup_kexec_handler(void (*handler)(void)) +{ + hv_kexec_handler = handler; +} +EXPORT_SYMBOL_GPL(hv_setup_kexec_handler); + +void hv_remove_kexec_handler(void) +{ + hv_kexec_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_remove_kexec_handler); + +void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)) +{ + hv_crash_handler = handler; +} +EXPORT_SYMBOL_GPL(hv_setup_crash_handler); + +void hv_remove_crash_handler(void) +{ + hv_crash_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_remove_crash_handler); #endif +static void hv_machine_shutdown(void) +{ + if (kexec_in_progress && hv_kexec_handler) + hv_kexec_handler(); + native_machine_shutdown(); +} + +static void hv_machine_crash_shutdown(struct pt_regs *regs) +{ + if (hv_crash_handler) + hv_crash_handler(regs); + native_machine_crash_shutdown(regs); +} + + static uint32_t __init ms_hyperv_platform(void) { u32 eax; @@ -114,6 +158,7 @@ static void __init ms_hyperv_init_platform(void) * Extract the features and hints */ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); + ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", @@ -141,6 +186,8 @@ static void __init ms_hyperv_init_platform(void) no_timer_check = 1; #endif + machine_ops.shutdown = hv_machine_shutdown; + machine_ops.crash_shutdown = hv_machine_crash_shutdown; } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f5791927aa64..c5a5231d1d11 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -136,7 +136,7 @@ enum ctx_state ist_enter(struct pt_regs *regs) preempt_count_add(HARDIRQ_OFFSET); /* This code is a bit fragile. Test it. */ - rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work"); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work"); return prev_state; } diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 67d215cb8953..a1ff508bb423 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -12,7 +12,9 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o + i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ + hyperv.o + kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o kvm-intel-y += vmx.o pmu_intel.o kvm-amd-y += svm.o pmu_amd.o diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c new file mode 100644 index 000000000000..a8160d2ae362 --- /dev/null +++ b/arch/x86/kvm/hyperv.c @@ -0,0 +1,377 @@ +/* + * KVM Microsoft Hyper-V emulation + * + * derived from arch/x86/kvm/x86.c + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright (C) 2008 Qumranet, Inc. + * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com> + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * Amit Shah <amit.shah@qumranet.com> + * Ben-Ami Yassour <benami@il.ibm.com> + * Andrey Smetanin <asmetanin@virtuozzo.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "x86.h" +#include "lapic.h" +#include "hyperv.h" + +#include <linux/kvm_host.h> +#include <trace/events/kvm.h> + +#include "trace.h" + +static bool kvm_hv_msr_partition_wide(u32 msr) +{ + bool r = false; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + case HV_X64_MSR_REFERENCE_TSC: + case HV_X64_MSR_TIME_REF_COUNT: + case HV_X64_MSR_CRASH_CTL: + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + r = true; + break; + } + + return r; +} + +static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu, + u32 index, u64 *pdata) +{ + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + + if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + return -EINVAL; + + *pdata = hv->hv_crash_param[index]; + return 0; +} + +static int kvm_hv_msr_get_crash_ctl(struct kvm_vcpu *vcpu, u64 *pdata) +{ + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + + *pdata = hv->hv_crash_ctl; + return 0; +} + +static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host) +{ + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + + if (host) + hv->hv_crash_ctl = data & HV_X64_MSR_CRASH_CTL_NOTIFY; + + if (!host && (data & HV_X64_MSR_CRASH_CTL_NOTIFY)) { + + vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", + hv->hv_crash_param[0], + hv->hv_crash_param[1], + hv->hv_crash_param[2], + hv->hv_crash_param[3], + hv->hv_crash_param[4]); + + /* Send notification about crash to user space */ + kvm_make_request(KVM_REQ_HV_CRASH, vcpu); + } + + return 0; +} + +static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, + u32 index, u64 data) +{ + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + + if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + return -EINVAL; + + hv->hv_crash_param[index] = data; + return 0; +} + +static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, + bool host) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = &kvm->arch.hyperv; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + hv->hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!hv->hv_guest_os_id) + hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + u64 gfn; + unsigned long addr; + u8 instructions[4]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!hv->hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + hv->hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return 1; + kvm_x86_ops->patch_hypercall(vcpu, instructions); + ((unsigned char *)instructions)[3] = 0xc3; /* ret */ + if (__copy_to_user((void __user *)addr, instructions, 4)) + return 1; + hv->hv_hypercall = data; + mark_page_dirty(kvm, gfn); + break; + } + case HV_X64_MSR_REFERENCE_TSC: { + u64 gfn; + HV_REFERENCE_TSC_PAGE tsc_ref; + + memset(&tsc_ref, 0, sizeof(tsc_ref)); + hv->hv_tsc_page = data; + if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + break; + gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; + if (kvm_write_guest( + kvm, + gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT, + &tsc_ref, sizeof(tsc_ref))) + return 1; + mark_page_dirty(kvm, gfn); + break; + } + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + return kvm_hv_msr_set_crash_data(vcpu, + msr - HV_X64_MSR_CRASH_P0, + data); + case HV_X64_MSR_CRASH_CTL: + return kvm_hv_msr_set_crash_ctl(vcpu, data, host); + default: + vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", + msr, data); + return 1; + } + return 0; +} + +static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; + + switch (msr) { + case HV_X64_MSR_APIC_ASSIST_PAGE: { + u64 gfn; + unsigned long addr; + + if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + hv->hv_vapic = data; + if (kvm_lapic_enable_pv_eoi(vcpu, 0)) + return 1; + break; + } + gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; + addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); + if (kvm_is_error_hva(addr)) + return 1; + if (__clear_user((void __user *)addr, PAGE_SIZE)) + return 1; + hv->hv_vapic = data; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + if (kvm_lapic_enable_pv_eoi(vcpu, + gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) + return 1; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + default: + vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", + msr, data); + return 1; + } + + return 0; +} + +static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = &kvm->arch.hyperv; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = hv->hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = hv->hv_hypercall; + break; + case HV_X64_MSR_TIME_REF_COUNT: { + data = + div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); + break; + } + case HV_X64_MSR_REFERENCE_TSC: + data = hv->hv_tsc_page; + break; + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + return kvm_hv_msr_get_crash_data(vcpu, + msr - HV_X64_MSR_CRASH_P0, + pdata); + case HV_X64_MSR_CRASH_CTL: + return kvm_hv_msr_get_crash_ctl(vcpu, pdata); + default: + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + + *pdata = data; + return 0; +} + +static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; + + switch (msr) { + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + + kvm_for_each_vcpu(r, v, vcpu->kvm) { + if (v == vcpu) { + data = r; + break; + } + } + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); + case HV_X64_MSR_APIC_ASSIST_PAGE: + data = hv->hv_vapic; + break; + default: + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} + +int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) +{ + if (kvm_hv_msr_partition_wide(msr)) { + int r; + + mutex_lock(&vcpu->kvm->lock); + r = kvm_hv_set_msr_pw(vcpu, msr, data, host); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return kvm_hv_set_msr(vcpu, msr, data); +} + +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + if (kvm_hv_msr_partition_wide(msr)) { + int r; + + mutex_lock(&vcpu->kvm->lock); + r = kvm_hv_get_msr_pw(vcpu, msr, pdata); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return kvm_hv_get_msr(vcpu, msr, pdata); +} + +bool kvm_hv_hypercall_enabled(struct kvm *kvm) +{ + return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; +} + +int kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + u64 param, ingpa, outgpa, ret; + uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + bool fast, longmode; + + /* + * hypercall generates UD from non zero cpl and real mode + * per HYPER-V spec + */ + if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + longmode = is_64_bit_mode(vcpu); + + if (!longmode) { + param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); + ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); + outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); + } +#ifdef CONFIG_X86_64 + else { + param = kvm_register_read(vcpu, VCPU_REGS_RCX); + ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); + outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + } +#endif + + code = param & 0xffff; + fast = (param >> 16) & 0x1; + rep_cnt = (param >> 32) & 0xfff; + rep_idx = (param >> 48) & 0xfff; + + trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + + switch (code) { + case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: + kvm_vcpu_on_spin(vcpu); + break; + default: + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + ret = res | (((u64)rep_done & 0xfff) << 32); + if (longmode) { + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + } else { + kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + } + + return 1; +} diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h new file mode 100644 index 000000000000..c7bce559f67b --- /dev/null +++ b/arch/x86/kvm/hyperv.h @@ -0,0 +1,32 @@ +/* + * KVM Microsoft Hyper-V emulation + * + * derived from arch/x86/kvm/x86.c + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright (C) 2008 Qumranet, Inc. + * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com> + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * Amit Shah <amit.shah@qumranet.com> + * Ben-Ami Yassour <benami@il.ibm.com> + * Andrey Smetanin <asmetanin@virtuozzo.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef __ARCH_X86_KVM_HYPERV_H__ +#define __ARCH_X86_KVM_HYPERV_H__ + +int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +bool kvm_hv_hypercall_enabled(struct kvm *kvm); +int kvm_hv_hypercall(struct kvm_vcpu *vcpu); + +#endif diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index fef922ff2635..7cc2360f1848 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -651,15 +651,10 @@ fail_unlock: return NULL; } -void kvm_destroy_pic(struct kvm *kvm) +void kvm_destroy_pic(struct kvm_pic *vpic) { - struct kvm_pic *vpic = kvm->arch.vpic; - - if (vpic) { - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_master); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_slave); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_eclr); - kvm->arch.vpic = NULL; - kfree(vpic); - } + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + kfree(vpic); } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index ad68c73008c5..3d782a2c336a 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -74,7 +74,7 @@ struct kvm_pic { }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_destroy_pic(struct kvm *kvm); +void kvm_destroy_pic(struct kvm_pic *vpic); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); @@ -85,11 +85,11 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) static inline int irqchip_in_kernel(struct kvm *kvm) { - int ret; + struct kvm_pic *vpic = pic_irqchip(kvm); - ret = (pic_irqchip(kvm) != NULL); + /* Read vpic before kvm->irq_routing. */ smp_rmb(); - return ret; + return vpic != NULL; } void kvm_pic_reset(struct kvm_kpic_state *s); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2a5ca97c263b..9a3e342e3cda 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1900,8 +1900,9 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32)); + if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32))) + return; apic_set_tpr(vcpu->arch.apic, data & 0xff); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 71952748222a..764037991d26 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -91,7 +91,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) { - return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; + return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; } int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 44171462bd2a..fb16a8ea3dee 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -357,12 +357,6 @@ static u64 __get_spte_lockless(u64 *sptep) { return ACCESS_ONCE(*sptep); } - -static bool __check_direct_spte_mmio_pf(u64 spte) -{ - /* It is valid if the spte is zapped. */ - return spte == 0ull; -} #else union split_spte { struct { @@ -478,23 +472,6 @@ retry: return spte.spte; } - -static bool __check_direct_spte_mmio_pf(u64 spte) -{ - union split_spte sspte = (union split_spte)spte; - u32 high_mmio_mask = shadow_mmio_mask >> 32; - - /* It is valid if the spte is zapped. */ - if (spte == 0ull) - return true; - - /* It is valid if the spte is being zapped. */ - if (sspte.spte_low == 0ull && - (sspte.spte_high & high_mmio_mask) == high_mmio_mask) - return true; - - return false; -} #endif static bool spte_is_locklessly_modifiable(u64 spte) @@ -3291,54 +3268,89 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); } -static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) +static bool +__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) { - if (direct) - return vcpu_match_mmio_gpa(vcpu, addr); + int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f; - return vcpu_match_mmio_gva(vcpu, addr); + return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) | + ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0); } +static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) +{ + return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level); +} -/* - * On direct hosts, the last spte is only allows two states - * for mmio page fault: - * - It is the mmio spte - * - It is zapped or it is being zapped. - * - * This function completely checks the spte when the last spte - * is not the mmio spte. - */ -static bool check_direct_spte_mmio_pf(u64 spte) +static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level) { - return __check_direct_spte_mmio_pf(spte); + return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level); } -static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) +static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) +{ + if (direct) + return vcpu_match_mmio_gpa(vcpu, addr); + + return vcpu_match_mmio_gva(vcpu, addr); +} + +/* return true if reserved bit is detected on spte. */ +static bool +walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { struct kvm_shadow_walk_iterator iterator; - u64 spte = 0ull; + u64 sptes[PT64_ROOT_LEVEL], spte = 0ull; + int root, leaf; + bool reserved = false; if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return spte; + goto exit; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) + + for (shadow_walk_init(&iterator, vcpu, addr), root = iterator.level; + shadow_walk_okay(&iterator); + __shadow_walk_next(&iterator, spte)) { + leaf = iterator.level; + spte = mmu_spte_get_lockless(iterator.sptep); + + sptes[leaf - 1] = spte; + if (!is_shadow_present_pte(spte)) break; + + reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte, + leaf); + } + walk_shadow_page_lockless_end(vcpu); - return spte; + if (reserved) { + pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", + __func__, addr); + while (root >= leaf) { + pr_err("------ spte 0x%llx level %d.\n", + sptes[root - 1], root); + root--; + } + } +exit: + *sptep = spte; + return reserved; } int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; + bool reserved; if (quickly_check_mmio_pf(vcpu, addr, direct)) return RET_MMIO_PF_EMULATE; - spte = walk_shadow_page_get_mmio_spte(vcpu, addr); + reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); + if (unlikely(reserved)) + return RET_MMIO_PF_BUG; if (is_mmio_spte(spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); @@ -3356,13 +3368,6 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) } /* - * It's ok if the gva is remapped by other cpus on shadow guest, - * it's a BUG if the gfn is not a mmio page. - */ - if (direct && !check_direct_spte_mmio_pf(spte)) - return RET_MMIO_PF_BUG; - - /* * If the page table is zapped by other cpus, let CPU fault again on * the address. */ @@ -3604,19 +3609,21 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp #include "paging_tmpl.h" #undef PTTYPE -static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void +__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, + struct rsvd_bits_validate *rsvd_check, + int maxphyaddr, int level, bool nx, bool gbpages, + bool pse) { - int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; u64 gbpages_bit_rsvd = 0; u64 nonleaf_bit8_rsvd = 0; - context->bad_mt_xwr = 0; + rsvd_check->bad_mt_xwr = 0; - if (!context->nx) + if (!nx) exb_bit_rsvd = rsvd_bits(63, 63); - if (!guest_cpuid_has_gbpages(vcpu)) + if (!gbpages) gbpages_bit_rsvd = rsvd_bits(7, 7); /* @@ -3626,80 +3633,95 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, if (guest_cpuid_is_amd(vcpu)) nonleaf_bit8_rsvd = rsvd_bits(8, 8); - switch (context->root_level) { + switch (level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ - context->rsvd_bits_mask[0][1] = 0; - context->rsvd_bits_mask[0][0] = 0; - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[0][1] = 0; + rsvd_check->rsvd_bits_mask[0][0] = 0; + rsvd_check->rsvd_bits_mask[1][0] = + rsvd_check->rsvd_bits_mask[0][0]; - if (!is_pse(vcpu)) { - context->rsvd_bits_mask[1][1] = 0; + if (!pse) { + rsvd_check->rsvd_bits_mask[1][1] = 0; break; } if (is_cpuid_PSE36()) /* 36bits PSE 4MB page */ - context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); else /* 32 bits PSE 4MB page */ - context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); break; case PT32E_ROOT_LEVEL: - context->rsvd_bits_mask[0][2] = + rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(maxphyaddr, 63) | rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ - context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62); /* PDE */ - context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62); /* PTE */ - context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[1][0] = + rsvd_check->rsvd_bits_mask[0][0]; break; case PT64_ROOT_LEVEL: - context->rsvd_bits_mask[0][3] = exb_bit_rsvd | - nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[0][2] = exb_bit_rsvd | - nonleaf_bit8_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | + nonleaf_bit8_rsvd | rsvd_bits(7, 7) | + rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | + nonleaf_bit8_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; - context->rsvd_bits_mask[1][2] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[1][3] = + rsvd_check->rsvd_bits_mask[0][3]; + rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 29); - context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[1][0] = + rsvd_check->rsvd_bits_mask[0][0]; break; } } -static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, bool execonly) +static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) +{ + __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, + cpuid_maxphyaddr(vcpu), context->root_level, + context->nx, guest_cpuid_has_gbpages(vcpu), + is_pse(vcpu)); +} + +static void +__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, + int maxphyaddr, bool execonly) { - int maxphyaddr = cpuid_maxphyaddr(vcpu); int pte; - context->rsvd_bits_mask[0][3] = + rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); - context->rsvd_bits_mask[0][2] = + rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - context->rsvd_bits_mask[0][1] = + rsvd_check->rsvd_bits_mask[0][1] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); /* large page */ - context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; - context->rsvd_bits_mask[1][2] = + rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; + rsvd_check->rsvd_bits_mask[1][2] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); - context->rsvd_bits_mask[1][1] = + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; for (pte = 0; pte < 64; pte++) { int rwx_bits = pte & 7; @@ -3707,10 +3729,64 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, if (mt == 0x2 || mt == 0x3 || mt == 0x7 || rwx_bits == 0x2 || rwx_bits == 0x6 || (rwx_bits == 0x4 && !execonly)) - context->bad_mt_xwr |= (1ull << pte); + rsvd_check->bad_mt_xwr |= (1ull << pte); } } +static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, bool execonly) +{ + __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, + cpuid_maxphyaddr(vcpu), execonly); +} + +/* + * the page table on host is the shadow page table for the page + * table in guest or amd nested guest, its mmu features completely + * follow the features in guest. + */ +void +reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +{ + __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, + context->shadow_root_level, context->nx, + guest_cpuid_has_gbpages(vcpu), is_pse(vcpu)); +} +EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); + +/* + * the direct page table on host, use as much mmu features as + * possible, however, kvm currently does not do execution-protection. + */ +static void +reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) +{ + if (guest_cpuid_is_amd(vcpu)) + __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, + context->shadow_root_level, false, + cpu_has_gbpages, true); + else + __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, + false); + +} + +/* + * as the comments in reset_shadow_zero_bits_mask() except it + * is the shadow page table for intel nested guest. + */ +static void +reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, bool execonly) +{ + __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, execonly); +} + static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool ept) { @@ -3889,6 +3965,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); + reset_tdp_shadow_zero_bits_mask(vcpu, context); } void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) @@ -3916,6 +3993,7 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) context->base_role.smap_andnot_wp = smap && !is_write_protection(vcpu); context->base_role.smm = is_smm(vcpu); + reset_shadow_zero_bits_mask(vcpu, context); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); @@ -3939,6 +4017,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) update_permission_bitmask(vcpu, context, true); reset_rsvds_bits_mask_ept(vcpu, context, execonly); + reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); @@ -4860,28 +4939,6 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) return nr_mmu_pages; } -int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) -{ - struct kvm_shadow_walk_iterator iterator; - u64 spte; - int nr_sptes = 0; - - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return nr_sptes; - - walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { - sptes[iterator.level-1] = spte; - nr_sptes++; - if (!is_shadow_present_pte(spte)) - break; - } - walk_shadow_page_lockless_end(vcpu); - - return nr_sptes; -} -EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); - void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 398d21c0f6dd..e4202e41d535 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -50,9 +50,11 @@ static inline u64 rsvd_bits(int s, int e) return ((1ULL << (e - s + 1)) - 1) << s; } -int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); +void +reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); + /* * Return values of handle_mmio_page_fault_common: * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 0f67d7e24800..736e6ab8784d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -128,14 +128,6 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) *access &= mask; } -static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f; - - return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) | - ((mmu->bad_mt_xwr & (1ull << low6)) != 0); -} - static inline int FNAME(is_present_gpte)(unsigned long pte) { #if PTTYPE != PTTYPE_EPT @@ -172,7 +164,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, u64 gpte) { - if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) goto no_present; if (!FNAME(is_present_gpte)(gpte)) @@ -353,8 +345,7 @@ retry_walk: if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; - if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, - walker->level))) { + if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) { errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index 886aa25a7131..39b91127ef07 100644 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -133,8 +133,6 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* MSR_K7_PERFCTRn */ pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0); if (pmc) { - if (!msr_info->host_initiated) - data = (s64)data; pmc->counter += data - pmc_read_counter(pmc); return 0; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8e0c0844c6b9..74d825716f4f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1173,6 +1173,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) if (!is_mmio && !kvm_arch_has_assigned_device(vcpu->kvm)) return 0; + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED) && + kvm_read_cr0(vcpu) & X86_CR0_CD) + return _PAGE_NOCACHE; + mtrr = kvm_mtrr_get_guest_memory_type(vcpu, gfn); return mtrr2protval[mtrr]; } @@ -1667,13 +1671,10 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!vcpu->fpu_active) cr0 |= X86_CR0_TS; - /* - * re-enable caching here because the QEMU bios - * does not do it - this results in some delay at - * reboot - */ - if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - cr0 &= ~(X86_CR0_CD | X86_CR0_NW); + + /* These are emulated via page tables. */ + cr0 &= ~(X86_CR0_CD | X86_CR0_NW); + svm->vmcb->save.cr0 = cr0; mark_dirty(svm->vmcb, VMCB_CR); update_cr0_intercept(svm); @@ -2106,6 +2107,7 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; vcpu->arch.mmu.shadow_root_level = get_npt_level(); + reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 83b7b5cd75d5..da1590ea43fc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2443,10 +2443,10 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | #endif CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | - CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | - CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | - CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | + CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | + CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | + CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; /* * We can allow some features even when not supported by the * hardware. For example, L1 can specify an MSR bitmap - and we @@ -3423,12 +3423,12 @@ static void enter_lmode(struct kvm_vcpu *vcpu) vmx_segment_cache_clear(to_vmx(vcpu)); guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); - if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { + if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { pr_debug_ratelimited("%s: tss fixup for long mode. \n", __func__); vmcs_write32(GUEST_TR_AR_BYTES, - (guest_tr_ar & ~AR_TYPE_MASK) - | AR_TYPE_BUSY_64_TSS); + (guest_tr_ar & ~VMX_AR_TYPE_MASK) + | VMX_AR_TYPE_BUSY_64_TSS); } vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); } @@ -3719,7 +3719,7 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu) return 0; else { int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); - return AR_DPL(ar); + return VMX_AR_DPL(ar); } } @@ -3847,11 +3847,11 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu) if (cs.unusable) return false; - if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) + if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) return false; if (!cs.s) return false; - if (cs.type & AR_TYPE_WRITEABLE_MASK) { + if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { if (cs.dpl > cs_rpl) return false; } else { @@ -3901,7 +3901,7 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) return false; if (!var.present) return false; - if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) { + if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { if (var.dpl < rpl) /* DPL < RPL */ return false; } @@ -5759,73 +5759,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } -static u64 ept_rsvd_mask(u64 spte, int level) -{ - int i; - u64 mask = 0; - - for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) - mask |= (1ULL << i); - - if (level == 4) - /* bits 7:3 reserved */ - mask |= 0xf8; - else if (spte & (1ULL << 7)) - /* - * 1GB/2MB page, bits 29:12 or 20:12 reserved respectively, - * level == 1 if the hypervisor is using the ignored bit 7. - */ - mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE; - else if (level > 1) - /* bits 6:3 reserved */ - mask |= 0x78; - - return mask; -} - -static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, - int level) -{ - printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level); - - /* 010b (write-only) */ - WARN_ON((spte & 0x7) == 0x2); - - /* 110b (write/execute) */ - WARN_ON((spte & 0x7) == 0x6); - - /* 100b (execute-only) and value not supported by logical processor */ - if (!cpu_has_vmx_ept_execute_only()) - WARN_ON((spte & 0x7) == 0x4); - - /* not 000b */ - if ((spte & 0x7)) { - u64 rsvd_bits = spte & ept_rsvd_mask(spte, level); - - if (rsvd_bits != 0) { - printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n", - __func__, rsvd_bits); - WARN_ON(1); - } - - /* bits 5:3 are _not_ reserved for large page or leaf page */ - if ((rsvd_bits & 0x38) == 0) { - u64 ept_mem_type = (spte & 0x38) >> 3; - - if (ept_mem_type == 2 || ept_mem_type == 3 || - ept_mem_type == 7) { - printk(KERN_ERR "%s: ept_mem_type=0x%llx\n", - __func__, ept_mem_type); - WARN_ON(1); - } - } - } -} - static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { - u64 sptes[4]; - int nr_sptes, i, ret; + int ret; gpa_t gpa; gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); @@ -5846,13 +5782,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) return 1; /* It is the real ept misconfig */ - printk(KERN_ERR "EPT: Misconfiguration.\n"); - printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); - - nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes); - - for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) - ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); + WARN_ON(1); vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; @@ -6246,6 +6176,11 @@ static int handle_mwait(struct kvm_vcpu *vcpu) return handle_nop(vcpu); } +static int handle_monitor_trap(struct kvm_vcpu *vcpu) +{ + return 1; +} + static int handle_monitor(struct kvm_vcpu *vcpu) { printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); @@ -6408,8 +6343,12 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) */ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, - u32 vmx_instruction_info, gva_t *ret) + u32 vmx_instruction_info, bool wr, gva_t *ret) { + gva_t off; + bool exn; + struct kvm_segment s; + /* * According to Vol. 3B, "Information for VM Exits Due to Instruction * Execution", on an exit, vmx_instruction_info holds most of the @@ -6434,22 +6373,63 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, /* Addr = segment_base + offset */ /* offset = base + [index * scale] + displacement */ - *ret = vmx_get_segment_base(vcpu, seg_reg); + off = exit_qualification; /* holds the displacement */ if (base_is_valid) - *ret += kvm_register_read(vcpu, base_reg); + off += kvm_register_read(vcpu, base_reg); if (index_is_valid) - *ret += kvm_register_read(vcpu, index_reg)<<scaling; - *ret += exit_qualification; /* holds the displacement */ + off += kvm_register_read(vcpu, index_reg)<<scaling; + vmx_get_segment(vcpu, &s, seg_reg); + *ret = s.base + off; if (addr_size == 1) /* 32 bit */ *ret &= 0xffffffff; - /* - * TODO: throw #GP (and return 1) in various cases that the VM* - * instructions require it - e.g., offset beyond segment limit, - * unusable or unreadable/unwritable segment, non-canonical 64-bit - * address, and so on. Currently these are not checked. - */ + /* Checks for #GP/#SS exceptions. */ + exn = false; + if (is_protmode(vcpu)) { + /* Protected mode: apply checks for segment validity in the + * following order: + * - segment type check (#GP(0) may be thrown) + * - usability check (#GP(0)/#SS(0)) + * - limit check (#GP(0)/#SS(0)) + */ + if (wr) + /* #GP(0) if the destination operand is located in a + * read-only data segment or any code segment. + */ + exn = ((s.type & 0xa) == 0 || (s.type & 8)); + else + /* #GP(0) if the source operand is located in an + * execute-only code segment + */ + exn = ((s.type & 0xa) == 8); + } + if (exn) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + if (is_long_mode(vcpu)) { + /* Long mode: #GP(0)/#SS(0) if the memory address is in a + * non-canonical form. This is an only check for long mode. + */ + exn = is_noncanonical_address(*ret); + } else if (is_protmode(vcpu)) { + /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. + */ + exn = (s.unusable != 0); + /* Protected mode: #GP(0)/#SS(0) if the memory + * operand is outside the segment limit. + */ + exn = exn || (off + sizeof(u64) > s.limit); + } + if (exn) { + kvm_queue_exception_e(vcpu, + seg_reg == VCPU_SREG_SS ? + SS_VECTOR : GP_VECTOR, + 0); + return 1; + } + return 0; } @@ -6471,7 +6451,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, int maxphyaddr = cpuid_maxphyaddr(vcpu); if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) + vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, @@ -6999,7 +6979,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) field_value); } else { if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, &gva)) + vmx_instruction_info, true, &gva)) return 1; /* _system ok, as nested_vmx_check_permission verified cpl=0 */ kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, @@ -7036,7 +7016,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) (((vmx_instruction_info) >> 3) & 0xf)); else { if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, &gva)) + vmx_instruction_info, false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) { @@ -7128,7 +7108,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return 1; if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, &vmcs_gva)) + vmx_instruction_info, true, &vmcs_gva)) return 1; /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, @@ -7184,7 +7164,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) * operand is read even if it isn't needed (e.g., for type==global) */ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmx_instruction_info, &gva)) + vmx_instruction_info, false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, sizeof(operand), &e)) { @@ -7282,6 +7262,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, + [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, [EXIT_REASON_INVEPT] = handle_invept, [EXIT_REASON_INVVPID] = handle_invvpid, @@ -7542,6 +7523,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return true; case EXIT_REASON_MWAIT_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); + case EXIT_REASON_MONITOR_TRAP_FLAG: + return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); case EXIT_REASON_MONITOR_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); case EXIT_REASON_PAUSE_INSTRUCTION: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f0f6eca69da..4bbc2a1676c9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -29,6 +29,7 @@ #include "cpuid.h" #include "assigned-dev.h" #include "pmu.h" +#include "hyperv.h" #include <linux/clocksource.h> #include <linux/interrupt.h> @@ -221,11 +222,9 @@ static void shared_msr_update(unsigned slot, u32 msr) void kvm_define_shared_msr(unsigned slot, u32 msr) { BUG_ON(slot >= KVM_NR_SHARED_MSRS); + shared_msrs_global.msrs[slot] = msr; if (slot >= shared_msrs_global.nr) shared_msrs_global.nr = slot + 1; - shared_msrs_global.msrs[slot] = msr; - /* we need ensured the shared_msr_global have been updated */ - smp_wmb(); } EXPORT_SYMBOL_GPL(kvm_define_shared_msr); @@ -526,7 +525,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if (is_present_gpte(pdpte[i]) && - (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { + (pdpte[i] & + vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) { ret = 0; goto out; } @@ -949,6 +949,8 @@ static u32 emulated_msrs[] = { MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, + HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, @@ -1217,11 +1219,6 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, __func__, base_khz, scaled_khz, shift, *pmultiplier); } -static inline u64 get_kernel_ns(void) -{ - return ktime_get_boot_ns(); -} - #ifdef CONFIG_X86_64 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); #endif @@ -1869,123 +1866,6 @@ out: return r; } -static bool kvm_hv_hypercall_enabled(struct kvm *kvm) -{ - return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; -} - -static bool kvm_hv_msr_partition_wide(u32 msr) -{ - bool r = false; - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - case HV_X64_MSR_HYPERCALL: - case HV_X64_MSR_REFERENCE_TSC: - case HV_X64_MSR_TIME_REF_COUNT: - r = true; - break; - } - - return r; -} - -static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - struct kvm *kvm = vcpu->kvm; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - kvm->arch.hv_guest_os_id = data; - /* setting guest os id to zero disables hypercall page */ - if (!kvm->arch.hv_guest_os_id) - kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; - break; - case HV_X64_MSR_HYPERCALL: { - u64 gfn; - unsigned long addr; - u8 instructions[4]; - - /* if guest os id is not set hypercall should remain disabled */ - if (!kvm->arch.hv_guest_os_id) - break; - if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { - kvm->arch.hv_hypercall = data; - break; - } - gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return 1; - kvm_x86_ops->patch_hypercall(vcpu, instructions); - ((unsigned char *)instructions)[3] = 0xc3; /* ret */ - if (__copy_to_user((void __user *)addr, instructions, 4)) - return 1; - kvm->arch.hv_hypercall = data; - mark_page_dirty(kvm, gfn); - break; - } - case HV_X64_MSR_REFERENCE_TSC: { - u64 gfn; - HV_REFERENCE_TSC_PAGE tsc_ref; - memset(&tsc_ref, 0, sizeof(tsc_ref)); - kvm->arch.hv_tsc_page = data; - if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE)) - break; - gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; - if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT, - &tsc_ref, sizeof(tsc_ref))) - return 1; - mark_page_dirty(kvm, gfn); - break; - } - default: - vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); - return 1; - } - return 0; -} - -static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - switch (msr) { - case HV_X64_MSR_APIC_ASSIST_PAGE: { - u64 gfn; - unsigned long addr; - - if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { - vcpu->arch.hv_vapic = data; - if (kvm_lapic_enable_pv_eoi(vcpu, 0)) - return 1; - break; - } - gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; - addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); - if (kvm_is_error_hva(addr)) - return 1; - if (__clear_user((void __user *)addr, PAGE_SIZE)) - return 1; - vcpu->arch.hv_vapic = data; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) - return 1; - break; - } - case HV_X64_MSR_EOI: - return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); - case HV_X64_MSR_ICR: - return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); - case HV_X64_MSR_TPR: - return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); - default: - vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); - return 1; - } - - return 0; -} - static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; @@ -2224,15 +2104,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - if (kvm_hv_msr_partition_wide(msr)) { - int r; - mutex_lock(&vcpu->kvm->lock); - r = set_msr_hyperv_pw(vcpu, msr, data); - mutex_unlock(&vcpu->kvm->lock); - return r; - } else - return set_msr_hyperv(vcpu, msr, data); - break; + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_CRASH_CTL: + return kvm_hv_set_msr_common(vcpu, msr, data, + msr_info->host_initiated); case MSR_IA32_BBL_CR_CTL3: /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. @@ -2315,68 +2190,6 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } -static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data = 0; - struct kvm *kvm = vcpu->kvm; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - data = kvm->arch.hv_guest_os_id; - break; - case HV_X64_MSR_HYPERCALL: - data = kvm->arch.hv_hypercall; - break; - case HV_X64_MSR_TIME_REF_COUNT: { - data = - div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); - break; - } - case HV_X64_MSR_REFERENCE_TSC: - data = kvm->arch.hv_tsc_page; - break; - default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return 1; - } - - *pdata = data; - return 0; -} - -static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data = 0; - - switch (msr) { - case HV_X64_MSR_VP_INDEX: { - int r; - struct kvm_vcpu *v; - kvm_for_each_vcpu(r, v, vcpu->kvm) { - if (v == vcpu) { - data = r; - break; - } - } - break; - } - case HV_X64_MSR_EOI: - return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); - case HV_X64_MSR_ICR: - return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); - case HV_X64_MSR_TPR: - return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); - case HV_X64_MSR_APIC_ASSIST_PAGE: - data = vcpu->arch.hv_vapic; - break; - default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return 1; - } - *pdata = data; - return 0; -} - int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { switch (msr_info->index) { @@ -2493,14 +2306,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x20000000; break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - if (kvm_hv_msr_partition_wide(msr_info->index)) { - int r; - mutex_lock(&vcpu->kvm->lock); - r = get_msr_hyperv_pw(vcpu, msr_info->index, &msr_info->data); - mutex_unlock(&vcpu->kvm->lock); - return r; - } else - return get_msr_hyperv(vcpu, msr_info->index, &msr_info->data); + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_CRASH_CTL: + return kvm_hv_get_msr_common(vcpu, + msr_info->index, &msr_info->data); break; case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current @@ -2651,6 +2460,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_TSC_DEADLINE_TIMER: case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_DISABLE_QUIRKS: + case KVM_CAP_SET_BOOT_CPU_ID: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -3817,30 +3627,25 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_ioapic_init(kvm); if (r) { mutex_lock(&kvm->slots_lock); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev_master); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev_slave); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev_eclr); + kvm_destroy_pic(vpic); mutex_unlock(&kvm->slots_lock); - kfree(vpic); goto create_irqchip_unlock; } } else goto create_irqchip_unlock; - smp_wmb(); - kvm->arch.vpic = vpic; - smp_wmb(); r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); - kvm_destroy_pic(kvm); + kvm_destroy_pic(vpic); mutex_unlock(&kvm->irq_lock); mutex_unlock(&kvm->slots_lock); + goto create_irqchip_unlock; } + /* Write kvm->irq_routing before kvm->arch.vpic. */ + smp_wmb(); + kvm->arch.vpic = vpic; create_irqchip_unlock: mutex_unlock(&kvm->lock); break; @@ -3967,6 +3772,15 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_reinject(kvm, &control); break; } + case KVM_SET_BOOT_CPU_ID: + r = 0; + mutex_lock(&kvm->lock); + if (atomic_read(&kvm->online_vcpus) != 0) + r = -EBUSY; + else + kvm->arch.bsp_vcpu_id = arg; + mutex_unlock(&kvm->lock); + break; case KVM_XEN_HVM_CONFIG: { r = -EFAULT; if (copy_from_user(&kvm->arch.xen_hvm_config, argp, @@ -5882,66 +5696,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); -int kvm_hv_hypercall(struct kvm_vcpu *vcpu) -{ - u64 param, ingpa, outgpa, ret; - uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; - bool fast, longmode; - - /* - * hypercall generates UD from non zero cpl and real mode - * per HYPER-V spec - */ - if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 0; - } - - longmode = is_64_bit_mode(vcpu); - - if (!longmode) { - param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); - ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); - outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); - } -#ifdef CONFIG_X86_64 - else { - param = kvm_register_read(vcpu, VCPU_REGS_RCX); - ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); - outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); - } -#endif - - code = param & 0xffff; - fast = (param >> 16) & 0x1; - rep_cnt = (param >> 32) & 0xfff; - rep_idx = (param >> 48) & 0xfff; - - trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); - - switch (code) { - case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: - kvm_vcpu_on_spin(vcpu); - break; - default: - res = HV_STATUS_INVALID_HYPERCALL_CODE; - break; - } - - ret = res | (((u64)rep_done & 0xfff) << 32); - if (longmode) { - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - } else { - kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); - kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); - } - - return 1; -} - /* * kvm_pv_kick_cpu_op: Kick a vcpu. * @@ -6518,6 +6272,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu_scan_ioapic(vcpu); if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) kvm_vcpu_reload_apic_access_page(vcpu); + if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH; + r = 0; + goto out; + } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -7540,6 +7300,17 @@ void kvm_arch_check_processor_compat(void *rtn) kvm_x86_ops->check_processor_compatibility(rtn); } +bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); + +bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; +} + bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 0ca2f3e4803c..2f822cd886c2 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -147,6 +147,11 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, return kvm_register_write(vcpu, reg, val); } +static inline u64 get_kernel_ns(void) +{ + return ktime_get_boot_ns(); +} + static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) { return !(kvm->arch.disabled_quirks & quirk); diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 8fd6f44aee83..09d3afc0a181 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -166,7 +166,6 @@ void pcibios_fixup_bus(struct pci_bus *b) { struct pci_dev *dev; - pci_read_bridge_bases(b); list_for_each_entry(dev, &b->devices, bus_list) pcibios_fixup_device_resources(dev); } @@ -673,24 +672,22 @@ int pcibios_add_device(struct pci_dev *dev) return 0; } -int pcibios_enable_device(struct pci_dev *dev, int mask) +int pcibios_alloc_irq(struct pci_dev *dev) { - int err; - - if ((err = pci_enable_resources(dev, mask)) < 0) - return err; - - if (!pci_dev_msi_enabled(dev)) - return pcibios_enable_irq(dev); - return 0; + return pcibios_enable_irq(dev); } -void pcibios_disable_device (struct pci_dev *dev) +void pcibios_free_irq(struct pci_dev *dev) { - if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq) + if (pcibios_disable_irq) pcibios_disable_irq(dev); } +int pcibios_enable_device(struct pci_dev *dev, int mask) +{ + return pci_enable_resources(dev, mask); +} + int pci_ext_cfg_avail(void) { if (raw_pci_ext_ops) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 9a2b7101ae8a..e58565556703 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -62,19 +62,6 @@ static void pci_fixup_umc_ide(struct pci_dev *d) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide); -static void pci_fixup_ncr53c810(struct pci_dev *d) -{ - /* - * NCR 53C810 returns class code 0 (at least on some systems). - * Fix class to be PCI_CLASS_STORAGE_SCSI - */ - if (!d->class) { - dev_warn(&d->dev, "Fixing NCR 53C810 class code\n"); - d->class = PCI_CLASS_STORAGE_SCSI << 8; - } -} -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C810, pci_fixup_ncr53c810); - static void pci_fixup_latency(struct pci_dev *d) { /* diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 27062303c881..22aaefb4f1ca 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -211,7 +211,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) struct irq_alloc_info info; int polarity; - if (dev->irq_managed && dev->irq > 0) + if (pci_has_managed_irq(dev)) return 0; if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER) @@ -234,10 +234,13 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) static void intel_mid_pci_irq_disable(struct pci_dev *dev) { - if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed && - dev->irq > 0) { + if (pci_has_managed_irq(dev)) { mp_unmap_irq(dev->irq); dev->irq_managed = 0; + /* + * Don't reset dev->irq here, otherwise + * intel_mid_pci_irq_enable() will fail on next call. + */ } } diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 9bd115484745..32e70343e6fd 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1202,7 +1202,7 @@ static int pirq_enable_irq(struct pci_dev *dev) struct pci_dev *temp_dev; int irq; - if (dev->irq_managed && dev->irq > 0) + if (pci_has_managed_irq(dev)) return 0; irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, @@ -1230,8 +1230,7 @@ static int pirq_enable_irq(struct pci_dev *dev) } dev = temp_dev; if (irq >= 0) { - dev->irq_managed = 1; - dev->irq = irq; + pci_set_managed_irq(dev, irq); dev_info(&dev->dev, "PCI->APIC IRQ transform: " "INT %c -> IRQ %d\n", 'A' + pin - 1, irq); return 0; @@ -1257,24 +1256,10 @@ static int pirq_enable_irq(struct pci_dev *dev) return 0; } -bool mp_should_keep_irq(struct device *dev) -{ - if (dev->power.is_prepared) - return true; -#ifdef CONFIG_PM - if (dev->power.runtime_status == RPM_SUSPENDING) - return true; -#endif - - return false; -} - static void pirq_disable_irq(struct pci_dev *dev) { - if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) && - dev->irq_managed && dev->irq) { + if (io_apic_assign_pci_irqs && pci_has_managed_irq(dev)) { mp_unmap_irq(dev->irq); - dev->irq = 0; - dev->irq_managed = 0; + pci_reset_managed_irq(dev); } } |