diff options
author | Ingo Molnar <mingo@kernel.org> | 2017-07-30 11:15:13 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2017-07-30 11:15:13 +0200 |
commit | f5db340f19f14a8df9dfd22d71fba1513e9f1f7e (patch) | |
tree | 131d3345bc987aee3c922624de816492e7f323a4 /arch/x86 | |
parent | ee438ec8f33c5af0d4a4ffb935c5b9272e8c2680 (diff) | |
parent | 38115f2f8cec8087d558c062e779c443a01f87d6 (diff) | |
download | talos-obmc-linux-f5db340f19f14a8df9dfd22d71fba1513e9f1f7e.tar.gz talos-obmc-linux-f5db340f19f14a8df9dfd22d71fba1513e9f1f7e.zip |
Merge branch 'perf/urgent' into perf/core, to pick up latest fixes and refresh the tree
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86')
124 files changed, 2277 insertions, 1251 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e767ed24aeb4..781521b7cf9e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,7 +22,7 @@ config X86_64 def_bool y depends on 64BIT # Options that are inherently 64-bit kernel only: - select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_SUPPORTS_INT128 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY @@ -50,15 +50,18 @@ config X86 select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER + select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_ZONE_DEVICE if X86_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT @@ -72,6 +75,7 @@ config X86 select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_FRAME_POINTERS select ARCH_WANTS_DYNAMIC_TASK_STRUCT + select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_EXTABLE_SORT select CLKEVT_I8253 select CLOCKSOURCE_VALIDATE_LAST_CYCLE @@ -159,6 +163,7 @@ config X86 select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI + select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index fcb7604172ce..cd20ca0b4043 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -348,6 +348,7 @@ config X86_DEBUG_FPU config PUNIT_ATOM_DEBUG tristate "ATOM Punit debug driver" + depends on PCI select DEBUG_FS select IOSF_MBI ---help--- diff --git a/arch/x86/Makefile b/arch/x86/Makefile index ad2db82e9953..1e902f926be3 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -11,6 +11,14 @@ else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif +# For gcc stack alignment is specified with -mpreferred-stack-boundary, +# clang has the option -mstack-alignment for that purpose. +ifneq ($(call cc-option, -mpreferred-stack-boundary=4),) + cc_stack_align_opt := -mpreferred-stack-boundary +else ifneq ($(call cc-option, -mstack-alignment=4),) + cc_stack_align_opt := -mstack-alignment +endif + # How to compile the 16-bit code. Note we always compile for -march=i386; # that way we can complain to the user if the CPU is insufficient. # @@ -24,10 +32,11 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ - -mno-mmx -mno-sse \ - $(call cc-option, -ffreestanding) \ - $(call cc-option, -fno-stack-protector) \ - $(call cc-option, -mpreferred-stack-boundary=2) + -mno-mmx -mno-sse + +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align_opt)=2) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit @@ -64,8 +73,10 @@ ifeq ($(CONFIG_X86_32),y) # with nonstandard options KBUILD_CFLAGS += -fno-pic - # prevent gcc from keeping the stack 16 byte aligned - KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) + # Align the stack to the register width instead of using the default + # alignment of 16 bytes. This reduces stack usage and the number of + # alignment instructions. + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=2) # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots: @@ -97,8 +108,14 @@ else KBUILD_CFLAGS += $(call cc-option,-mno-80387) KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) - # Use -mpreferred-stack-boundary=3 if supported. - KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3) + # By default gcc and clang use a stack alignment of 16 bytes for x86. + # However the standard kernel entry on x86-64 leaves the stack on an + # 8-byte boundary. If the compiler isn't informed about the actual + # alignment it will generate extra alignment instructions for the + # default alignment which keep the stack *mis*aligned. + # Furthermore an alignment to the register width reduces stack usage + # and the number of alignment instructions. + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=3) # Use -mskip-rax-setup if supported. KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index a45eb15b7cf2..f3717d36718a 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -9,7 +9,6 @@ else tune = $(call cc-option,-mcpu=$(1),$(2)) endif -align := $(cc-option-align) cflags-$(CONFIG_M486) += -march=i486 cflags-$(CONFIG_M586) += -march=i586 cflags-$(CONFIG_M586TSC) += -march=i586 @@ -24,11 +23,11 @@ cflags-$(CONFIG_MK6) += -march=k6 # They make zero difference whatsosever to performance at this time. cflags-$(CONFIG_MK7) += -march=athlon cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) -cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 -cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 +cflags-$(CONFIG_MCRUSOE) += -march=i686 -falign-functions=0 -falign-jumps=0 -falign-loops=0 +cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) -falign-functions=0 -falign-jumps=0 -falign-loops=0 cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) -cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 +cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) -falign-functions=0 -falign-jumps=0 -falign-loops=0 cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) cflags-$(CONFIG_MVIAC7) += -march=i686 cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 0d810fb15eac..d88a2fddba8c 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -73,12 +73,13 @@ UBSAN_SANITIZE := n $(obj)/bzImage: asflags-y := $(SVGA_MODE) quiet_cmd_image = BUILD $@ +silent_redirect_image = >/dev/null cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \ - $(obj)/zoffset.h $@ + $(obj)/zoffset.h $@ $($(quiet)redirect_image) $(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE $(call if_changed,image) - @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' + @$(kecho) 'Kernel: $@ is ready' ' (#'`cat .version`')' OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 00241c815524..a0838ab929f2 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -411,3 +411,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, debug_putstr("done.\nBooting the kernel.\n"); return output; } + +void fortify_panic(const char *name) +{ + error("detected buffer overflow"); +} diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 6cf79e1a6830..0eb9f92f3717 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1,5 +1,4 @@ # CONFIG_64BIT is not set -CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y @@ -125,7 +124,6 @@ CONFIG_NF_CONNTRACK_IPV4=y CONFIG_IP_NF_IPTABLES=y CONFIG_IP_NF_FILTER=y CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_IP_NF_TARGET_ULOG=y CONFIG_NF_NAT=y CONFIG_IP_NF_TARGET_MASQUERADE=y CONFIG_IP_NF_MANGLE=y @@ -255,7 +253,6 @@ CONFIG_USB_OHCI_HCD=y CONFIG_USB_UHCI_HCD=y CONFIG_USB_PRINTER=y CONFIG_USB_STORAGE=y -CONFIG_USB_LIBUSUAL=y CONFIG_EDAC=y CONFIG_RTC_CLASS=y # CONFIG_RTC_HCTOSYS is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index de45f57b410d..4a4b16e56d35 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1,4 +1,3 @@ -CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y @@ -124,7 +123,6 @@ CONFIG_NF_CONNTRACK_IPV4=y CONFIG_IP_NF_IPTABLES=y CONFIG_IP_NF_FILTER=y CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_IP_NF_TARGET_ULOG=y CONFIG_NF_NAT=y CONFIG_IP_NF_TARGET_MASQUERADE=y CONFIG_IP_NF_MANGLE=y @@ -251,7 +249,6 @@ CONFIG_USB_OHCI_HCD=y CONFIG_USB_UHCI_HCD=y CONFIG_USB_PRINTER=y CONFIG_USB_STORAGE=y -CONFIG_USB_LIBUSUAL=y CONFIG_EDAC=y CONFIG_RTC_CLASS=y # CONFIG_RTC_HCTOSYS is not set diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S index 910565547163..8739cf7795de 100644 --- a/arch/x86/crypto/aes-x86_64-asm_64.S +++ b/arch/x86/crypto/aes-x86_64-asm_64.S @@ -42,17 +42,15 @@ #define R5E %esi #define R6 %rdi #define R6E %edi -#define R7 %rbp -#define R7E %ebp +#define R7 %r9 /* don't use %rbp; it breaks stack traces */ +#define R7E %r9d #define R8 %r8 -#define R9 %r9 #define R10 %r10 #define R11 %r11 -#define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \ +#define prologue(FUNC,KEY,B128,B192,r1,r2,r5,r6,r7,r8,r9,r10,r11) \ ENTRY(FUNC); \ movq r1,r2; \ - movq r3,r4; \ leaq KEY+48(r8),r9; \ movq r10,r11; \ movl (r7),r5 ## E; \ @@ -70,9 +68,8 @@ je B192; \ leaq 32(r9),r9; -#define epilogue(FUNC,r1,r2,r3,r4,r5,r6,r7,r8,r9) \ +#define epilogue(FUNC,r1,r2,r5,r6,r7,r8,r9) \ movq r1,r2; \ - movq r3,r4; \ movl r5 ## E,(r9); \ movl r6 ## E,4(r9); \ movl r7 ## E,8(r9); \ @@ -88,12 +85,12 @@ movl TAB(,r6,4),r6 ## E; \ roll $16,r2 ## E; \ shrl $16,r4 ## E; \ - movzbl r4 ## H,r7 ## E; \ - movzbl r4 ## L,r4 ## E; \ + movzbl r4 ## L,r7 ## E; \ + movzbl r4 ## H,r4 ## E; \ xorl OFFSET(r8),ra ## E; \ xorl OFFSET+4(r8),rb ## E; \ - xorl TAB+3072(,r7,4),r5 ## E;\ - xorl TAB+2048(,r4,4),r6 ## E;\ + xorl TAB+3072(,r4,4),r5 ## E;\ + xorl TAB+2048(,r7,4),r6 ## E;\ movzbl r1 ## L,r7 ## E; \ movzbl r1 ## H,r4 ## E; \ movl TAB+1024(,r4,4),r4 ## E;\ @@ -101,19 +98,19 @@ roll $16,r1 ## E; \ shrl $16,r3 ## E; \ xorl TAB(,r7,4),r5 ## E; \ - movzbl r3 ## H,r7 ## E; \ - movzbl r3 ## L,r3 ## E; \ - xorl TAB+3072(,r7,4),r4 ## E;\ - xorl TAB+2048(,r3,4),r5 ## E;\ - movzbl r1 ## H,r7 ## E; \ - movzbl r1 ## L,r3 ## E; \ + movzbl r3 ## L,r7 ## E; \ + movzbl r3 ## H,r3 ## E; \ + xorl TAB+3072(,r3,4),r4 ## E;\ + xorl TAB+2048(,r7,4),r5 ## E;\ + movzbl r1 ## L,r7 ## E; \ + movzbl r1 ## H,r3 ## E; \ shrl $16,r1 ## E; \ - xorl TAB+3072(,r7,4),r6 ## E;\ - movl TAB+2048(,r3,4),r3 ## E;\ - movzbl r1 ## H,r7 ## E; \ - movzbl r1 ## L,r1 ## E; \ - xorl TAB+1024(,r7,4),r6 ## E;\ - xorl TAB(,r1,4),r3 ## E; \ + xorl TAB+3072(,r3,4),r6 ## E;\ + movl TAB+2048(,r7,4),r3 ## E;\ + movzbl r1 ## L,r7 ## E; \ + movzbl r1 ## H,r1 ## E; \ + xorl TAB+1024(,r1,4),r6 ## E;\ + xorl TAB(,r7,4),r3 ## E; \ movzbl r2 ## H,r1 ## E; \ movzbl r2 ## L,r7 ## E; \ shrl $16,r2 ## E; \ @@ -131,9 +128,9 @@ movl r4 ## E,r2 ## E; #define entry(FUNC,KEY,B128,B192) \ - prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11) + prologue(FUNC,KEY,B128,B192,R2,R8,R1,R3,R4,R6,R10,R5,R11) -#define return(FUNC) epilogue(FUNC,R8,R2,R9,R7,R5,R6,R3,R4,R11) +#define return(FUNC) epilogue(FUNC,R8,R2,R5,R6,R3,R4,R11) #define encrypt_round(TAB,OFFSET) \ round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \ diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 3c465184ff8a..16627fec80b2 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -89,6 +89,29 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 ALL_F: .octa 0xffffffffffffffffffffffffffffffff .octa 0x00000000000000000000000000000000 +.section .rodata +.align 16 +.type aad_shift_arr, @object +.size aad_shift_arr, 272 +aad_shift_arr: + .octa 0xffffffffffffffffffffffffffffffff + .octa 0xffffffffffffffffffffffffffffff0C + .octa 0xffffffffffffffffffffffffffff0D0C + .octa 0xffffffffffffffffffffffffff0E0D0C + .octa 0xffffffffffffffffffffffff0F0E0D0C + .octa 0xffffffffffffffffffffff0C0B0A0908 + .octa 0xffffffffffffffffffff0D0C0B0A0908 + .octa 0xffffffffffffffffff0E0D0C0B0A0908 + .octa 0xffffffffffffffff0F0E0D0C0B0A0908 + .octa 0xffffffffffffff0C0B0A090807060504 + .octa 0xffffffffffff0D0C0B0A090807060504 + .octa 0xffffffffff0E0D0C0B0A090807060504 + .octa 0xffffffff0F0E0D0C0B0A090807060504 + .octa 0xffffff0C0B0A09080706050403020100 + .octa 0xffff0D0C0B0A09080706050403020100 + .octa 0xff0E0D0C0B0A09080706050403020100 + .octa 0x0F0E0D0C0B0A09080706050403020100 + .text @@ -252,32 +275,66 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation mov arg8, %r12 # %r12 = aadLen mov %r12, %r11 pxor %xmm\i, %xmm\i + pxor \XMM2, \XMM2 -_get_AAD_loop\num_initial_blocks\operation: - movd (%r10), \TMP1 - pslldq $12, \TMP1 - psrldq $4, %xmm\i + cmp $16, %r11 + jl _get_AAD_rest8\num_initial_blocks\operation +_get_AAD_blocks\num_initial_blocks\operation: + movdqu (%r10), %xmm\i + PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + pxor %xmm\i, \XMM2 + GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\num_initial_blocks\operation + + movdqu \XMM2, %xmm\i + cmp $0, %r11 + je _get_AAD_done\num_initial_blocks\operation + + pxor %xmm\i,%xmm\i + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some CT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\num_initial_blocks\operation: + cmp $4, %r11 + jle _get_AAD_rest4\num_initial_blocks\operation + movq (%r10), \TMP1 + add $8, %r10 + sub $8, %r11 + pslldq $8, \TMP1 + psrldq $8, %xmm\i pxor \TMP1, %xmm\i + jmp _get_AAD_rest8\num_initial_blocks\operation +_get_AAD_rest4\num_initial_blocks\operation: + cmp $0, %r11 + jle _get_AAD_rest0\num_initial_blocks\operation + mov (%r10), %eax + movq %rax, \TMP1 add $4, %r10 - sub $4, %r12 - jne _get_AAD_loop\num_initial_blocks\operation - - cmp $16, %r11 - je _get_AAD_loop2_done\num_initial_blocks\operation - - mov $16, %r12 -_get_AAD_loop2\num_initial_blocks\operation: + sub $4, %r10 + pslldq $12, \TMP1 psrldq $4, %xmm\i - sub $4, %r12 - cmp %r11, %r12 - jne _get_AAD_loop2\num_initial_blocks\operation - -_get_AAD_loop2_done\num_initial_blocks\operation: + pxor \TMP1, %xmm\i +_get_AAD_rest0\num_initial_blocks\operation: + /* finalize: shift out the extra bytes we read, and align + left. since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \TMP1 + PSHUFB_XMM \TMP1, %xmm\i +_get_AAD_rest_final\num_initial_blocks\operation: PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + pxor \XMM2, %xmm\i + GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +_get_AAD_done\num_initial_blocks\operation: xor %r11, %r11 # initialise the data pointer offset as zero - - # start AES for num_initial_blocks blocks + # start AES for num_initial_blocks blocks mov %arg5, %rax # %rax = *Y0 movdqu (%rax), \XMM0 # XMM0 = Y0 @@ -322,7 +379,7 @@ aes_loop_initial_dec\num_initial_blocks: # prepare plaintext/ciphertext for GHASH computation .endr .endif - GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + # apply GHASH on num_initial_blocks blocks .if \i == 5 @@ -477,28 +534,66 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation mov arg8, %r12 # %r12 = aadLen mov %r12, %r11 pxor %xmm\i, %xmm\i -_get_AAD_loop\num_initial_blocks\operation: - movd (%r10), \TMP1 - pslldq $12, \TMP1 - psrldq $4, %xmm\i + pxor \XMM2, \XMM2 + + cmp $16, %r11 + jl _get_AAD_rest8\num_initial_blocks\operation +_get_AAD_blocks\num_initial_blocks\operation: + movdqu (%r10), %xmm\i + PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + pxor %xmm\i, \XMM2 + GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\num_initial_blocks\operation + + movdqu \XMM2, %xmm\i + cmp $0, %r11 + je _get_AAD_done\num_initial_blocks\operation + + pxor %xmm\i,%xmm\i + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some PT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\num_initial_blocks\operation: + cmp $4, %r11 + jle _get_AAD_rest4\num_initial_blocks\operation + movq (%r10), \TMP1 + add $8, %r10 + sub $8, %r11 + pslldq $8, \TMP1 + psrldq $8, %xmm\i pxor \TMP1, %xmm\i + jmp _get_AAD_rest8\num_initial_blocks\operation +_get_AAD_rest4\num_initial_blocks\operation: + cmp $0, %r11 + jle _get_AAD_rest0\num_initial_blocks\operation + mov (%r10), %eax + movq %rax, \TMP1 add $4, %r10 - sub $4, %r12 - jne _get_AAD_loop\num_initial_blocks\operation - cmp $16, %r11 - je _get_AAD_loop2_done\num_initial_blocks\operation - mov $16, %r12 -_get_AAD_loop2\num_initial_blocks\operation: + sub $4, %r10 + pslldq $12, \TMP1 psrldq $4, %xmm\i - sub $4, %r12 - cmp %r11, %r12 - jne _get_AAD_loop2\num_initial_blocks\operation -_get_AAD_loop2_done\num_initial_blocks\operation: + pxor \TMP1, %xmm\i +_get_AAD_rest0\num_initial_blocks\operation: + /* finalize: shift out the extra bytes we read, and align + left. since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \TMP1 + PSHUFB_XMM \TMP1, %xmm\i +_get_AAD_rest_final\num_initial_blocks\operation: PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + pxor \XMM2, %xmm\i + GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +_get_AAD_done\num_initial_blocks\operation: xor %r11, %r11 # initialise the data pointer offset as zero - - # start AES for num_initial_blocks blocks + # start AES for num_initial_blocks blocks mov %arg5, %rax # %rax = *Y0 movdqu (%rax), \XMM0 # XMM0 = Y0 @@ -543,7 +638,7 @@ aes_loop_initial_enc\num_initial_blocks: # prepare plaintext/ciphertext for GHASH computation .endr .endif - GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + # apply GHASH on num_initial_blocks blocks .if \i == 5 @@ -1454,18 +1549,35 @@ _return_T_decrypt: mov arg10, %r11 # %r11 = auth_tag_len cmp $16, %r11 je _T_16_decrypt - cmp $12, %r11 - je _T_12_decrypt + cmp $8, %r11 + jl _T_4_decrypt _T_8_decrypt: MOVQ_R64_XMM %xmm0, %rax mov %rax, (%r10) - jmp _return_T_done_decrypt -_T_12_decrypt: - MOVQ_R64_XMM %xmm0, %rax - mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 psrldq $8, %xmm0 + cmp $0, %r11 + je _return_T_done_decrypt +_T_4_decrypt: + movd %xmm0, %eax + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + psrldq $4, %xmm0 + cmp $0, %r11 + je _return_T_done_decrypt +_T_123_decrypt: movd %xmm0, %eax - mov %eax, 8(%r10) + cmp $2, %r11 + jl _T_1_decrypt + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done_decrypt + add $2, %r10 + sar $16, %eax +_T_1_decrypt: + mov %al, (%r10) jmp _return_T_done_decrypt _T_16_decrypt: movdqu %xmm0, (%r10) @@ -1718,18 +1830,35 @@ _return_T_encrypt: mov arg10, %r11 # %r11 = auth_tag_len cmp $16, %r11 je _T_16_encrypt - cmp $12, %r11 - je _T_12_encrypt + cmp $8, %r11 + jl _T_4_encrypt _T_8_encrypt: MOVQ_R64_XMM %xmm0, %rax mov %rax, (%r10) - jmp _return_T_done_encrypt -_T_12_encrypt: - MOVQ_R64_XMM %xmm0, %rax - mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 psrldq $8, %xmm0 + cmp $0, %r11 + je _return_T_done_encrypt +_T_4_encrypt: + movd %xmm0, %eax + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + psrldq $4, %xmm0 + cmp $0, %r11 + je _return_T_done_encrypt +_T_123_encrypt: movd %xmm0, %eax - mov %eax, 8(%r10) + cmp $2, %r11 + jl _T_1_encrypt + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done_encrypt + add $2, %r10 + sar $16, %eax +_T_1_encrypt: + mov %al, (%r10) jmp _return_T_done_encrypt _T_16_encrypt: movdqu %xmm0, (%r10) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index d664382c6e56..faecb1518bf8 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -155,6 +155,30 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 ALL_F: .octa 0xffffffffffffffffffffffffffffffff .octa 0x00000000000000000000000000000000 +.section .rodata +.align 16 +.type aad_shift_arr, @object +.size aad_shift_arr, 272 +aad_shift_arr: + .octa 0xffffffffffffffffffffffffffffffff + .octa 0xffffffffffffffffffffffffffffff0C + .octa 0xffffffffffffffffffffffffffff0D0C + .octa 0xffffffffffffffffffffffffff0E0D0C + .octa 0xffffffffffffffffffffffff0F0E0D0C + .octa 0xffffffffffffffffffffff0C0B0A0908 + .octa 0xffffffffffffffffffff0D0C0B0A0908 + .octa 0xffffffffffffffffff0E0D0C0B0A0908 + .octa 0xffffffffffffffff0F0E0D0C0B0A0908 + .octa 0xffffffffffffff0C0B0A090807060504 + .octa 0xffffffffffff0D0C0B0A090807060504 + .octa 0xffffffffff0E0D0C0B0A090807060504 + .octa 0xffffffff0F0E0D0C0B0A090807060504 + .octa 0xffffff0C0B0A09080706050403020100 + .octa 0xffff0D0C0B0A09080706050403020100 + .octa 0xff0E0D0C0B0A09080706050403020100 + .octa 0x0F0E0D0C0B0A09080706050403020100 + + .text @@ -372,41 +396,72 @@ VARIABLE_OFFSET = 16*8 .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC i = (8-\num_initial_blocks) + j = 0 setreg - mov arg6, %r10 # r10 = AAD - mov arg7, %r12 # r12 = aadLen - - - mov %r12, %r11 - - vpxor reg_i, reg_i, reg_i -_get_AAD_loop\@: - vmovd (%r10), \T1 - vpslldq $12, \T1, \T1 - vpsrldq $4, reg_i, reg_i - vpxor \T1, reg_i, reg_i - - add $4, %r10 - sub $4, %r12 - jg _get_AAD_loop\@ - - - cmp $16, %r11 - je _get_AAD_loop2_done\@ - mov $16, %r12 - -_get_AAD_loop2\@: - vpsrldq $4, reg_i, reg_i - sub $4, %r12 - cmp %r11, %r12 - jg _get_AAD_loop2\@ - -_get_AAD_loop2_done\@: - - #byte-reflect the AAD data - vpshufb SHUF_MASK(%rip), reg_i, reg_i - + mov arg6, %r10 # r10 = AAD + mov arg7, %r12 # r12 = aadLen + + + mov %r12, %r11 + + vpxor reg_j, reg_j, reg_j + vpxor reg_i, reg_i, reg_i + cmp $16, %r11 + jl _get_AAD_rest8\@ +_get_AAD_blocks\@: + vmovdqu (%r10), reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\@ + vmovdqu reg_j, reg_i + cmp $0, %r11 + je _get_AAD_done\@ + + vpxor reg_i, reg_i, reg_i + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some CT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\@: + cmp $4, %r11 + jle _get_AAD_rest4\@ + movq (%r10), \T1 + add $8, %r10 + sub $8, %r11 + vpslldq $8, \T1, \T1 + vpsrldq $8, reg_i, reg_i + vpxor \T1, reg_i, reg_i + jmp _get_AAD_rest8\@ +_get_AAD_rest4\@: + cmp $0, %r11 + jle _get_AAD_rest0\@ + mov (%r10), %eax + movq %rax, \T1 + add $4, %r10 + sub $4, %r11 + vpslldq $12, \T1, \T1 + vpsrldq $4, reg_i, reg_i + vpxor \T1, reg_i, reg_i +_get_AAD_rest0\@: + /* finalize: shift out the extra bytes we read, and align + left. since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \T1 + vpshufb \T1, reg_i, reg_i +_get_AAD_rest_final\@: + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_j, reg_i, reg_i + GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 + +_get_AAD_done\@: # initialize the data pointer offset as zero xor %r11, %r11 @@ -480,7 +535,6 @@ _get_AAD_loop2_done\@: i = (8-\num_initial_blocks) j = (9-\num_initial_blocks) setreg - GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 .rep \num_initial_blocks vpxor reg_i, reg_j, reg_j @@ -1427,19 +1481,36 @@ _return_T\@: cmp $16, %r11 je _T_16\@ - cmp $12, %r11 - je _T_12\@ + cmp $8, %r11 + jl _T_4\@ _T_8\@: vmovq %xmm9, %rax mov %rax, (%r10) - jmp _return_T_done\@ -_T_12\@: - vmovq %xmm9, %rax - mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 vpsrldq $8, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_4\@: vmovd %xmm9, %eax - mov %eax, 8(%r10) + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + vpsrldq $4, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_123\@: + vmovd %xmm9, %eax + cmp $2, %r11 + jl _T_1\@ + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done\@ + add $2, %r10 + sar $16, %eax +_T_1\@: + mov %al, (%r10) jmp _return_T_done\@ _T_16\@: @@ -1631,41 +1702,73 @@ ENDPROC(aesni_gcm_dec_avx_gen2) .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER i = (8-\num_initial_blocks) + j = 0 setreg - mov arg6, %r10 # r10 = AAD - mov arg7, %r12 # r12 = aadLen - - - mov %r12, %r11 - - vpxor reg_i, reg_i, reg_i -_get_AAD_loop\@: - vmovd (%r10), \T1 - vpslldq $12, \T1, \T1 - vpsrldq $4, reg_i, reg_i - vpxor \T1, reg_i, reg_i - - add $4, %r10 - sub $4, %r12 - jg _get_AAD_loop\@ - - - cmp $16, %r11 - je _get_AAD_loop2_done\@ - mov $16, %r12 - -_get_AAD_loop2\@: - vpsrldq $4, reg_i, reg_i - sub $4, %r12 - cmp %r11, %r12 - jg _get_AAD_loop2\@ - -_get_AAD_loop2_done\@: - - #byte-reflect the AAD data - vpshufb SHUF_MASK(%rip), reg_i, reg_i - + mov arg6, %r10 # r10 = AAD + mov arg7, %r12 # r12 = aadLen + + + mov %r12, %r11 + + vpxor reg_j, reg_j, reg_j + vpxor reg_i, reg_i, reg_i + + cmp $16, %r11 + jl _get_AAD_rest8\@ +_get_AAD_blocks\@: + vmovdqu (%r10), reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\@ + vmovdqu reg_j, reg_i + cmp $0, %r11 + je _get_AAD_done\@ + + vpxor reg_i, reg_i, reg_i + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some CT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\@: + cmp $4, %r11 + jle _get_AAD_rest4\@ + movq (%r10), \T1 + add $8, %r10 + sub $8, %r11 + vpslldq $8, \T1, \T1 + vpsrldq $8, reg_i, reg_i + vpxor \T1, reg_i, reg_i + jmp _get_AAD_rest8\@ +_get_AAD_rest4\@: + cmp $0, %r11 + jle _get_AAD_rest0\@ + mov (%r10), %eax + movq %rax, \T1 + add $4, %r10 + sub $4, %r11 + vpslldq $12, \T1, \T1 + vpsrldq $4, reg_i, reg_i + vpxor \T1, reg_i, reg_i +_get_AAD_rest0\@: + /* finalize: shift out the extra bytes we read, and align + left. since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \T1 + vpshufb \T1, reg_i, reg_i +_get_AAD_rest_final\@: + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_j, reg_i, reg_i + GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 + +_get_AAD_done\@: # initialize the data pointer offset as zero xor %r11, %r11 @@ -1740,7 +1843,6 @@ _get_AAD_loop2_done\@: i = (8-\num_initial_blocks) j = (9-\num_initial_blocks) setreg - GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 .rep \num_initial_blocks vpxor reg_i, reg_j, reg_j @@ -2702,19 +2804,36 @@ _return_T\@: cmp $16, %r11 je _T_16\@ - cmp $12, %r11 - je _T_12\@ + cmp $8, %r11 + jl _T_4\@ _T_8\@: vmovq %xmm9, %rax mov %rax, (%r10) - jmp _return_T_done\@ -_T_12\@: - vmovq %xmm9, %rax - mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 vpsrldq $8, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_4\@: vmovd %xmm9, %eax - mov %eax, 8(%r10) + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + vpsrldq $4, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_123\@: + vmovd %xmm9, %eax + cmp $2, %r11 + jl _T_1\@ + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done\@ + add $2, %r10 + sar $16, %eax +_T_1\@: + mov %al, (%r10) jmp _return_T_done\@ _T_16\@: diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 93de8ea51548..4a55cdcdc008 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -61,6 +61,11 @@ struct aesni_rfc4106_gcm_ctx { u8 nonce[4]; }; +struct generic_gcmaes_ctx { + u8 hash_subkey[16] AESNI_ALIGN_ATTR; + struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; +}; + struct aesni_xts_ctx { u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR; u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR; @@ -102,13 +107,11 @@ asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out, * u8 *out, Ciphertext output. Encrypt in-place is allowed. * const u8 *in, Plaintext input * unsigned long plaintext_len, Length of data in bytes for encryption. - * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association) - * concatenated with 8 byte Initialisation Vector (from IPSec ESP - * Payload) concatenated with 0x00000001. 16-byte aligned pointer. + * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001. + * 16-byte aligned pointer. * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. * const u8 *aad, Additional Authentication Data (AAD) - * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this - * is going to be 8 or 12 bytes + * unsigned long aad_len, Length of AAD in bytes. * u8 *auth_tag, Authenticated Tag output. * unsigned long auth_tag_len), Authenticated Tag Length in bytes. * Valid values are 16 (most likely), 12 or 8. @@ -123,9 +126,8 @@ asmlinkage void aesni_gcm_enc(void *ctx, u8 *out, * u8 *out, Plaintext output. Decrypt in-place is allowed. * const u8 *in, Ciphertext input * unsigned long ciphertext_len, Length of data in bytes for decryption. - * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association) - * concatenated with 8 byte Initialisation Vector (from IPSec ESP - * Payload) concatenated with 0x00000001. 16-byte aligned pointer. + * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001. + * 16-byte aligned pointer. * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. * const u8 *aad, Additional Authentication Data (AAD) * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going @@ -275,6 +277,16 @@ aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) align = 1; return PTR_ALIGN(crypto_aead_ctx(tfm), align); } + +static inline struct +generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm) +{ + unsigned long align = AESNI_ALIGN; + + if (align <= crypto_tfm_ctx_alignment()) + align = 1; + return PTR_ALIGN(crypto_aead_ctx(tfm), align); +} #endif static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) @@ -712,32 +724,34 @@ static int rfc4106_set_authsize(struct crypto_aead *parent, return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); } -static int helper_rfc4106_encrypt(struct aead_request *req) +static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + switch (authsize) { + case 4: + case 8: + case 12: + case 13: + case 14: + case 15: + case 16: + break; + default: + return -EINVAL; + } + + return 0; +} + +static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, + u8 *hash_subkey, u8 *iv, void *aes_ctx) { u8 one_entry_in_sg = 0; u8 *src, *dst, *assoc; - __be32 counter = cpu_to_be32(1); struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); struct scatter_walk src_sg_walk; struct scatter_walk dst_sg_walk = {}; - unsigned int i; - - /* Assuming we are supporting rfc4106 64-bit extended */ - /* sequence numbers We need to have the AAD length equal */ - /* to 16 or 20 bytes */ - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) - return -EINVAL; - - /* IV below built */ - for (i = 0; i < 4; i++) - *(iv+i) = ctx->nonce[i]; - for (i = 0; i < 8; i++) - *(iv+4+i) = req->iv[i]; - *((__be32 *)(iv+12)) = counter; if (sg_is_last(req->src) && (!PageHighMem(sg_page(req->src)) || @@ -768,7 +782,7 @@ static int helper_rfc4106_encrypt(struct aead_request *req) kernel_fpu_begin(); aesni_gcm_enc_tfm(aes_ctx, dst, src, req->cryptlen, iv, - ctx->hash_subkey, assoc, req->assoclen - 8, + hash_subkey, assoc, assoclen, dst + req->cryptlen, auth_tag_len); kernel_fpu_end(); @@ -791,37 +805,20 @@ static int helper_rfc4106_encrypt(struct aead_request *req) return 0; } -static int helper_rfc4106_decrypt(struct aead_request *req) +static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, + u8 *hash_subkey, u8 *iv, void *aes_ctx) { u8 one_entry_in_sg = 0; u8 *src, *dst, *assoc; unsigned long tempCipherLen = 0; - __be32 counter = cpu_to_be32(1); - int retval = 0; struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); u8 authTag[16]; struct scatter_walk src_sg_walk; struct scatter_walk dst_sg_walk = {}; - unsigned int i; - - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) - return -EINVAL; - - /* Assuming we are supporting rfc4106 64-bit extended */ - /* sequence numbers We need to have the AAD length */ - /* equal to 16 or 20 bytes */ + int retval = 0; tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len); - /* IV below built */ - for (i = 0; i < 4; i++) - *(iv+i) = ctx->nonce[i]; - for (i = 0; i < 8; i++) - *(iv+4+i) = req->iv[i]; - *((__be32 *)(iv+12)) = counter; if (sg_is_last(req->src) && (!PageHighMem(sg_page(req->src)) || @@ -838,7 +835,6 @@ static int helper_rfc4106_decrypt(struct aead_request *req) scatterwalk_start(&dst_sg_walk, req->dst); dst = scatterwalk_map(&dst_sg_walk) + req->assoclen; } - } else { /* Allocate memory for src, dst, assoc */ assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); @@ -850,9 +846,10 @@ static int helper_rfc4106_decrypt(struct aead_request *req) dst = src; } + kernel_fpu_begin(); aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv, - ctx->hash_subkey, assoc, req->assoclen - 8, + hash_subkey, assoc, assoclen, authTag, auth_tag_len); kernel_fpu_end(); @@ -875,6 +872,60 @@ static int helper_rfc4106_decrypt(struct aead_request *req) kfree(assoc); } return retval; + +} + +static int helper_rfc4106_encrypt(struct aead_request *req) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + unsigned int i; + __be32 counter = cpu_to_be32(1); + + /* Assuming we are supporting rfc4106 64-bit extended */ + /* sequence numbers We need to have the AAD length equal */ + /* to 16 or 20 bytes */ + if (unlikely(req->assoclen != 16 && req->assoclen != 20)) + return -EINVAL; + + /* IV below built */ + for (i = 0; i < 4; i++) + *(iv+i) = ctx->nonce[i]; + for (i = 0; i < 8; i++) + *(iv+4+i) = req->iv[i]; + *((__be32 *)(iv+12)) = counter; + + return gcmaes_encrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, + aes_ctx); +} + +static int helper_rfc4106_decrypt(struct aead_request *req) +{ + __be32 counter = cpu_to_be32(1); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + unsigned int i; + + if (unlikely(req->assoclen != 16 && req->assoclen != 20)) + return -EINVAL; + + /* Assuming we are supporting rfc4106 64-bit extended */ + /* sequence numbers We need to have the AAD length */ + /* equal to 16 or 20 bytes */ + + /* IV below built */ + for (i = 0; i < 4; i++) + *(iv+i) = ctx->nonce[i]; + for (i = 0; i < 8; i++) + *(iv+4+i) = req->iv[i]; + *((__be32 *)(iv+12)) = counter; + + return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, + aes_ctx); } static int rfc4106_encrypt(struct aead_request *req) @@ -1035,6 +1086,46 @@ struct { }; #ifdef CONFIG_X86_64 +static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, + unsigned int key_len) +{ + struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead); + + return aes_set_key_common(crypto_aead_tfm(aead), + &ctx->aes_key_expanded, key, key_len) ?: + rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); +} + +static int generic_gcmaes_encrypt(struct aead_request *req) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + __be32 counter = cpu_to_be32(1); + + memcpy(iv, req->iv, 12); + *((__be32 *)(iv+12)) = counter; + + return gcmaes_encrypt(req, req->assoclen, ctx->hash_subkey, iv, + aes_ctx); +} + +static int generic_gcmaes_decrypt(struct aead_request *req) +{ + __be32 counter = cpu_to_be32(1); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + + memcpy(iv, req->iv, 12); + *((__be32 *)(iv+12)) = counter; + + return gcmaes_decrypt(req, req->assoclen, ctx->hash_subkey, iv, + aes_ctx); +} + static struct aead_alg aesni_aead_algs[] = { { .setkey = common_rfc4106_set_key, .setauthsize = common_rfc4106_set_authsize, @@ -1069,6 +1160,23 @@ static struct aead_alg aesni_aead_algs[] = { { .cra_ctxsize = sizeof(struct cryptd_aead *), .cra_module = THIS_MODULE, }, +}, { + .setkey = generic_gcmaes_set_key, + .setauthsize = generic_gcmaes_set_authsize, + .encrypt = generic_gcmaes_encrypt, + .decrypt = generic_gcmaes_decrypt, + .ivsize = 12, + .maxauthsize = 16, + .base = { + .cra_name = "gcm(aes)", + .cra_driver_name = "generic-gcm-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct generic_gcmaes_ctx), + .cra_alignmask = AESNI_ALIGN - 1, + .cra_module = THIS_MODULE, + }, } }; #else static struct aead_alg aesni_aead_algs[0]; diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 24ac9fad832d..d61e57960fe0 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -176,9 +176,6 @@ __glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, src -= 1; dst -= 1; } while (nbytes >= func_bytes); - - if (nbytes < bsize) - goto done; } } diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index fc61739150e7..f960a043cdeb 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -201,7 +201,7 @@ asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, static bool avx2_usable(void) { - if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) + if (false && avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI1) && boot_cpu_has(X86_FEATURE_BMI2)) return true; diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c index 2dd3674b5a1e..458409b7568d 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb.c +++ b/arch/x86/crypto/sha512-mb/sha512_mb.c @@ -269,19 +269,19 @@ static struct sha512_hash_ctx * LAST */ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; - return ctx; + goto unlock; } if (ctx->status & HASH_CTX_STS_PROCESSING) { /* Cannot submit to a currently processing job. */ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; - return ctx; + goto unlock; } if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { /* Cannot update a finished job. */ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; - return ctx; + goto unlock; } @@ -363,6 +363,7 @@ static struct sha512_hash_ctx } ctx = sha512_ctx_mgr_resubmit(mgr, ctx); +unlock: spin_unlock_irqrestore(&cstate->work_lock, irqflags); return ctx; } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 139ad7726e10..726355ce8497 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -78,9 +78,6 @@ static int vdso_mremap(const struct vm_special_mapping *sm, if (image->size != new_size) return -EINVAL; - if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) - return -EFAULT; - vdso_fix_landing(image, new_vma); current->mm->context.vdso = (void __user *)new_vma->vm_start; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index ff1ea2fb9705..8e3db8f642a7 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -191,8 +191,8 @@ static void release_pmc_hardware(void) {} static bool check_hw_exists(void) { - u64 val, val_fail, val_new= ~0; - int i, reg, reg_fail, ret = 0; + u64 val, val_fail = -1, val_new= ~0; + int i, reg, reg_fail = -1, ret = 0; int bios_fail = 0; int reg_safe = -1; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index ede97710c2f4..98b0f0729527 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3397,10 +3397,8 @@ static void intel_pmu_cpu_dying(int cpu) static void intel_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) { - if (x86_pmu.pebs_active) - intel_pmu_pebs_sched_task(ctx, sched_in); - if (x86_pmu.lbr_nr) - intel_pmu_lbr_sched_task(ctx, sched_in); + intel_pmu_pebs_sched_task(ctx, sched_in); + intel_pmu_lbr_sched_task(ctx, sched_in); } PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 6dc8a59e1bfb..a322fed5f8ed 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -606,12 +606,6 @@ static inline void intel_pmu_drain_pebs_buffer(void) x86_pmu.drain_pebs(®s); } -void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in) -{ - if (!sched_in) - intel_pmu_drain_pebs_buffer(); -} - /* * PEBS */ @@ -822,6 +816,14 @@ static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc) return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs); } +void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!sched_in && pebs_needs_sched_cb(cpuc)) + intel_pmu_drain_pebs_buffer(); +} + static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) { struct debug_store *ds = cpuc->ds; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 0edda489cf36..8a6bbacd17dc 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -383,8 +383,12 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx; + if (!cpuc->lbr_users) + return; + /* * If LBR callstack feature is enabled and the stack was saved when * the task was scheduled out, restore the stack. Otherwise flush diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index dae2fedc1601..4f9127644b80 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -316,7 +316,7 @@ #define SKX_UPI_PCI_PMON_CTL0 0x350 #define SKX_UPI_PCI_PMON_CTR0 0x318 #define SKX_UPI_PCI_PMON_BOX_CTL 0x378 -#define SKX_PMON_CTL_UMASK_EXT 0xff +#define SKX_UPI_CTL_UMASK_EXT 0xffefff /* SKX M2M */ #define SKX_M2M_PCI_PMON_CTL0 0x228 @@ -328,7 +328,7 @@ DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-39"); +DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-43,45-55"); DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); @@ -351,7 +351,6 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5"); DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8"); DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12"); -DEFINE_UNCORE_FORMAT_ATTR(filter_link4, filter_link, "config1:9-12"); DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47"); DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); @@ -3302,7 +3301,6 @@ static struct attribute *skx_uncore_cha_formats_attr[] = { &format_attr_inv.attr, &format_attr_thresh8.attr, &format_attr_filter_tid4.attr, - &format_attr_filter_link4.attr, &format_attr_filter_state5.attr, &format_attr_filter_rem.attr, &format_attr_filter_loc.attr, @@ -3312,7 +3310,6 @@ static struct attribute *skx_uncore_cha_formats_attr[] = { &format_attr_filter_opc_0.attr, &format_attr_filter_opc_1.attr, &format_attr_filter_nc.attr, - &format_attr_filter_c6.attr, &format_attr_filter_isoc.attr, NULL, }; @@ -3333,8 +3330,11 @@ static struct extra_reg skx_uncore_cha_extra_regs[] = { SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x8134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x3134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x9134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x8), + EVENT_EXTRA_END }; static u64 skx_cha_filter_mask(int fields) @@ -3347,6 +3347,17 @@ static u64 skx_cha_filter_mask(int fields) mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LINK; if (fields & 0x4) mask |= SKX_CHA_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x8) { + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_REM; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LOC; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_ALL_OPC; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NM; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NOT_NM; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_OPC0; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_OPC1; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NC; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_ISOC; + } return mask; } @@ -3492,6 +3503,26 @@ static struct intel_uncore_type skx_uncore_irp = { .format_group = &skx_uncore_format_group, }; +static struct attribute *skx_uncore_pcu_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge_det.attr, + &format_attr_filter_band0.attr, + &format_attr_filter_band1.attr, + &format_attr_filter_band2.attr, + &format_attr_filter_band3.attr, + NULL, +}; + +static struct attribute_group skx_uncore_pcu_format_group = { + .name = "format", + .attrs = skx_uncore_pcu_formats_attr, +}; + static struct intel_uncore_ops skx_uncore_pcu_ops = { IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), .hw_config = hswep_pcu_hw_config, @@ -3510,7 +3541,7 @@ static struct intel_uncore_type skx_uncore_pcu = { .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, .num_shared_regs = 1, .ops = &skx_uncore_pcu_ops, - .format_group = &snbep_uncore_pcu_format_group, + .format_group = &skx_uncore_pcu_format_group, }; static struct intel_uncore_type *skx_msr_uncores[] = { @@ -3603,8 +3634,8 @@ static struct intel_uncore_type skx_uncore_upi = { .perf_ctr_bits = 48, .perf_ctr = SKX_UPI_PCI_PMON_CTR0, .event_ctl = SKX_UPI_PCI_PMON_CTL0, - .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .event_mask_ext = SKX_PMON_CTL_UMASK_EXT, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .event_mask_ext = SKX_UPI_CTL_UMASK_EXT, .box_ctl = SKX_UPI_PCI_PMON_BOX_CTL, .ops = &skx_upi_uncore_pci_ops, .format_group = &skx_upi_uncore_format_group, diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 39e702d90cdb..aa6b2023d8f8 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -35,7 +35,7 @@ #define _BUG_FLAGS(ins, flags) \ do { \ asm volatile("1:\t" ins "\n" \ - ".pushsection __bug_table,\"a\"\n" \ + ".pushsection __bug_table,\"aw\"\n" \ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ "\t" __BUG_REL(%c0) "\t# bug_entry::file\n" \ "\t.word %c1" "\t# bug_entry::line\n" \ @@ -52,7 +52,7 @@ do { \ #define _BUG_FLAGS(ins, flags) \ do { \ asm volatile("1:\t" ins "\n" \ - ".pushsection __bug_table,\"a\"\n" \ + ".pushsection __bug_table,\"aw\"\n" \ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ "\t.word %c0" "\t# bug_entry::flags\n" \ "\t.org 2b+%c1\n" \ diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 24118c0b4640..5343c19814b3 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -116,7 +116,6 @@ struct compat_statfs { int f_spare[4]; }; -#define COMPAT_RLIM_OLD_INFINITY 0x7fffffff #define COMPAT_RLIM_INFINITY 0xffffffff typedef u32 compat_old_sigset_t; /* at least 32 bits */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2701e5f8145b..ca3c48c0872f 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -286,6 +286,7 @@ #define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ +#define X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE (15*32+15) /* Virtual VMLOAD VMSAVE */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 08a0838b83fb..398c79889f5c 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -19,8 +19,6 @@ # define ISA_DMA_BIT_MASK DMA_BIT_MASK(32) #endif -#define DMA_ERROR_CODE 0 - extern int iommu_merge; extern struct device x86_dma_fallback_dev; extern int panic_on_overflow; @@ -35,9 +33,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp); #define arch_dma_alloc_attrs arch_dma_alloc_attrs -#define HAVE_ARCH_DMA_SUPPORTED 1 -extern int dma_supported(struct device *hwdev, u64 mask); - extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, unsigned long attrs); diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index 8e0f8b85b209..a504adc661a4 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -4,6 +4,7 @@ #include <asm/e820/types.h> extern struct e820_table *e820_table; +extern struct e820_table *e820_table_kexec; extern struct e820_table *e820_table_firmware; extern unsigned long pci_mem_start; diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index d2ff779f347e..796ff6c1aa53 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -33,7 +33,7 @@ #ifdef CONFIG_X86_32 -extern unsigned long asmlinkage efi_call_phys(void *, ...); +extern asmlinkage unsigned long efi_call_phys(void *, ...); #define arch_efi_call_virt_setup() kernel_fpu_begin() #define arch_efi_call_virt_teardown() kernel_fpu_end() @@ -52,7 +52,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #define EFI_LOADER_SIGNATURE "EL64" -extern u64 asmlinkage efi_call(void *fp, ...); +extern asmlinkage u64 efi_call(void *fp, ...); #define efi_call_phys(f, args...) efi_call((f), args) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index e8ab9a46bc68..1c18d83d3f09 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -245,12 +245,13 @@ extern int force_personality32; #define CORE_DUMP_USE_REGSET #define ELF_EXEC_PAGESIZE 4096 -/* This is the location that an ET_DYN program is loaded if exec'ed. Typical - use of this is to invoke "./ld.so someprog" to test out a new version of - the loader. We need to make sure that it is out of the way of the program - that it will "exec", and that there is sufficient room for the brk. */ - -#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) +/* + * This is the base location for PIE (ET_DYN with INTERP) loads. On + * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ +#define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ + 0x100000000UL) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. This could be done in user space, diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 3a106165e03a..535af0f2d8ac 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -85,4 +85,8 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool gigantic_page_supported(void) { return true; } +#endif + #endif /* _ASM_X86_HUGETLB_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 7afb0e2f07f4..48febf07e828 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -328,13 +328,13 @@ static inline unsigned type in##bwl##_p(int port) \ static inline void outs##bwl(int port, const void *addr, unsigned long count) \ { \ asm volatile("rep; outs" #bwl \ - : "+S"(addr), "+c"(count) : "d"(port)); \ + : "+S"(addr), "+c"(count) : "d"(port) : "memory"); \ } \ \ static inline void ins##bwl(int port, void *addr, unsigned long count) \ { \ asm volatile("rep; ins" #bwl \ - : "+D"(addr), "+c"(count) : "d"(port)); \ + : "+D"(addr), "+c"(count) : "d"(port) : "memory"); \ } BUILDIO(b, b, char) diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 793869879464..fca144a104e4 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -6,6 +6,8 @@ extern int force_iommu, no_iommu; extern int iommu_detected; extern int iommu_pass_through; +int x86_dma_supported(struct device *dev, u64 mask); + /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 34b984c60790..6cf65437b5e5 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -52,10 +52,10 @@ typedef u8 kprobe_opcode_t; #define flush_insn_slot(p) do { } while (0) /* optinsn template addresses */ -extern __visible kprobe_opcode_t optprobe_template_entry; -extern __visible kprobe_opcode_t optprobe_template_val; -extern __visible kprobe_opcode_t optprobe_template_call; -extern __visible kprobe_opcode_t optprobe_template_end; +extern __visible kprobe_opcode_t optprobe_template_entry[]; +extern __visible kprobe_opcode_t optprobe_template_val[]; +extern __visible kprobe_opcode_t optprobe_template_call[]; +extern __visible kprobe_opcode_t optprobe_template_end[]; #define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) #define MAX_OPTINSN_SIZE \ (((unsigned long)&optprobe_template_end - \ diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 722d0e568863..fde36f189836 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -23,6 +23,7 @@ struct x86_exception { u16 error_code; bool nested_page_fault; u64 address; /* cr2 or nested page fault gpa */ + u8 async_page_fault; }; /* diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 695605eb1dfb..87ac4fba6d8e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -48,28 +48,31 @@ #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS /* x86-specific vcpu->requests bit members */ -#define KVM_REQ_MIGRATE_TIMER 8 -#define KVM_REQ_REPORT_TPR_ACCESS 9 -#define KVM_REQ_TRIPLE_FAULT 10 -#define KVM_REQ_MMU_SYNC 11 -#define KVM_REQ_CLOCK_UPDATE 12 -#define KVM_REQ_EVENT 14 -#define KVM_REQ_APF_HALT 15 -#define KVM_REQ_STEAL_UPDATE 16 -#define KVM_REQ_NMI 17 -#define KVM_REQ_PMU 18 -#define KVM_REQ_PMI 19 -#define KVM_REQ_SMI 20 -#define KVM_REQ_MASTERCLOCK_UPDATE 21 -#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_GLOBAL_CLOCK_UPDATE 24 -#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_HV_CRASH 26 -#define KVM_REQ_IOAPIC_EOI_EXIT 27 -#define KVM_REQ_HV_RESET 28 -#define KVM_REQ_HV_EXIT 29 -#define KVM_REQ_HV_STIMER 30 +#define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0) +#define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1) +#define KVM_REQ_TRIPLE_FAULT KVM_ARCH_REQ(2) +#define KVM_REQ_MMU_SYNC KVM_ARCH_REQ(3) +#define KVM_REQ_CLOCK_UPDATE KVM_ARCH_REQ(4) +#define KVM_REQ_EVENT KVM_ARCH_REQ(6) +#define KVM_REQ_APF_HALT KVM_ARCH_REQ(7) +#define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(8) +#define KVM_REQ_NMI KVM_ARCH_REQ(9) +#define KVM_REQ_PMU KVM_ARCH_REQ(10) +#define KVM_REQ_PMI KVM_ARCH_REQ(11) +#define KVM_REQ_SMI KVM_ARCH_REQ(12) +#define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13) +#define KVM_REQ_MCLOCK_INPROGRESS \ + KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_SCAN_IOAPIC \ + KVM_ARCH_REQ_FLAGS(15, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_GLOBAL_CLOCK_UPDATE KVM_ARCH_REQ(16) +#define KVM_REQ_APIC_PAGE_RELOAD \ + KVM_ARCH_REQ_FLAGS(17, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_HV_CRASH KVM_ARCH_REQ(18) +#define KVM_REQ_IOAPIC_EOI_EXIT KVM_ARCH_REQ(19) +#define KVM_REQ_HV_RESET KVM_ARCH_REQ(20) +#define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) +#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -254,7 +257,8 @@ union kvm_mmu_page_role { unsigned cr0_wp:1; unsigned smep_andnot_wp:1; unsigned smap_andnot_wp:1; - unsigned :8; + unsigned ad_disabled:1; + unsigned :7; /* * This is left at the top of the word so that @@ -458,10 +462,12 @@ struct kvm_vcpu_hv_synic { DECLARE_BITMAP(auto_eoi_bitmap, 256); DECLARE_BITMAP(vec_bitmap, 256); bool active; + bool dont_zero_synic_pages; }; /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { + u32 vp_index; u64 hv_vapic; s64 runtime_offset; struct kvm_vcpu_hv_synic synic; @@ -545,6 +551,7 @@ struct kvm_vcpu_arch { bool reinject; u8 nr; u32 error_code; + u8 nested_apf; } exception; struct kvm_queued_interrupt { @@ -645,6 +652,9 @@ struct kvm_vcpu_arch { u64 msr_val; u32 id; bool send_user_only; + u32 host_apf_reason; + unsigned long nested_apf_token; + bool delivery_as_pf_vmexit; } apf; /* OSVW MSRs (AMD only) */ @@ -799,6 +809,7 @@ struct kvm_arch { int audit_point; #endif + bool backwards_tsc_observed; bool boot_vcpu_runs_old_kvmclock; u32 bsp_vcpu_id; @@ -948,9 +959,7 @@ struct kvm_x86_ops { unsigned char *hypercall_addr); void (*set_irq)(struct kvm_vcpu *vcpu); void (*set_nmi)(struct kvm_vcpu *vcpu); - void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code, - bool reinject); + void (*queue_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); int (*interrupt_allowed)(struct kvm_vcpu *vcpu); int (*nmi_allowed)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index ecfcb6643c9b..265c907d7d4c 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -293,7 +293,7 @@ static inline unsigned long __get_current_cr3_fast(void) unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); /* For now, be very restrictive about when this can be called. */ - VM_WARN_ON(in_nmi() || !in_atomic()); + VM_WARN_ON(in_nmi() || preemptible()); VM_BUG_ON(cr3 != __read_cr3()); return cr3; diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index d5acc27ed1cc..2b58c8c1eeaa 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -136,7 +136,6 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) } } -#define hv_get_current_tick(tick) rdmsrl(HV_X64_MSR_TIME_REF_COUNT, tick) #define hv_init_timer(timer, tick) wrmsrl(timer, tick) #define hv_init_timer_config(config, val) wrmsrl(config, val) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 18b162322eff..5573c75f8e4c 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -251,9 +251,13 @@ #define HWP_MIN_PERF(x) (x & 0xff) #define HWP_MAX_PERF(x) ((x & 0xff) << 8) #define HWP_DESIRED_PERF(x) ((x & 0xff) << 16) -#define HWP_ENERGY_PERF_PREFERENCE(x) ((x & 0xff) << 24) -#define HWP_ACTIVITY_WINDOW(x) ((x & 0xff3) << 32) -#define HWP_PACKAGE_CONTROL(x) ((x & 0x1) << 42) +#define HWP_ENERGY_PERF_PREFERENCE(x) (((unsigned long long) x & 0xff) << 24) +#define HWP_EPP_PERFORMANCE 0x00 +#define HWP_EPP_BALANCE_PERFORMANCE 0x80 +#define HWP_EPP_BALANCE_POWERSAVE 0xC0 +#define HWP_EPP_POWERSAVE 0xFF +#define HWP_ACTIVITY_WINDOW(x) ((unsigned long long)(x & 0xff3) << 32) +#define HWP_PACKAGE_CONTROL(x) ((unsigned long long)(x & 0x1) << 42) /* IA32_HWP_STATUS */ #define HWP_GUARANTEED_CHANGE(x) (x & 0x1) @@ -422,6 +426,8 @@ #define MSR_IA32_TSC_ADJUST 0x0000003b #define MSR_IA32_BNDCFGS 0x00000d90 +#define MSR_IA32_BNDCFGS_RSVD 0x00000ffc + #define MSR_IA32_XSS 0x00000da0 #define FEATURE_CONTROL_LOCKED (1<<0) @@ -476,9 +482,11 @@ #define MSR_MISC_PWR_MGMT 0x000001aa #define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 -#define ENERGY_PERF_BIAS_PERFORMANCE 0 -#define ENERGY_PERF_BIAS_NORMAL 6 -#define ENERGY_PERF_BIAS_POWERSAVE 15 +#define ENERGY_PERF_BIAS_PERFORMANCE 0 +#define ENERGY_PERF_BIAS_BALANCE_PERFORMANCE 4 +#define ENERGY_PERF_BIAS_NORMAL 6 +#define ENERGY_PERF_BIAS_BALANCE_POWERSAVE 8 +#define ENERGY_PERF_BIAS_POWERSAVE 15 #define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1 diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index cb976bab6299..9ffc36bfe4cd 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -84,7 +84,7 @@ struct pv_init_ops { */ unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, unsigned long addr, unsigned len); -}; +} __no_randomize_layout; struct pv_lazy_ops { @@ -92,12 +92,12 @@ struct pv_lazy_ops { void (*enter)(void); void (*leave)(void); void (*flush)(void); -}; +} __no_randomize_layout; struct pv_time_ops { unsigned long long (*sched_clock)(void); unsigned long long (*steal_clock)(int cpu); -}; +} __no_randomize_layout; struct pv_cpu_ops { /* hooks for various privileged instructions */ @@ -176,7 +176,7 @@ struct pv_cpu_ops { void (*start_context_switch)(struct task_struct *prev); void (*end_context_switch)(struct task_struct *next); -}; +} __no_randomize_layout; struct pv_irq_ops { /* @@ -199,7 +199,7 @@ struct pv_irq_ops { #ifdef CONFIG_X86_64 void (*adjust_exception_frame)(void); #endif -}; +} __no_randomize_layout; struct pv_mmu_ops { unsigned long (*read_cr2)(void); @@ -305,7 +305,7 @@ struct pv_mmu_ops { an mfn. We can tell which is which from the index. */ void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags); -}; +} __no_randomize_layout; struct arch_spinlock; #ifdef CONFIG_SMP @@ -322,7 +322,7 @@ struct pv_lock_ops { void (*kick)(int cpu); struct paravirt_callee_save vcpu_is_preempted; -}; +} __no_randomize_layout; /* This contains all the paravirt structures: we get a convenient * number for each function using the offset which we use to indicate @@ -334,7 +334,7 @@ struct paravirt_patch_template { struct pv_irq_ops pv_irq_ops; struct pv_mmu_ops pv_mmu_ops; struct pv_lock_ops pv_lock_ops; -}; +} __no_randomize_layout; extern struct pv_info pv_info; extern struct pv_init_ops pv_init_ops; diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 0b1ff4c1c14e..fffb2794dd89 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -7,6 +7,7 @@ bool pat_enabled(void); void pat_disable(const char *reason); extern void pat_init(void); +extern void init_cache_modes(void); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h deleted file mode 100644 index 0ff8fe71b255..000000000000 --- a/arch/x86/include/asm/pmem.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright(c) 2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#ifndef __ASM_X86_PMEM_H__ -#define __ASM_X86_PMEM_H__ - -#include <linux/uaccess.h> -#include <asm/cacheflush.h> -#include <asm/cpufeature.h> -#include <asm/special_insns.h> - -#ifdef CONFIG_ARCH_HAS_PMEM_API -/** - * arch_memcpy_to_pmem - copy data to persistent memory - * @dst: destination buffer for the copy - * @src: source buffer for the copy - * @n: length of the copy in bytes - * - * Copy data to persistent memory media via non-temporal stores so that - * a subsequent pmem driver flush operation will drain posted write queues. - */ -static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n) -{ - int rem; - - /* - * We are copying between two kernel buffers, if - * __copy_from_user_inatomic_nocache() returns an error (page - * fault) we would have already reported a general protection fault - * before the WARN+BUG. - */ - rem = __copy_from_user_inatomic_nocache(dst, (void __user *) src, n); - if (WARN(rem, "%s: fault copying %p <- %p unwritten: %d\n", - __func__, dst, src, rem)) - BUG(); -} - -/** - * arch_wb_cache_pmem - write back a cache range with CLWB - * @vaddr: virtual start address - * @size: number of bytes to write back - * - * Write back a cache range using the CLWB (cache line write back) - * instruction. Note that @size is internally rounded up to be cache - * line size aligned. - */ -static inline void arch_wb_cache_pmem(void *addr, size_t size) -{ - u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; - unsigned long clflush_mask = x86_clflush_size - 1; - void *vend = addr + size; - void *p; - - for (p = (void *)((unsigned long)addr & ~clflush_mask); - p < vend; p += x86_clflush_size) - clwb(p); -} - -/** - * arch_copy_from_iter_pmem - copy data from an iterator to PMEM - * @addr: PMEM destination address - * @bytes: number of bytes to copy - * @i: iterator with source data - * - * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'. - */ -static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes, - struct iov_iter *i) -{ - size_t len; - - /* TODO: skip the write-back by always using non-temporal stores */ - len = copy_from_iter_nocache(addr, bytes, i); - - /* - * In the iovec case on x86_64 copy_from_iter_nocache() uses - * non-temporal stores for the bulk of the transfer, but we need - * to manually flush if the transfer is unaligned. A cached - * memory copy is used when destination or size is not naturally - * aligned. That is: - * - Require 8-byte alignment when size is 8 bytes or larger. - * - Require 4-byte alignment when size is 4 bytes. - * - * In the non-iovec case the entire destination needs to be - * flushed. - */ - if (iter_is_iovec(i)) { - unsigned long flushed, dest = (unsigned long) addr; - - if (bytes < 8) { - if (!IS_ALIGNED(dest, 4) || (bytes != 4)) - arch_wb_cache_pmem(addr, bytes); - } else { - if (!IS_ALIGNED(dest, 8)) { - dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); - arch_wb_cache_pmem(addr, 1); - } - - flushed = dest - (unsigned long) addr; - if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8)) - arch_wb_cache_pmem(addr + bytes - 1, 1); - } - } else - arch_wb_cache_pmem(addr, bytes); - - return len; -} - -/** - * arch_clear_pmem - zero a PMEM memory range - * @addr: virtual start address - * @size: number of bytes to zero - * - * Write zeros into the memory range starting at 'addr' for 'size' bytes. - */ -static inline void arch_clear_pmem(void *addr, size_t size) -{ - memset(addr, 0, size); - arch_wb_cache_pmem(addr, size); -} - -static inline void arch_invalidate_pmem(void *addr, size_t size) -{ - clflush_cache_range(addr, size); -} -#endif /* CONFIG_ARCH_HAS_PMEM_API */ -#endif /* __ASM_X86_PMEM_H__ */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6a79547e8ee0..028245e1c42b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -129,7 +129,7 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; u32 microcode; -}; +} __randomize_layout; struct cpuid_regs { u32 eax, ebx, ecx, edx; diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index dcbd9bcce714..8abedf1d650e 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -74,6 +74,7 @@ static __always_inline void boot_init_stack_canary(void) get_random_bytes(&canary, sizeof(canary)); tsc = rdtsc(); canary += tsc + (tsc << 32UL); + canary &= CANARY_MASK; current->stack_canary = canary; #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 3d3e8353ee5c..e9ee84873de5 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -142,7 +142,9 @@ static __always_inline void *__constant_memcpy(void *to, const void *from, } #define __HAVE_ARCH_MEMCPY +extern void *memcpy(void *, const void *, size_t); +#ifndef CONFIG_FORTIFY_SOURCE #ifdef CONFIG_X86_USE_3DNOW #include <asm/mmx.h> @@ -195,11 +197,15 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) #endif #endif +#endif /* !CONFIG_FORTIFY_SOURCE */ #define __HAVE_ARCH_MEMMOVE void *memmove(void *dest, const void *src, size_t n); +extern int memcmp(const void *, const void *, size_t); +#ifndef CONFIG_FORTIFY_SOURCE #define memcmp __builtin_memcmp +#endif #define __HAVE_ARCH_MEMCHR extern void *memchr(const void *cs, int c, size_t count); @@ -321,6 +327,8 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern, : __memset_generic((s), (c), (count))) #define __HAVE_ARCH_MEMSET +extern void *memset(void *, int, size_t); +#ifndef CONFIG_FORTIFY_SOURCE #if (__GNUC__ >= 4) #define memset(s, c, count) __builtin_memset(s, c, count) #else @@ -330,6 +338,7 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern, (count)) \ : __memset((s), (c), (count))) #endif +#endif /* !CONFIG_FORTIFY_SOURCE */ /* * find the first occurrence of byte 'c', or 1 past the area if none diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 733bae07fb29..2a8c822de1fc 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -31,6 +31,7 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); +#ifndef CONFIG_FORTIFY_SOURCE #ifndef CONFIG_KMEMCHECK #if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4 #define memcpy(dst, src, len) \ @@ -51,6 +52,7 @@ extern void *__memcpy(void *to, const void *from, size_t len); */ #define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len)) #endif +#endif /* !CONFIG_FORTIFY_SOURCE */ #define __HAVE_ARCH_MEMSET void *memset(void *s, int c, size_t n); @@ -77,6 +79,11 @@ int strcmp(const char *cs, const char *ct); #define memcpy(dst, src, len) __memcpy(dst, src, len) #define memmove(dst, src, len) __memmove(dst, src, len) #define memset(s, c, n) __memset(s, c, n) + +#ifndef __NO_FORTIFY +#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */ +#endif + #endif #define __HAVE_ARCH_MEMCPY_MCSAFE 1 @@ -109,6 +116,11 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt) return 0; } +#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE +#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1 +void memcpy_flushcache(void *dst, const void *src, size_t cnt); +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_STRING_64_H */ diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 6136a18152af..2bd96b4df140 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -42,8 +42,7 @@ struct saved_context { set_debugreg((thread)->debugreg##register, register) /* routines for saving/restoring kernel state */ -extern int acpi_save_state_mem(void); -extern char core_restore_code; -extern char restore_registers; +extern char core_restore_code[]; +extern char restore_registers[]; #endif /* _ASM_X86_SUSPEND_64_H */ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 14824fc78f7e..58fffe79e417 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -83,7 +83,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 event_inj; u32 event_inj_err; u64 nested_cr3; - u64 lbr_ctl; + u64 virt_ext; u32 clean; u32 reserved_5; u64 next_rip; @@ -119,6 +119,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define AVIC_ENABLE_SHIFT 31 #define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) +#define LBR_CTL_ENABLE_MASK BIT_ULL(0) +#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) + #define SVM_INTERRUPT_SHADOW_MASK 1 #define SVM_IOIO_STR_SHIFT 2 diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a059aac9e937..30269dafec47 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -535,9 +535,6 @@ struct __large_struct { unsigned long buf[100]; }; #define __put_user(x, ptr) \ __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) -#define __get_user_unaligned __get_user -#define __put_user_unaligned __put_user - /* * {get|put}_user_try and catch * @@ -565,7 +562,6 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n); extern __must_check long strncpy_from_user(char *dst, const char __user *src, long count); -extern __must_check long strlen_user(const char __user *str); extern __must_check long strnlen_user(const char __user *str, long n); unsigned long __must_check clear_user(void __user *mem, unsigned long len); diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index c5504b9a472e..b16f6a1d8b26 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -171,6 +171,10 @@ unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigne extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest); +extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size); +extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset, + size_t len); + static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) @@ -179,6 +183,13 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src, return __copy_user_nocache(dst, src, size, 0); } +static inline int +__copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) +{ + kasan_check_write(dst, size); + return __copy_user_flushcache(dst, src, size); +} + unsigned long copy_user_handle_tail(char *to, char *from, unsigned len); diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index f6d20f6cca12..11071fcd630e 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -43,6 +43,7 @@ #include <asm/page.h> #include <asm/pgtable.h> +#include <asm/smap.h> #include <xen/interface/xen.h> #include <xen/interface/sched.h> @@ -50,6 +51,8 @@ #include <xen/interface/platform.h> #include <xen/interface/xen-mca.h> +struct xen_dm_op_buf; + /* * The hypercall asms have to meet several constraints: * - Work on 32- and 64-bit. @@ -214,10 +217,12 @@ privcmd_call(unsigned call, __HYPERCALL_DECLS; __HYPERCALL_5ARG(a1, a2, a3, a4, a5); + stac(); asm volatile("call *%[call]" : __HYPERCALL_5PARAM : [call] "a" (&hypercall_page[call]) : __HYPERCALL_CLOBBER5); + clac(); return (long)__res; } @@ -474,9 +479,13 @@ HYPERVISOR_xenpmu_op(unsigned int op, void *arg) static inline int HYPERVISOR_dm_op( - domid_t dom, unsigned int nr_bufs, void *bufs) + domid_t dom, unsigned int nr_bufs, struct xen_dm_op_buf *bufs) { - return _hypercall3(int, dm_op, dom, nr_bufs, bufs); + int ret; + stac(); + ret = _hypercall3(int, dm_op, dom, nr_bufs, bufs); + clac(); + return ret; } static inline void diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild index 83b6e9a0dce4..da1489cb64dc 100644 --- a/arch/x86/include/uapi/asm/Kbuild +++ b/arch/x86/include/uapi/asm/Kbuild @@ -1,6 +1,6 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm -genhdr-y += unistd_32.h -genhdr-y += unistd_64.h -genhdr-y += unistd_x32.h +generated-y += unistd_32.h +generated-y += unistd_64.h +generated-y += unistd_x32.h diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index f4fef5a24ebd..127ddadee1a5 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -150,6 +150,12 @@ #define HV_X64_DEPRECATING_AEOI_RECOMMENDED (1 << 9) /* + * HV_VP_SET available + */ +#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11) + + +/* * Crash notification flag. */ #define HV_CRASH_CTL_CRASH_NOTIFY (1ULL << 63) diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index cff0bb6556f8..a965e5b0d328 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -67,6 +67,7 @@ struct kvm_clock_pairing { #define KVM_ASYNC_PF_ENABLED (1 << 0) #define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) +#define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT (1 << 2) /* Operations for KVM_HC_MMU_OP */ #define KVM_MMU_OP_WRITE_PTE 1 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6bb680671088..7491e73d9253 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -347,6 +347,14 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, struct mpc_intsrc mp_irq; /* + * Check bus_irq boundary. + */ + if (bus_irq >= NR_IRQS_LEGACY) { + pr_warn("Invalid bus_irq %u for legacy override\n", bus_irq); + return; + } + + /* * Convert 'gsi' to 'ioapic.pin'. */ ioapic = mp_find_ioapic(gsi); diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 8233a630280f..dde437f5d14f 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -167,7 +167,8 @@ static int __init ffh_cstate_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; - if (c->x86_vendor != X86_VENDOR_INTEL) + if (c->x86_vendor != X86_VENDOR_INTEL && + c->x86_vendor != X86_VENDOR_AMD) return -1; cpu_cstate_entry = alloc_percpu(struct cstate_entry); diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 815dd63f49d0..cc0e8bc0ea3f 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -704,6 +704,7 @@ static const struct dma_map_ops gart_dma_ops = { .alloc = gart_alloc_coherent, .free = gart_free_coherent, .mapping_error = gart_mapping_error, + .dma_supported = x86_dma_supported, }; static void gart_iommu_shutdown(void) diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index c73c9fb281e1..d6f387780849 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -19,7 +19,7 @@ #include <linux/init.h> #include <linux/delay.h> -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh) { return (u64)(cpu_khz) * 1000 * watchdog_thresh; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b4f5f73febdb..237e9c2341c7 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2093,7 +2093,7 @@ static inline void __init check_timer(void) int idx; idx = find_irq_entry(apic1, pin1, mp_INT); if (idx != -1 && irq_trigger(idx)) - unmask_ioapic_irq(irq_get_chip_data(0)); + unmask_ioapic_irq(irq_get_irq_data(0)); } irq_domain_deactivate_irq(irq_data); irq_domain_activate_irq(irq_data); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 52000010c62e..cdf82492b770 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -21,6 +21,7 @@ obj-y += common.o obj-y += rdrand.o obj-y += match.o obj-y += bugs.o +obj-$(CONFIG_CPU_FREQ) += aperfmperf.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bb5abe8f5fd4..3b9e220621f8 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -134,6 +134,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) n = K6_BUG_LOOP; f_vide = vide; + OPTIMIZER_HIDE_VAR(f_vide); d = rdtsc(); while (n--) f_vide(); diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c new file mode 100644 index 000000000000..d869c8671e36 --- /dev/null +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -0,0 +1,79 @@ +/* + * x86 APERF/MPERF KHz calculation for + * /sys/.../cpufreq/scaling_cur_freq + * + * Copyright (C) 2017 Intel Corp. + * Author: Len Brown <len.brown@intel.com> + * + * This file is licensed under GPLv2. + */ + +#include <linux/jiffies.h> +#include <linux/math64.h> +#include <linux/percpu.h> +#include <linux/smp.h> + +struct aperfmperf_sample { + unsigned int khz; + unsigned long jiffies; + u64 aperf; + u64 mperf; +}; + +static DEFINE_PER_CPU(struct aperfmperf_sample, samples); + +/* + * aperfmperf_snapshot_khz() + * On the current CPU, snapshot APERF, MPERF, and jiffies + * unless we already did it within 10ms + * calculate kHz, save snapshot + */ +static void aperfmperf_snapshot_khz(void *dummy) +{ + u64 aperf, aperf_delta; + u64 mperf, mperf_delta; + struct aperfmperf_sample *s = this_cpu_ptr(&samples); + + /* Don't bother re-computing within 10 ms */ + if (time_before(jiffies, s->jiffies + HZ/100)) + return; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + aperf_delta = aperf - s->aperf; + mperf_delta = mperf - s->mperf; + + /* + * There is no architectural guarantee that MPERF + * increments faster than we can read it. + */ + if (mperf_delta == 0) + return; + + /* + * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then + * khz = (cpu_khz * aperf_delta) / mperf_delta + */ + if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta) + s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); + else /* khz = aperf_delta / (mperf_delta / cpu_khz) */ + s->khz = div64_u64(aperf_delta, + div64_u64(mperf_delta, cpu_khz)); + s->jiffies = jiffies; + s->aperf = aperf; + s->mperf = mperf; +} + +unsigned int arch_freq_get_on_cpu(int cpu) +{ + if (!cpu_khz) + return 0; + + if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + return 0; + + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); + + return per_cpu(samples.khz, cpu); +} diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 6df621ae62a7..218f79825b3c 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -2,7 +2,6 @@ #include <linux/timex.h> #include <linux/string.h> #include <linux/seq_file.h> -#include <linux/cpufreq.h> /* * Get CPU information for use by the procfs. @@ -76,14 +75,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (c->microcode) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); - if (cpu_has(c, X86_FEATURE_TSC)) { - unsigned int freq = cpufreq_quick_get(cpu); - - if (!freq) - freq = cpu_khz; + if (cpu_has(c, X86_FEATURE_TSC)) seq_printf(m, "cpu MHz\t\t: %u.%03u\n", - freq / 1000, (freq % 1000)); - } + cpu_khz / 1000, (cpu_khz % 1000)); /* Cache size */ if (c->x86_cache_size >= 0) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 22217ece26c8..44404e2307bb 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -457,7 +457,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced, bufp += sizeof(Elf64_Phdr); phdr->p_type = PT_NOTE; phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); - phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note); + phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; (ehdr->e_phnum)++; #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 3fe45f84ced4..cbf1f6ba39a8 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -235,8 +235,7 @@ static void __init dtb_add_ioapic(struct device_node *dn) ret = of_address_to_resource(dn, 0, &r); if (ret) { - printk(KERN_ERR "Can't obtain address from node %s.\n", - dn->full_name); + printk(KERN_ERR "Can't obtain address from device node %pOF.\n", dn); return; } mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d78a586ba8dc..532da61d605c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -20,10 +20,12 @@ #include <asm/setup.h> /* - * We organize the E820 table into two main data structures: + * We organize the E820 table into three main data structures: * * - 'e820_table_firmware': the original firmware version passed to us by the - * bootloader - not modified by the kernel. We use this to: + * bootloader - not modified by the kernel. It is composed of two parts: + * the first 128 E820 memory entries in boot_params.e820_table and the remaining + * (if any) entries of the SETUP_E820_EXT nodes. We use this to: * * - inform the user about the firmware's notion of memory layout * via /sys/firmware/memmap @@ -31,6 +33,14 @@ * - the hibernation code uses it to generate a kernel-independent MD5 * fingerprint of the physical memory layout of a system. * + * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version + * passed to us by the bootloader - the major difference between + * e820_table_firmware[] and this one is that, the latter marks the setup_data + * list created by the EFI boot stub as reserved, so that kexec can reuse the + * setup_data information in the second kernel. Besides, e820_table_kexec[] + * might also be modified by the kexec itself to fake a mptable. + * We use this to: + * * - kexec, which is a bootloader in disguise, uses the original E820 * layout to pass to the kexec-ed kernel. This way the original kernel * can have a restricted E820 map while the kexec()-ed kexec-kernel @@ -46,9 +56,11 @@ * specific memory layout data during early bootup. */ static struct e820_table e820_table_init __initdata; +static struct e820_table e820_table_kexec_init __initdata; static struct e820_table e820_table_firmware_init __initdata; struct e820_table *e820_table __refdata = &e820_table_init; +struct e820_table *e820_table_kexec __refdata = &e820_table_kexec_init; struct e820_table *e820_table_firmware __refdata = &e820_table_firmware_init; /* For PCI or other memory-mapped resources */ @@ -470,9 +482,9 @@ u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum return __e820__range_update(e820_table, start, size, old_type, new_type); } -static u64 __init e820__range_update_firmware(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) +static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { - return __e820__range_update(e820_table_firmware, start, size, old_type, new_type); + return __e820__range_update(e820_table_kexec, start, size, old_type, new_type); } /* Remove a range of memory from the E820 table: */ @@ -546,9 +558,9 @@ void __init e820__update_table_print(void) e820__print_table("modified"); } -static void __init e820__update_table_firmware(void) +static void __init e820__update_table_kexec(void) { - e820__update_table(e820_table_firmware); + e820__update_table(e820_table_kexec); } #define MAX_GAP_END 0x100000000ull @@ -623,7 +635,7 @@ __init void e820__setup_pci_gap(void) /* * Called late during init, in free_initmem(). * - * Initial e820_table and e820_table_firmware are largish __initdata arrays. + * Initial e820_table and e820_table_kexec are largish __initdata arrays. * * Copy them to a (usually much smaller) dynamically allocated area that is * sized precisely after the number of e820 entries. @@ -643,6 +655,12 @@ __init void e820__reallocate_tables(void) memcpy(n, e820_table, size); e820_table = n; + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries; + n = kmalloc(size, GFP_KERNEL); + BUG_ON(!n); + memcpy(n, e820_table_kexec, size); + e820_table_kexec = n; + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries; n = kmalloc(size, GFP_KERNEL); BUG_ON(!n); @@ -669,6 +687,9 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) __append_e820_table(extmap, entries); e820__update_table(e820_table); + memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec)); + memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + early_memunmap(sdata, data_len); pr_info("e820: extended physical RAM map:\n"); e820__print_table("extended"); @@ -727,7 +748,7 @@ core_initcall(e820__register_nvs_regions); /* * Allocate the requested number of bytes with the requsted alignment * and return (the physical address) to the caller. Also register this - * range in the 'firmware' E820 table as a reserved range. + * range in the 'kexec' E820 table as a reserved range. * * This allows kexec to fake a new mptable, as if it came from the real * system. @@ -738,9 +759,9 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); if (addr) { - e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); - pr_info("e820: update e820_table_firmware for e820__memblock_alloc_reserved()\n"); - e820__update_table_firmware(); + e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); + pr_info("e820: update e820_table_kexec for e820__memblock_alloc_reserved()\n"); + e820__update_table_kexec(); } return addr; @@ -923,13 +944,13 @@ void __init e820__reserve_setup_data(void) while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); pa_data = data->next; early_memunmap(data, sizeof(*data)); } e820__update_table(e820_table); - - memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + e820__update_table(e820_table_kexec); pr_info("extended physical RAM map:\n"); e820__print_table("reserve setup_data"); @@ -1062,6 +1083,7 @@ void __init e820__reserve_resources(void) res++; } + /* Expose the bootloader-provided memory layout to the sysfs. */ for (i = 0; i < e820_table_firmware->nr_entries; i++) { struct e820_entry *entry = e820_table_firmware->entries + i; @@ -1175,6 +1197,7 @@ void __init e820__memory_setup(void) who = x86_init.resources.memory_setup(); + memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec)); memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); pr_info("e820: BIOS-provided physical RAM map:\n"); diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 9d7fd5e6689a..fb095ba0c02f 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -100,14 +100,14 @@ static int setup_e820_entries(struct boot_params *params) { unsigned int nr_e820_entries; - nr_e820_entries = e820_table_firmware->nr_entries; + nr_e820_entries = e820_table_kexec->nr_entries; /* TODO: Pass entries more than E820_MAX_ENTRIES_ZEROPAGE in bootparams setup data */ if (nr_e820_entries > E820_MAX_ENTRIES_ZEROPAGE) nr_e820_entries = E820_MAX_ENTRIES_ZEROPAGE; params->e820_entries = nr_e820_entries; - memcpy(¶ms->e820_table, &e820_table_firmware->entries, nr_e820_entries*sizeof(struct e820_entry)); + memcpy(¶ms->e820_table, &e820_table_kexec->entries, nr_e820_entries*sizeof(struct e820_entry)); return 0; } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6b877807598b..f0153714ddac 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -457,6 +457,8 @@ static int arch_copy_kprobe(struct kprobe *p) int arch_prepare_kprobe(struct kprobe *p) { + int ret; + if (alternatives_text_reserved(p->addr, p->addr)) return -EINVAL; @@ -467,7 +469,13 @@ int arch_prepare_kprobe(struct kprobe *p) if (!p->ainsn.insn) return -ENOMEM; - return arch_copy_kprobe(p); + ret = arch_copy_kprobe(p); + if (ret) { + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; + } + + return ret; } void arch_arm_kprobe(struct kprobe *p) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 43e10d6fdbed..71c17a5be983 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -330,7 +330,12 @@ static void kvm_guest_cpu_init(void) #ifdef CONFIG_PREEMPT pa |= KVM_ASYNC_PF_SEND_ALWAYS; #endif - wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED); + pa |= KVM_ASYNC_PF_ENABLED; + + /* Async page fault support for L1 hypervisor is optional */ + if (wrmsr_safe(MSR_KVM_ASYNC_PF_EN, + (pa | KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT) & 0xffffffff, pa >> 32) < 0) + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); printk(KERN_INFO"KVM setup async PF for cpu %d\n", smp_processor_id()); diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index fda7867046d0..5286a4a92cf7 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -50,6 +50,8 @@ #include <asm/x86_init.h> #include <asm/iommu_table.h> +#define CALGARY_MAPPING_ERROR 0 + #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT int use_calgary __read_mostly = 1; #else @@ -252,7 +254,7 @@ static unsigned long iommu_range_alloc(struct device *dev, if (panic_on_overflow) panic("Calgary: fix the allocator.\n"); else - return DMA_ERROR_CODE; + return CALGARY_MAPPING_ERROR; } } @@ -272,10 +274,10 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, entry = iommu_range_alloc(dev, tbl, npages); - if (unlikely(entry == DMA_ERROR_CODE)) { + if (unlikely(entry == CALGARY_MAPPING_ERROR)) { pr_warn("failed to allocate %u pages in iommu %p\n", npages, tbl); - return DMA_ERROR_CODE; + return CALGARY_MAPPING_ERROR; } /* set the return dma address */ @@ -295,7 +297,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned long flags; /* were we called with bad_dma_address? */ - badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE); + badend = CALGARY_MAPPING_ERROR + (EMERGENCY_PAGES * PAGE_SIZE); if (unlikely(dma_addr < badend)) { WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " "address 0x%Lx\n", dma_addr); @@ -380,7 +382,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); entry = iommu_range_alloc(dev, tbl, npages); - if (entry == DMA_ERROR_CODE) { + if (entry == CALGARY_MAPPING_ERROR) { /* makes sure unmap knows to stop */ s->dma_length = 0; goto error; @@ -398,7 +400,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, error: calgary_unmap_sg(dev, sg, nelems, dir, 0); for_each_sg(sg, s, nelems, i) { - sg->dma_address = DMA_ERROR_CODE; + sg->dma_address = CALGARY_MAPPING_ERROR; sg->dma_length = 0; } return 0; @@ -453,7 +455,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, /* set up tces to cover the allocated range */ mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == DMA_ERROR_CODE) + if (mapping == CALGARY_MAPPING_ERROR) goto free; *dma_handle = mapping; return ret; @@ -478,6 +480,11 @@ static void calgary_free_coherent(struct device *dev, size_t size, free_pages((unsigned long)vaddr, get_order(size)); } +static int calgary_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return dma_addr == CALGARY_MAPPING_ERROR; +} + static const struct dma_map_ops calgary_dma_ops = { .alloc = calgary_alloc_coherent, .free = calgary_free_coherent, @@ -485,6 +492,8 @@ static const struct dma_map_ops calgary_dma_ops = { .unmap_sg = calgary_unmap_sg, .map_page = calgary_map_page, .unmap_page = calgary_unmap_page, + .mapping_error = calgary_mapping_error, + .dma_supported = x86_dma_supported, }; static inline void __iomem * busno_to_bbar(unsigned char num) @@ -732,7 +741,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) struct iommu_table *tbl = pci_iommu(dev->bus); /* reserve EMERGENCY_PAGES from bad_dma_address and up */ - iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES); + iommu_range_reserve(tbl, CALGARY_MAPPING_ERROR, EMERGENCY_PAGES); /* avoid the BIOS/VGA first 640KB-1MB region */ /* for CalIOC2 - avoid the entire first MB */ diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 3a216ec869cd..5e16d3f29594 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -213,10 +213,8 @@ static __init int iommu_setup(char *p) } early_param("iommu", iommu_setup); -int dma_supported(struct device *dev, u64 mask) +int x86_dma_supported(struct device *dev, u64 mask) { - const struct dma_map_ops *ops = get_dma_ops(dev); - #ifdef CONFIG_PCI if (mask > 0xffffffff && forbid_dac > 0) { dev_info(dev, "PCI: Disallowing DAC for device\n"); @@ -224,9 +222,6 @@ int dma_supported(struct device *dev, u64 mask) } #endif - if (ops->dma_supported) - return ops->dma_supported(dev, mask); - /* Copied from i386. Doesn't make much sense, because it will only work for pci_alloc_coherent. The caller just has to use GFP_DMA in this case. */ @@ -252,7 +247,6 @@ int dma_supported(struct device *dev, u64 mask) return 1; } -EXPORT_SYMBOL(dma_supported); static int __init pci_iommu_init(void) { diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index a88952ef371c..a6d404087fe3 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -11,6 +11,8 @@ #include <asm/iommu.h> #include <asm/dma.h> +#define NOMMU_MAPPING_ERROR 0 + static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { @@ -33,7 +35,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, dma_addr_t bus = page_to_phys(page) + offset; WARN_ON(size == 0); if (!check_addr("map_single", dev, bus, size)) - return DMA_ERROR_CODE; + return NOMMU_MAPPING_ERROR; flush_write_buffers(); return bus; } @@ -88,6 +90,11 @@ static void nommu_sync_sg_for_device(struct device *dev, flush_write_buffers(); } +static int nommu_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return dma_addr == NOMMU_MAPPING_ERROR; +} + const struct dma_map_ops nommu_dma_ops = { .alloc = dma_generic_alloc_coherent, .free = dma_generic_free_coherent, @@ -96,4 +103,6 @@ const struct dma_map_ops nommu_dma_ops = { .sync_single_for_device = nommu_sync_single_for_device, .sync_sg_for_device = nommu_sync_sg_for_device, .is_phys = 1, + .mapping_error = nommu_mapping_error, + .dma_supported = x86_dma_supported, }; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 65622f07e633..3486d0498800 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1076,6 +1076,13 @@ void __init setup_arch(char **cmdline_p) max_possible_pfn = max_pfn; /* + * This call is required when the CPU does not support PAT. If + * mtrr_bp_init() invoked it already via pat_init() the call has no + * effect. + */ + init_cache_modes(); + + /* * Define random base addresses for memory sections after max_pfn is * defined and before each memory section base is used. */ diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 760433b2574a..2688c7dc5323 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -22,7 +22,7 @@ config KVM depends on HAVE_KVM depends on HIGH_RES_TIMERS # for TASKSTATS/TASK_DELAY_ACCT: - depends on NET + depends on NET && MULTIUSER select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a6fd40aade7c..da6728383052 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -144,6 +144,14 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_RTM)); } +static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_MPX)); +} + static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 80890dee66ce..fb0055953fbc 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -900,7 +900,7 @@ static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, if (rc != X86EMUL_CONTINUE) \ goto done; \ ctxt->_eip += sizeof(_type); \ - _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \ + memcpy(&_x, ctxt->fetch.ptr, sizeof(_type)); \ ctxt->fetch.ptr += sizeof(_type); \ _x; \ }) @@ -3942,6 +3942,25 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt) } /* + * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save + * and restore MXCSR. + */ +static size_t __fxstate_size(int nregs) +{ + return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16; +} + +static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt) +{ + bool cr4_osfxsr; + if (ctxt->mode == X86EMUL_MODE_PROT64) + return __fxstate_size(16); + + cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR; + return __fxstate_size(cr4_osfxsr ? 8 : 0); +} + +/* * FXSAVE and FXRSTOR have 4 different formats depending on execution mode, * 1) 16 bit mode * 2) 32 bit mode @@ -3962,7 +3981,6 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt) static int em_fxsave(struct x86_emulate_ctxt *ctxt) { struct fxregs_state fx_state; - size_t size; int rc; rc = check_fxsr(ctxt); @@ -3978,68 +3996,42 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR) - size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]); - else - size = offsetof(struct fxregs_state, xmm_space[0]); - - return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); -} - -static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, - struct fxregs_state *new) -{ - int rc = X86EMUL_CONTINUE; - struct fxregs_state old; - - rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old)); - if (rc != X86EMUL_CONTINUE) - return rc; - - /* - * 64 bit host will restore XMM 8-15, which is not correct on non-64 - * bit guests. Load the current values in order to preserve 64 bit - * XMMs after fxrstor. - */ -#ifdef CONFIG_X86_64 - /* XXX: accessing XMM 8-15 very awkwardly */ - memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16); -#endif - - /* - * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but - * does save and restore MXCSR. - */ - if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) - memcpy(new->xmm_space, old.xmm_space, 8 * 16); - - return rc; + return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, + fxstate_size(ctxt)); } static int em_fxrstor(struct x86_emulate_ctxt *ctxt) { struct fxregs_state fx_state; int rc; + size_t size; rc = check_fxsr(ctxt); if (rc != X86EMUL_CONTINUE) return rc; - rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512); - if (rc != X86EMUL_CONTINUE) - return rc; + ctxt->ops->get_fpu(ctxt); - if (fx_state.mxcsr >> 16) - return emulate_gp(ctxt, 0); + size = fxstate_size(ctxt); + if (size < __fxstate_size(16)) { + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); + if (rc != X86EMUL_CONTINUE) + goto out; + } - ctxt->ops->get_fpu(ctxt); + rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); + if (rc != X86EMUL_CONTINUE) + goto out; - if (ctxt->mode < X86EMUL_MODE_PROT64) - rc = fxrstor_fixup(ctxt, &fx_state); + if (fx_state.mxcsr >> 16) { + rc = emulate_gp(ctxt, 0); + goto out; + } if (rc == X86EMUL_CONTINUE) rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); +out: ctxt->ops->put_fpu(ctxt); return rc; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index ebae57ac5902..337b6d2730fa 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -106,14 +106,27 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, return 0; } -static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vcpu_id) +static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) +{ + struct kvm_vcpu *vcpu = NULL; + int i; + + if (vpidx < KVM_MAX_VCPUS) + vcpu = kvm_get_vcpu(kvm, vpidx); + if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) + return vcpu; + kvm_for_each_vcpu(i, vcpu, kvm) + if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) + return vcpu; + return NULL; +} + +static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) { struct kvm_vcpu *vcpu; struct kvm_vcpu_hv_synic *synic; - if (vcpu_id >= atomic_read(&kvm->online_vcpus)) - return NULL; - vcpu = kvm_get_vcpu(kvm, vcpu_id); + vcpu = get_vcpu_by_vpidx(kvm, vpidx); if (!vcpu) return NULL; synic = vcpu_to_synic(vcpu); @@ -221,7 +234,8 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, synic->version = data; break; case HV_X64_MSR_SIEFP: - if (data & HV_SYNIC_SIEFP_ENABLE) + if ((data & HV_SYNIC_SIEFP_ENABLE) && !host && + !synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; @@ -232,7 +246,8 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, synic_exit(synic, msr); break; case HV_X64_MSR_SIMP: - if (data & HV_SYNIC_SIMP_ENABLE) + if ((data & HV_SYNIC_SIMP_ENABLE) && !host && + !synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; @@ -318,11 +333,11 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) return ret; } -int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint) +int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint) { struct kvm_vcpu_hv_synic *synic; - synic = synic_get(kvm, vcpu_id); + synic = synic_get(kvm, vpidx); if (!synic) return -EINVAL; @@ -341,11 +356,11 @@ void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) kvm_hv_notify_acked_sint(vcpu, i); } -static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vcpu_id, u32 sint, int gsi) +static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi) { struct kvm_vcpu_hv_synic *synic; - synic = synic_get(kvm, vcpu_id); + synic = synic_get(kvm, vpidx); if (!synic) return -EINVAL; @@ -634,9 +649,10 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) } if ((stimer->config & HV_STIMER_ENABLE) && - stimer->count) - stimer_start(stimer); - else + stimer->count) { + if (!stimer->msg_pending) + stimer_start(stimer); + } else stimer_cleanup(stimer); } } @@ -687,14 +703,24 @@ void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) stimer_init(&hv_vcpu->stimer[i], i); } -int kvm_hv_activate_synic(struct kvm_vcpu *vcpu) +void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + + hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu); +} + +int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) { + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + /* * Hyper-V SynIC auto EOI SINT's are * not compatible with APICV, so deactivate APICV */ kvm_vcpu_deactivate_apicv(vcpu); - vcpu_to_synic(vcpu)->active = true; + synic->active = true; + synic->dont_zero_synic_pages = dont_zero_synic_pages; return 0; } @@ -978,6 +1004,11 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; switch (msr) { + case HV_X64_MSR_VP_INDEX: + if (!host) + return 1; + hv->vp_index = (u32)data; + break; case HV_X64_MSR_APIC_ASSIST_PAGE: { u64 gfn; unsigned long addr; @@ -1089,18 +1120,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; switch (msr) { - case HV_X64_MSR_VP_INDEX: { - int r; - struct kvm_vcpu *v; - - kvm_for_each_vcpu(r, v, vcpu->kvm) { - if (v == vcpu) { - data = r; - break; - } - } + case HV_X64_MSR_VP_INDEX: + data = hv->vp_index; break; - } case HV_X64_MSR_EOI: return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); case HV_X64_MSR_ICR: diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index cd1119538add..e637631a9574 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -56,9 +56,10 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu); void kvm_hv_irq_routing_update(struct kvm *kvm); int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); -int kvm_hv_activate_synic(struct kvm_vcpu *vcpu); +int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index a78b445ce411..af192895b1fc 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -724,8 +724,10 @@ void kvm_free_pit(struct kvm *kvm) struct kvm_pit *pit = kvm->arch.vpit; if (pit) { + mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); + mutex_unlock(&kvm->slots_lock); kvm_pit_set_reinject(pit, false); hrtimer_cancel(&pit->pit_state.timer); kthread_destroy_worker(pit->worker); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d24c8742d9b0..2819d4c123eb 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1495,6 +1495,7 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); static void cancel_hv_timer(struct kvm_lapic *apic) { + WARN_ON(!apic->lapic_timer.hv_timer_in_use); preempt_disable(); kvm_x86_ops->cancel_hv_timer(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; @@ -1503,25 +1504,56 @@ static void cancel_hv_timer(struct kvm_lapic *apic) static bool start_hv_timer(struct kvm_lapic *apic) { - u64 tscdeadline = apic->lapic_timer.tscdeadline; + struct kvm_timer *ktimer = &apic->lapic_timer; + int r; - if ((atomic_read(&apic->lapic_timer.pending) && - !apic_lvtt_period(apic)) || - kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { - if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_timer(apic); - } else { - apic->lapic_timer.hv_timer_in_use = true; - hrtimer_cancel(&apic->lapic_timer.timer); + if (!kvm_x86_ops->set_hv_timer) + return false; + + if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) + return false; - /* In case the sw timer triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending) && - !apic_lvtt_period(apic)) - cancel_hv_timer(apic); + r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline); + if (r < 0) + return false; + + ktimer->hv_timer_in_use = true; + hrtimer_cancel(&ktimer->timer); + + /* + * Also recheck ktimer->pending, in case the sw timer triggered in + * the window. For periodic timer, leave the hv timer running for + * simplicity, and the deadline will be recomputed on the next vmexit. + */ + if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) { + if (r) + apic_timer_expired(apic); + return false; } - trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, - apic->lapic_timer.hv_timer_in_use); - return apic->lapic_timer.hv_timer_in_use; + + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true); + return true; +} + +static void start_sw_timer(struct kvm_lapic *apic) +{ + struct kvm_timer *ktimer = &apic->lapic_timer; + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_timer(apic); + if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) + return; + + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + start_sw_period(apic); + else if (apic_lvtt_tscdeadline(apic)) + start_sw_tscdeadline(apic); + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false); +} + +static void restart_apic_timer(struct kvm_lapic *apic) +{ + if (!start_hv_timer(apic)) + start_sw_timer(apic); } void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) @@ -1535,19 +1567,14 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) if (apic_lvtt_period(apic) && apic->lapic_timer.period) { advance_periodic_target_expiration(apic); - if (!start_hv_timer(apic)) - start_sw_period(apic); + restart_apic_timer(apic); } } EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; - - WARN_ON(apic->lapic_timer.hv_timer_in_use); - - start_hv_timer(apic); + restart_apic_timer(vcpu->arch.apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); @@ -1556,33 +1583,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; /* Possibly the TSC deadline timer is not enabled yet */ - if (!apic->lapic_timer.hv_timer_in_use) - return; - - cancel_hv_timer(apic); + if (apic->lapic_timer.hv_timer_in_use) + start_sw_timer(apic); +} +EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); - if (atomic_read(&apic->lapic_timer.pending)) - return; +void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; - if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) - start_sw_period(apic); - else if (apic_lvtt_tscdeadline(apic)) - start_sw_tscdeadline(apic); + WARN_ON(!apic->lapic_timer.hv_timer_in_use); + restart_apic_timer(apic); } -EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); static void start_apic_timer(struct kvm_lapic *apic) { atomic_set(&apic->lapic_timer.pending, 0); - if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { - if (set_target_expiration(apic) && - !(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) - start_sw_period(apic); - } else if (apic_lvtt_tscdeadline(apic)) { - if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) - start_sw_tscdeadline(apic); - } + if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + && !set_target_expiration(apic)) + return; + + restart_apic_timer(apic); } static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) @@ -1813,16 +1835,6 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) * LAPIC interface *---------------------------------------------------------------------- */ -u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!lapic_in_kernel(vcpu)) - return 0; - - return apic->lapic_timer.tscdeadline; -} - u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index bcbe811f3b97..29caa2c3dff9 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -87,7 +87,6 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); -u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); @@ -216,4 +215,5 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); +void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cb8225969255..9b1dd114956a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -46,6 +46,7 @@ #include <asm/io.h> #include <asm/vmx.h> #include <asm/kvm_page_track.h> +#include "trace.h" /* * When setting this variable to true it enables Two-Dimensional-Paging @@ -183,13 +184,13 @@ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; +static u64 __read_mostly shadow_mmio_value; static u64 __read_mostly shadow_present_mask; /* - * The mask/value to distinguish a PTE that has been marked not-present for - * access tracking purposes. - * The mask would be either 0 if access tracking is disabled, or - * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled. + * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. + * Non-present SPTEs with shadow_acc_track_value set are in place for access + * tracking. */ static u64 __read_mostly shadow_acc_track_mask; static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; @@ -207,16 +208,40 @@ static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIF static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) { + BUG_ON((mmio_mask & mmio_value) != mmio_value); + shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) +{ + return sp->role.ad_disabled; +} + +static inline bool spte_ad_enabled(u64 spte) +{ + MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + return !(spte & shadow_acc_track_value); +} + +static inline u64 spte_shadow_accessed_mask(u64 spte) +{ + MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; +} + +static inline u64 spte_shadow_dirty_mask(u64 spte) +{ + MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; +} + static inline bool is_access_track_spte(u64 spte) { - /* Always false if shadow_acc_track_mask is zero. */ - return (spte & shadow_acc_track_mask) == shadow_acc_track_value; + return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; } /* @@ -270,7 +295,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, u64 mask = generation_mmio_spte_mask(gen); access &= ACC_WRITE_MASK | ACC_USER_MASK; - mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; + mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT; trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); @@ -278,7 +303,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, static bool is_mmio_spte(u64 spte) { - return (spte & shadow_mmio_mask) == shadow_mmio_mask; + return (spte & shadow_mmio_mask) == shadow_mmio_value; } static gfn_t get_mmio_spte_gfn(u64 spte) @@ -315,12 +340,20 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) return likely(kvm_gen == spte_gen); } +/* + * Sets the shadow PTE masks used by the MMU. + * + * Assumptions: + * - Setting either @accessed_mask or @dirty_mask requires setting both + * - At least one of @accessed_mask or @acc_track_mask must be set + */ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, u64 acc_track_mask) { - if (acc_track_mask != 0) - acc_track_mask |= SPTE_SPECIAL_MASK; + BUG_ON(!dirty_mask != !accessed_mask); + BUG_ON(!accessed_mask && !acc_track_mask); + BUG_ON(acc_track_mask & shadow_acc_track_value); shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; @@ -329,7 +362,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, shadow_x_mask = x_mask; shadow_present_mask = p_mask; shadow_acc_track_mask = acc_track_mask; - WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -549,7 +581,7 @@ static bool spte_has_volatile_bits(u64 spte) is_access_track_spte(spte)) return true; - if (shadow_accessed_mask) { + if (spte_ad_enabled(spte)) { if ((spte & shadow_accessed_mask) == 0 || (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0)) return true; @@ -560,14 +592,17 @@ static bool spte_has_volatile_bits(u64 spte) static bool is_accessed_spte(u64 spte) { - return shadow_accessed_mask ? spte & shadow_accessed_mask - : !is_access_track_spte(spte); + u64 accessed_mask = spte_shadow_accessed_mask(spte); + + return accessed_mask ? spte & accessed_mask + : !is_access_track_spte(spte); } static bool is_dirty_spte(u64 spte) { - return shadow_dirty_mask ? spte & shadow_dirty_mask - : spte & PT_WRITABLE_MASK; + u64 dirty_mask = spte_shadow_dirty_mask(spte); + + return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; } /* Rules for using mmu_spte_set: @@ -707,10 +742,10 @@ static u64 mmu_spte_get_lockless(u64 *sptep) static u64 mark_spte_for_access_track(u64 spte) { - if (shadow_accessed_mask != 0) + if (spte_ad_enabled(spte)) return spte & ~shadow_accessed_mask; - if (shadow_acc_track_mask == 0 || is_access_track_spte(spte)) + if (is_access_track_spte(spte)) return spte; /* @@ -729,7 +764,6 @@ static u64 mark_spte_for_access_track(u64 spte) spte |= (spte & shadow_acc_track_saved_bits_mask) << shadow_acc_track_saved_bits_shift; spte &= ~shadow_acc_track_mask; - spte |= shadow_acc_track_value; return spte; } @@ -741,6 +775,7 @@ static u64 restore_acc_track_spte(u64 spte) u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift) & shadow_acc_track_saved_bits_mask; + WARN_ON_ONCE(spte_ad_enabled(spte)); WARN_ON_ONCE(!is_access_track_spte(spte)); new_spte &= ~shadow_acc_track_mask; @@ -759,7 +794,7 @@ static bool mmu_spte_age(u64 *sptep) if (!is_accessed_spte(spte)) return false; - if (shadow_accessed_mask) { + if (spte_ad_enabled(spte)) { clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } else { @@ -1390,6 +1425,22 @@ static bool spte_clear_dirty(u64 *sptep) return mmu_spte_update(sptep, spte); } +static bool wrprot_ad_disabled_spte(u64 *sptep) +{ + bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, + (unsigned long *)sptep); + if (was_writable) + kvm_set_pfn_dirty(spte_to_pfn(*sptep)); + + return was_writable; +} + +/* + * Gets the GFN ready for another round of dirty logging by clearing the + * - D bit on ad-enabled SPTEs, and + * - W bit on ad-disabled SPTEs. + * Returns true iff any D or W bits were cleared. + */ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; @@ -1397,7 +1448,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_clear_dirty(sptep); + if (spte_ad_enabled(*sptep)) + flush |= spte_clear_dirty(sptep); + else + flush |= wrprot_ad_disabled_spte(sptep); return flush; } @@ -1420,7 +1474,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_set_dirty(sptep); + if (spte_ad_enabled(*sptep)) + flush |= spte_set_dirty(sptep); return flush; } @@ -1452,7 +1507,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } /** - * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages + * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write + * protect the page if the D-bit isn't supported. * @kvm: kvm instance * @slot: slot to clear D-bit * @gfn_offset: start of the BITS_PER_LONG pages we care about @@ -1766,18 +1822,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, u64 *sptep; struct rmap_iterator iter; - /* - * If there's no access bit in the secondary pte set by the hardware and - * fast access tracking is also not enabled, it's up to gup-fast/gup to - * set the access bit in the primary pte or in the page structure. - */ - if (!shadow_accessed_mask && !shadow_acc_track_mask) - goto out; - for_each_rmap_spte(rmap_head, &iter, sptep) if (is_accessed_spte(*sptep)) return 1; -out: return 0; } @@ -1798,18 +1845,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - /* - * In case of absence of EPT Access and Dirty Bits supports, - * emulate the accessed bit for EPT, by checking if this page has - * an EPT mapping, and clearing it if it does. On the next access, - * a new EPT mapping will be established. - * This has some overhead, but not as much as the cost of swapping - * out actively used pages or breaking up actively used hugepages. - */ - if (!shadow_accessed_mask && !shadow_acc_track_mask) - return kvm_handle_hva_range(kvm, start, end, 0, - kvm_unmap_rmapp); - return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); } @@ -2398,7 +2433,12 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_accessed_mask; + shadow_user_mask | shadow_x_mask; + + if (sp_ad_disabled(sp)) + spte |= shadow_acc_track_value; + else + spte |= shadow_accessed_mask; mmu_spte_set(sptep, spte); @@ -2666,10 +2706,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, { u64 spte = 0; int ret = 0; + struct kvm_mmu_page *sp; if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; + sp = page_header(__pa(sptep)); + if (sp_ad_disabled(sp)) + spte |= shadow_acc_track_value; + /* * For the EPT case, shadow_present_mask is 0 if hardware * supports exec-only page table entries. In that case, @@ -2678,7 +2723,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, */ spte |= shadow_present_mask; if (!speculative) - spte |= shadow_accessed_mask; + spte |= spte_shadow_accessed_mask(spte); if (pte_access & ACC_EXEC_MASK) spte |= shadow_x_mask; @@ -2735,7 +2780,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (pte_access & ACC_WRITE_MASK) { kvm_vcpu_mark_page_dirty(vcpu, gfn); - spte |= shadow_dirty_mask; + spte |= spte_shadow_dirty_mask(spte); } if (speculative) @@ -2877,16 +2922,16 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) { struct kvm_mmu_page *sp; + sp = page_header(__pa(sptep)); + /* - * Since it's no accessed bit on EPT, it's no way to - * distinguish between actually accessed translations - * and prefetched, so disable pte prefetch if EPT is - * enabled. + * Without accessed bits, there's no way to distinguish between + * actually accessed translations and prefetched, so disable pte + * prefetch if accessed bits aren't available. */ - if (!shadow_accessed_mask) + if (sp_ad_disabled(sp)) return; - sp = page_header(__pa(sptep)); if (sp->role.level > PT_PAGE_TABLE_LEVEL) return; @@ -3704,7 +3749,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) kvm_event_needs_reinjection(vcpu))) return false; - if (is_guest_mode(vcpu)) + if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) return false; return kvm_x86_ops->interrupt_allowed(vcpu); @@ -3736,6 +3781,38 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return false; } +int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, + u64 fault_address, char *insn, int insn_len, + bool need_unprotect) +{ + int r = 1; + + switch (vcpu->arch.apf.host_apf_reason) { + default: + trace_kvm_page_fault(fault_address, error_code); + + if (need_unprotect && kvm_event_needs_reinjection(vcpu)) + kvm_mmu_unprotect_page_virt(vcpu, fault_address); + r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, + insn_len); + break; + case KVM_PV_REASON_PAGE_NOT_PRESENT: + vcpu->arch.apf.host_apf_reason = 0; + local_irq_disable(); + kvm_async_pf_task_wait(fault_address); + local_irq_enable(); + break; + case KVM_PV_REASON_PAGE_READY: + vcpu->arch.apf.host_apf_reason = 0; + local_irq_disable(); + kvm_async_pf_task_wake(fault_address); + local_irq_enable(); + break; + } + return r; +} +EXPORT_SYMBOL_GPL(kvm_handle_page_fault); + static bool check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) { @@ -4290,6 +4367,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->base_role.word = 0; context->base_role.smm = is_smm(vcpu); + context->base_role.ad_disabled = (shadow_accessed_mask == 0); context->page_fault = tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; @@ -4377,6 +4455,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->root_level = context->shadow_root_level; context->root_hpa = INVALID_PAGE; context->direct_map = false; + context->base_role.ad_disabled = !accessed_dirty; update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); @@ -4636,6 +4715,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mask.smep_andnot_wp = 1; mask.smap_andnot_wp = 1; mask.smm = 1; + mask.ad_disabled = 1; /* * If we don't have indirect shadow pages, it means no page is diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 330bf3a811fb..d7d248a000dd 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -51,7 +51,7 @@ static inline u64 rsvd_bits(int s, int e) return ((1ULL << (e - s + 1)) - 1) << s; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); @@ -77,6 +77,9 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); +int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, + u64 fault_address, char *insn, int insn_len, + bool need_unprotect); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 5a24b846a1cb..8b97a6cba8d1 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -30,8 +30,9 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \ - " %snxe root %u %s%c", __entry->mmu_valid_gen, \ + trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \ + " %snxe %sad root %u %s%c", \ + __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ role.cr4_pae ? " pae" : "", \ role.quadrant, \ @@ -39,6 +40,7 @@ access_str[role.access], \ role.invalid ? " invalid" : "", \ role.nxe ? "" : "!", \ + role.ad_disabled ? "!" : "", \ __entry->root_count, \ __entry->unsync ? "unsync" : "sync", 0); \ saved_ptr; \ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 33460fcdeef9..4d8141e533c3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -190,10 +190,10 @@ struct vcpu_svm { struct nested_state nested; bool nmi_singlestep; + u64 nmi_singlestep_guest_rflags; unsigned int3_injected; unsigned long int3_rip; - u32 apf_reason; /* cached guest cpuid flags for faster access */ bool nrips_enabled : 1; @@ -276,6 +276,10 @@ static int avic; module_param(avic, int, S_IRUGO); #endif +/* enable/disable Virtual VMLOAD VMSAVE */ +static int vls = true; +module_param(vls, int, 0444); + /* AVIC VM ID bit masks and lock */ static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR); static DEFINE_SPINLOCK(avic_vm_id_lock); @@ -632,11 +636,13 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) svm_set_interrupt_shadow(vcpu, 0); } -static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code, - bool reinject) +static void svm_queue_exception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + unsigned nr = vcpu->arch.exception.nr; + bool has_error_code = vcpu->arch.exception.has_error_code; + bool reinject = vcpu->arch.exception.reinject; + u32 error_code = vcpu->arch.exception.error_code; /* * If we are within a nested VM we'd better #VMEXIT and let the guest @@ -946,7 +952,7 @@ static void svm_enable_lbrv(struct vcpu_svm *svm) { u32 *msrpm = svm->msrpm; - svm->vmcb->control.lbr_ctl = 1; + svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); @@ -957,13 +963,25 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) { u32 *msrpm = svm->msrpm; - svm->vmcb->control.lbr_ctl = 0; + svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } +static void disable_nmi_singlestep(struct vcpu_svm *svm) +{ + svm->nmi_singlestep = false; + if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) { + /* Clear our flags if they were not set by the guest */ + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) + svm->vmcb->save.rflags &= ~X86_EFLAGS_TF; + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) + svm->vmcb->save.rflags &= ~X86_EFLAGS_RF; + } +} + /* Note: * This hash table is used to map VM_ID to a struct kvm_arch, * when handling AMD IOMMU GALOG notification to schedule in @@ -1080,6 +1098,16 @@ static __init int svm_hardware_setup(void) } } + if (vls) { + if (!npt_enabled || + !boot_cpu_has(X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE) || + !IS_ENABLED(CONFIG_X86_64)) { + vls = false; + } else { + pr_info("Virtual VMLOAD VMSAVE supported\n"); + } + } + return 0; err: @@ -1267,6 +1295,16 @@ static void init_vmcb(struct vcpu_svm *svm) if (avic) avic_init_vmcb(svm); + /* + * If hardware supports Virtual VMLOAD VMSAVE then enable it + * in VMCB and clear intercepts to avoid #VMEXIT. + */ + if (vls) { + clr_intercept(svm, INTERCEPT_VMLOAD); + clr_intercept(svm, INTERCEPT_VMSAVE); + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + } + mark_all_dirty(svm->vmcb); enable_gif(svm); @@ -1713,11 +1751,24 @@ static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { - return to_svm(vcpu)->vmcb->save.rflags; + struct vcpu_svm *svm = to_svm(vcpu); + unsigned long rflags = svm->vmcb->save.rflags; + + if (svm->nmi_singlestep) { + /* Hide our flags if they were not set by the guest */ + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) + rflags &= ~X86_EFLAGS_TF; + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) + rflags &= ~X86_EFLAGS_RF; + } + return rflags; } static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + if (to_svm(vcpu)->nmi_singlestep) + rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); + /* * Any change of EFLAGS.VM is accompanied by a reload of SS * (caused by either a task switch or an inter-privilege IRET), @@ -2070,34 +2121,11 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) static int pf_interception(struct vcpu_svm *svm) { u64 fault_address = svm->vmcb->control.exit_info_2; - u64 error_code; - int r = 1; - - switch (svm->apf_reason) { - default: - error_code = svm->vmcb->control.exit_info_1; + u64 error_code = svm->vmcb->control.exit_info_1; - trace_kvm_page_fault(fault_address, error_code); - if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) - kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); - r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, + return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, svm->vmcb->control.insn_bytes, - svm->vmcb->control.insn_len); - break; - case KVM_PV_REASON_PAGE_NOT_PRESENT: - svm->apf_reason = 0; - local_irq_disable(); - kvm_async_pf_task_wait(fault_address); - local_irq_enable(); - break; - case KVM_PV_REASON_PAGE_READY: - svm->apf_reason = 0; - local_irq_disable(); - kvm_async_pf_task_wake(fault_address); - local_irq_enable(); - break; - } - return r; + svm->vmcb->control.insn_len, !npt_enabled); } static int db_interception(struct vcpu_svm *svm) @@ -2112,10 +2140,7 @@ static int db_interception(struct vcpu_svm *svm) } if (svm->nmi_singlestep) { - svm->nmi_singlestep = false; - if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) - svm->vmcb->save.rflags &= - ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + disable_nmi_singlestep(svm); } if (svm->vcpu.guest_debug & @@ -2244,7 +2269,7 @@ static int io_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ - int size, in, string; + int size, in, string, ret; unsigned port; ++svm->vcpu.stat.io_exits; @@ -2256,10 +2281,16 @@ static int io_interception(struct vcpu_svm *svm) port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; svm->next_rip = svm->vmcb->control.exit_info_2; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); - return in ? kvm_fast_pio_in(vcpu, size, port) - : kvm_fast_pio_out(vcpu, size, port); + /* + * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + if (in) + return kvm_fast_pio_in(vcpu, size, port) && ret; + else + return kvm_fast_pio_out(vcpu, size, port) && ret; } static int nmi_interception(struct vcpu_svm *svm) @@ -2370,8 +2401,8 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) static int nested_svm_check_permissions(struct vcpu_svm *svm) { - if (!(svm->vcpu.arch.efer & EFER_SVME) - || !is_paging(&svm->vcpu)) { + if (!(svm->vcpu.arch.efer & EFER_SVME) || + !is_paging(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } @@ -2381,7 +2412,7 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) return 1; } - return 0; + return 0; } static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, @@ -2392,15 +2423,19 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, if (!is_guest_mode(&svm->vcpu)) return 0; + vmexit = nested_svm_intercept(svm); + if (vmexit != NESTED_EXIT_DONE) + return 0; + svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = error_code; - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - - vmexit = nested_svm_intercept(svm); - if (vmexit == NESTED_EXIT_DONE) - svm->nested.exit_required = true; + if (svm->vcpu.arch.exception.nested_apf) + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; + else + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + svm->nested.exit_required = true; return vmexit; } @@ -2534,6 +2569,31 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; } +/* DB exceptions for our internal use must not cause vmexit */ +static int nested_svm_intercept_db(struct vcpu_svm *svm) +{ + unsigned long dr6; + + /* if we're not singlestepping, it's not ours */ + if (!svm->nmi_singlestep) + return NESTED_EXIT_DONE; + + /* if it's not a singlestep exception, it's not ours */ + if (kvm_get_dr(&svm->vcpu, 6, &dr6)) + return NESTED_EXIT_DONE; + if (!(dr6 & DR6_BS)) + return NESTED_EXIT_DONE; + + /* if the guest is singlestepping, it should get the vmexit */ + if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) { + disable_nmi_singlestep(svm); + return NESTED_EXIT_DONE; + } + + /* it's ours, the nested hypervisor must not see this one */ + return NESTED_EXIT_HOST; +} + static int nested_svm_exit_special(struct vcpu_svm *svm) { u32 exit_code = svm->vmcb->control.exit_code; @@ -2550,7 +2610,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) break; case SVM_EXIT_EXCP_BASE + PF_VECTOR: /* When we're shadowing, trap PFs, but not async PF */ - if (!npt_enabled && svm->apf_reason == 0) + if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0) return NESTED_EXIT_HOST; break; default: @@ -2589,11 +2649,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm) } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (svm->nested.intercept_exceptions & excp_bits) - vmexit = NESTED_EXIT_DONE; + if (svm->nested.intercept_exceptions & excp_bits) { + if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR) + vmexit = nested_svm_intercept_db(svm); + else + vmexit = NESTED_EXIT_DONE; + } /* async page fault always cause vmexit */ else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && - svm->apf_reason != 0) + svm->vcpu.arch.exception.nested_apf != 0) vmexit = NESTED_EXIT_DONE; break; } @@ -2650,7 +2714,7 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr dst->event_inj = from->event_inj; dst->event_inj_err = from->event_inj_err; dst->nested_cr3 = from->nested_cr3; - dst->lbr_ctl = from->lbr_ctl; + dst->virt_ext = from->virt_ext; } static int nested_svm_vmexit(struct vcpu_svm *svm) @@ -2956,7 +3020,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) /* We don't want to see VMMCALLs from a nested guest */ clr_intercept(svm, INTERCEPT_VMMCALL); - svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; + svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext; svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; svm->vmcb->control.int_state = nested_vmcb->control.int_state; svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; @@ -3003,6 +3067,7 @@ static int vmload_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; struct page *page; + int ret; if (nested_svm_check_permissions(svm)) return 1; @@ -3012,18 +3077,19 @@ static int vmload_interception(struct vcpu_svm *svm) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(nested_vmcb, svm->vmcb); nested_svm_unmap(page); - return 1; + return ret; } static int vmsave_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; struct page *page; + int ret; if (nested_svm_check_permissions(svm)) return 1; @@ -3033,12 +3099,12 @@ static int vmsave_interception(struct vcpu_svm *svm) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(svm->vmcb, nested_vmcb); nested_svm_unmap(page); - return 1; + return ret; } static int vmrun_interception(struct vcpu_svm *svm) @@ -3071,25 +3137,29 @@ failed: static int stgi_interception(struct vcpu_svm *svm) { + int ret; + if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); enable_gif(svm); - return 1; + return ret; } static int clgi_interception(struct vcpu_svm *svm) { + int ret; + if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); disable_gif(svm); @@ -3100,7 +3170,7 @@ static int clgi_interception(struct vcpu_svm *svm) mark_dirty(svm->vmcb, VMCB_INTR); } - return 1; + return ret; } static int invlpga_interception(struct vcpu_svm *svm) @@ -3114,8 +3184,7 @@ static int invlpga_interception(struct vcpu_svm *svm) kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - return 1; + return kvm_skip_emulated_instruction(&svm->vcpu); } static int skinit_interception(struct vcpu_svm *svm) @@ -3138,7 +3207,7 @@ static int xsetbv_interception(struct vcpu_svm *svm) if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + return kvm_skip_emulated_instruction(&svm->vcpu); } return 1; @@ -3234,8 +3303,7 @@ static int invlpg_interception(struct vcpu_svm *svm) return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); - skip_emulated_instruction(&svm->vcpu); - return 1; + return kvm_skip_emulated_instruction(&svm->vcpu); } static int emulate_on_interception(struct vcpu_svm *svm) @@ -3385,9 +3453,7 @@ static int dr_interception(struct vcpu_svm *svm) kvm_register_write(&svm->vcpu, reg, val); } - skip_emulated_instruction(&svm->vcpu); - - return 1; + return kvm_skip_emulated_instruction(&svm->vcpu); } static int cr8_write_interception(struct vcpu_svm *svm) @@ -3510,6 +3576,7 @@ static int rdmsr_interception(struct vcpu_svm *svm) if (svm_get_msr(&svm->vcpu, &msr_info)) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); + return 1; } else { trace_kvm_msr_read(ecx, msr_info.data); @@ -3518,9 +3585,8 @@ static int rdmsr_interception(struct vcpu_svm *svm) kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, msr_info.data >> 32); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - skip_emulated_instruction(&svm->vcpu); + return kvm_skip_emulated_instruction(&svm->vcpu); } - return 1; } static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) @@ -3646,11 +3712,11 @@ static int wrmsr_interception(struct vcpu_svm *svm) if (kvm_set_msr(&svm->vcpu, &msr)) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); + return 1; } else { trace_kvm_msr_write(ecx, data); - skip_emulated_instruction(&svm->vcpu); + return kvm_skip_emulated_instruction(&svm->vcpu); } - return 1; } static int msr_interception(struct vcpu_svm *svm) @@ -3679,8 +3745,7 @@ static int pause_interception(struct vcpu_svm *svm) static int nop_interception(struct vcpu_svm *svm) { - skip_emulated_instruction(&(svm->vcpu)); - return 1; + return kvm_skip_emulated_instruction(&(svm->vcpu)); } static int monitor_interception(struct vcpu_svm *svm) @@ -4065,7 +4130,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); pr_err("%-20s%08x\n", "event_inj:", control->event_inj); pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); - pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); + pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext); pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); @@ -4627,10 +4692,17 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) == HF_NMI_MASK) return; /* IRET will cause a vm exit */ + if ((svm->vcpu.arch.hflags & HF_GIF_MASK) == 0) + return; /* STGI will cause a vm exit */ + + if (svm->nested.exit_required) + return; /* we're not going to run the guest yet */ + /* * Something prevents NMI from been injected. Single step over possible * problem (IRET or exception injection or interrupt shadow) */ + svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu); svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); } @@ -4771,6 +4843,22 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->nested.exit_required)) return; + /* + * Disable singlestep if we're injecting an interrupt/exception. + * We don't want our modified rflags to be pushed on the stack where + * we might not be able to easily reset them if we disabled NMI + * singlestep later. + */ + if (svm->nmi_singlestep && svm->vmcb->control.event_inj) { + /* + * Event injection happens before external interrupts cause a + * vmexit and interrupts are disabled here, so smp_send_reschedule + * is enough to force an immediate vmexit. + */ + disable_nmi_singlestep(svm); + smp_send_reschedule(vcpu->cpu); + } + pre_svm_run(svm); sync_lapic_to_cr8(vcpu); @@ -4890,7 +4978,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) /* if exit due to PF check for async PF */ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) - svm->apf_reason = kvm_read_and_reset_pf_reason(); + svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); if (npt_enabled) { vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6dcc4873e435..29fd8af5c347 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -198,7 +198,8 @@ struct loaded_vmcs { struct vmcs *vmcs; struct vmcs *shadow_vmcs; int cpu; - int launched; + bool launched; + bool nmi_known_unmasked; struct list_head loaded_vmcss_on_cpu_link; }; @@ -913,8 +914,9 @@ static void nested_release_page_clean(struct page *page) kvm_release_page_clean(page); } +static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); -static u64 construct_eptp(unsigned long root_hpa); +static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); static bool vmx_xsaves_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, @@ -2325,6 +2327,11 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) __vmx_load_host_state(to_vmx(vcpu)); } +static bool emulation_required(struct kvm_vcpu *vcpu) +{ + return emulate_invalid_guest_state && !guest_state_valid(vcpu); +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); /* @@ -2362,6 +2369,8 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + unsigned long old_rflags = vmx_get_rflags(vcpu); + __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); to_vmx(vcpu)->rflags = rflags; if (to_vmx(vcpu)->rmode.vm86_active) { @@ -2369,6 +2378,9 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; } vmcs_writel(GUEST_RFLAGS, rflags); + + if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM) + to_vmx(vcpu)->emulation_required = emulation_required(vcpu); } static u32 vmx_get_pkru(struct kvm_vcpu *vcpu) @@ -2421,28 +2433,41 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) * KVM wants to inject page-faults which it got to the guest. This function * checks whether in a nested guest, we need to inject them to L1 or L2. */ -static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) +static int nested_vmx_check_exception(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned int nr = vcpu->arch.exception.nr; - if (!(vmcs12->exception_bitmap & (1u << nr))) + if (!((vmcs12->exception_bitmap & (1u << nr)) || + (nr == PF_VECTOR && vcpu->arch.exception.nested_apf))) return 0; + if (vcpu->arch.exception.nested_apf) { + vmcs_write32(VM_EXIT_INTR_ERROR_CODE, vcpu->arch.exception.error_code); + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | + INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, + vcpu->arch.apf.nested_apf_token); + return 1; + } + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, vmcs_read32(VM_EXIT_INTR_INFO), vmcs_readl(EXIT_QUALIFICATION)); return 1; } -static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code, - bool reinject) +static void vmx_queue_exception(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned nr = vcpu->arch.exception.nr; + bool has_error_code = vcpu->arch.exception.has_error_code; + bool reinject = vcpu->arch.exception.reinject; + u32 error_code = vcpu->arch.exception.error_code; u32 intr_info = nr | INTR_INFO_VALID_MASK; if (!reinject && is_guest_mode(vcpu) && - nested_vmx_check_exception(vcpu, nr)) + nested_vmx_check_exception(vcpu)) return; if (has_error_code) { @@ -2772,7 +2797,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) if (enable_ept_ad_bits) { vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_PML; - vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; + vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; } } else vmx->nested.nested_vmx_ept_caps = 0; @@ -3198,7 +3223,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu))) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; @@ -3280,7 +3306,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu))) + return 1; + if (is_noncanonical_address(data & PAGE_MASK) || + (data & MSR_IA32_BNDCFGS_RSVD)) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; @@ -3758,6 +3788,25 @@ static void free_kvm_area(void) } } +enum vmcs_field_type { + VMCS_FIELD_TYPE_U16 = 0, + VMCS_FIELD_TYPE_U64 = 1, + VMCS_FIELD_TYPE_U32 = 2, + VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 +}; + +static inline int vmcs_field_type(unsigned long field) +{ + if (0x1 & field) /* the *_HIGH fields are all 32 bit */ + return VMCS_FIELD_TYPE_U32; + return (field >> 13) & 0x3 ; +} + +static inline int vmcs_field_readonly(unsigned long field) +{ + return (((field >> 10) & 0x3) == 1); +} + static void init_vmcs_shadow_fields(void) { int i, j; @@ -3783,14 +3832,22 @@ static void init_vmcs_shadow_fields(void) /* shadowed fields guest access without vmexit */ for (i = 0; i < max_shadow_read_write_fields; i++) { - clear_bit(shadow_read_write_fields[i], - vmx_vmwrite_bitmap); - clear_bit(shadow_read_write_fields[i], - vmx_vmread_bitmap); + unsigned long field = shadow_read_write_fields[i]; + + clear_bit(field, vmx_vmwrite_bitmap); + clear_bit(field, vmx_vmread_bitmap); + if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) { + clear_bit(field + 1, vmx_vmwrite_bitmap); + clear_bit(field + 1, vmx_vmread_bitmap); + } + } + for (i = 0; i < max_shadow_read_only_fields; i++) { + unsigned long field = shadow_read_only_fields[i]; + + clear_bit(field, vmx_vmread_bitmap); + if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) + clear_bit(field + 1, vmx_vmread_bitmap); } - for (i = 0; i < max_shadow_read_only_fields; i++) - clear_bit(shadow_read_only_fields[i], - vmx_vmread_bitmap); } static __init int alloc_kvm_area(void) @@ -3811,11 +3868,6 @@ static __init int alloc_kvm_area(void) return 0; } -static bool emulation_required(struct kvm_vcpu *vcpu) -{ - return emulate_invalid_guest_state && !guest_state_valid(vcpu); -} - static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) { @@ -4013,7 +4065,7 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) if (enable_ept) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); + ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa)); } else { vpid_sync_context(vpid); } @@ -4188,14 +4240,15 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx->emulation_required = emulation_required(vcpu); } -static u64 construct_eptp(unsigned long root_hpa) +static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) { u64 eptp; /* TODO write the value reading from MSR */ eptp = VMX_EPT_DEFAULT_MT | VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; - if (enable_ept_ad_bits) + if (enable_ept_ad_bits && + (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) eptp |= VMX_EPT_AD_ENABLE_BIT; eptp |= (root_hpa & PAGE_MASK); @@ -4209,7 +4262,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) guest_cr3 = cr3; if (enable_ept) { - eptp = construct_eptp(cr3); + eptp = construct_eptp(vcpu, cr3); vmcs_write64(EPT_POINTER, eptp); if (is_paging(vcpu) || is_guest_mode(vcpu)) guest_cr3 = kvm_read_cr3(vcpu); @@ -4627,6 +4680,11 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu) return true; } +static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); +} + static int init_rmode_tss(struct kvm *kvm) { gfn_t fn; @@ -5170,7 +5228,8 @@ static void ept_set_mmio_spte_mask(void) * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). */ - kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE); + kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK, + VMX_EPT_MISCONFIG_WX_VALUE); } #define VMX_XSS_EXIT_BITMAP 0 @@ -5457,10 +5516,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!is_guest_mode(vcpu)) { - ++vcpu->stat.nmi_injections; - vmx->nmi_known_unmasked = false; - } + ++vcpu->stat.nmi_injections; + vmx->loaded_vmcs->nmi_known_unmasked = false; if (vmx->rmode.vm86_active) { if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) @@ -5474,16 +5531,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { - if (to_vmx(vcpu)->nmi_known_unmasked) + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool masked; + + if (vmx->loaded_vmcs->nmi_known_unmasked) return false; - return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; + masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; + vmx->loaded_vmcs->nmi_known_unmasked = !masked; + return masked; } static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) { struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx->nmi_known_unmasked = !masked; + vmx->loaded_vmcs->nmi_known_unmasked = !masked; if (masked) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -5656,14 +5718,11 @@ static int handle_exception(struct kvm_vcpu *vcpu) } if (is_page_fault(intr_info)) { - /* EPT won't cause page fault directly */ - BUG_ON(enable_ept); cr2 = vmcs_readl(EXIT_QUALIFICATION); - trace_kvm_page_fault(cr2, error_code); - - if (kvm_event_needs_reinjection(vcpu)) - kvm_mmu_unprotect_page_virt(vcpu, cr2); - return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); + /* EPT won't cause page fault directly */ + WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0, + true); } ex_no = intr_info & INTR_INFO_VECTOR_MASK; @@ -6220,17 +6279,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - if (is_guest_mode(vcpu) - && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) { - /* - * Fix up exit_qualification according to whether guest - * page table accesses are reads or writes. - */ - u64 eptp = nested_ept_get_cr3(vcpu); - if (!(eptp & VMX_EPT_AD_ENABLE_BIT)) - exit_qualification &= ~EPT_VIOLATION_ACC_WRITE; - } - /* * EPT violation happened while executing iret from NMI, * "blocked by NMI" bit has to be set before next VM entry. @@ -6453,7 +6501,7 @@ void vmx_enable_tdp(void) enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK, cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, - enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK); + VMX_EPT_RWX_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); @@ -6557,7 +6605,6 @@ static __init int hardware_setup(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); memcpy(vmx_msr_bitmap_legacy_x2apic_apicv, vmx_msr_bitmap_legacy, PAGE_SIZE); @@ -7218,25 +7265,6 @@ static int handle_vmresume(struct kvm_vcpu *vcpu) return nested_vmx_run(vcpu, false); } -enum vmcs_field_type { - VMCS_FIELD_TYPE_U16 = 0, - VMCS_FIELD_TYPE_U64 = 1, - VMCS_FIELD_TYPE_U32 = 2, - VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 -}; - -static inline int vmcs_field_type(unsigned long field) -{ - if (0x1 & field) /* the *_HIGH fields are all 32 bit */ - return VMCS_FIELD_TYPE_U32; - return (field >> 13) & 0x3 ; -} - -static inline int vmcs_field_readonly(unsigned long field) -{ - return (((field >> 10) & 0x3) == 1); -} - /* * Read a vmcs12 field. Since these can have varying lengths and we return * one type, we chose the biggest type (u64) and zero-extend the return value @@ -7661,7 +7689,10 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) unsigned long type, types; gva_t gva; struct x86_exception e; - int vpid; + struct { + u64 vpid; + u64 gla; + } operand; if (!(vmx->nested.nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || @@ -7691,17 +7722,28 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), vmx_instruction_info, false, &gva)) return 1; - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid, - sizeof(u32), &e)) { + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, + sizeof(operand), &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } + if (operand.vpid >> 16) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return kvm_skip_emulated_instruction(vcpu); + } switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: + if (is_noncanonical_address(operand.gla)) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return kvm_skip_emulated_instruction(vcpu); + } + /* fall through */ case VMX_VPID_EXTENT_SINGLE_CONTEXT: case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: - if (!vpid) { + if (!operand.vpid) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); return kvm_skip_emulated_instruction(vcpu); @@ -8004,7 +8046,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) if (is_nmi(intr_info)) return false; else if (is_page_fault(intr_info)) - return enable_ept; + return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; else if (is_no_device(intr_info) && !(vmcs12->guest_cr0 & X86_CR0_TS)) return false; @@ -8408,9 +8450,15 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) exit_reason != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; - vcpu->run->internal.ndata = 2; + vcpu->run->internal.ndata = 3; vcpu->run->internal.data[0] = vectoring_info; vcpu->run->internal.data[1] = exit_reason; + vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; + if (exit_reason == EXIT_REASON_EPT_MISCONFIG) { + vcpu->run->internal.ndata++; + vcpu->run->internal.data[3] = + vmcs_read64(GUEST_PHYSICAL_ADDRESS); + } return 0; } @@ -8601,17 +8649,24 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { - u32 exit_intr_info; + u32 exit_intr_info = 0; + u16 basic_exit_reason = (u16)vmx->exit_reason; - if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY - || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) + if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY + || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI)) return; - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - exit_intr_info = vmx->exit_intr_info; + if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + vmx->exit_intr_info = exit_intr_info; + + /* if exit due to PF check for async PF */ + if (is_page_fault(exit_intr_info)) + vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); /* Handle machine checks before interrupts are enabled */ - if (is_machine_check(exit_intr_info)) + if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY || + is_machine_check(exit_intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ @@ -8690,7 +8745,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; - if (vmx->nmi_known_unmasked) + if (vmx->loaded_vmcs->nmi_known_unmasked) return; /* * Can't use vmx->exit_intr_info since we're not sure what @@ -8714,7 +8769,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); else - vmx->nmi_known_unmasked = + vmx->loaded_vmcs->nmi_known_unmasked = !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI); } @@ -9394,6 +9449,11 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, vmcs12->guest_physical_address = fault->address; } +static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) +{ + return nested_ept_get_cr3(vcpu) & VMX_EPT_AD_ENABLE_BIT; +} + /* Callbacks for nested_ept_init_mmu_context: */ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) @@ -9404,18 +9464,18 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { - u64 eptp; + bool wants_ad; WARN_ON(mmu_is_nested(vcpu)); - eptp = nested_ept_get_cr3(vcpu); - if ((eptp & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits) + wants_ad = nested_ept_ad_enabled(vcpu); + if (wants_ad && !enable_ept_ad_bits) return 1; kvm_mmu_unload(vcpu); kvm_init_shadow_ept_mmu(vcpu, to_vmx(vcpu)->nested.nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT, - eptp & VMX_EPT_AD_ENABLE_BIT); + wants_ad); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; @@ -9574,23 +9634,26 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); } +static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) || + !page_address_valid(vcpu, vmcs12->io_bitmap_b)) + return -EINVAL; + + return 0; +} + static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - int maxphyaddr; - u64 addr; - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return 0; - if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) { - WARN_ON(1); - return -EINVAL; - } - maxphyaddr = cpuid_maxphyaddr(vcpu); - - if (!PAGE_ALIGNED(vmcs12->msr_bitmap) || - ((addr + PAGE_SIZE) >> maxphyaddr)) + if (!page_address_valid(vcpu, vmcs12->msr_bitmap)) return -EINVAL; return 0; @@ -10278,6 +10341,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; @@ -10414,8 +10480,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) return 1; } - vmcs12->launch_state = 1; - /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet @@ -10433,6 +10497,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) { struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); u32 exit_qual; int ret; @@ -10457,6 +10522,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * for misconfigurations which will anyway be caught by the processor * when using the merged vmcs02. */ + if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) { + nested_vmx_failValid(vcpu, + VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); + goto out; + } + if (vmcs12->launch_state == launch) { nested_vmx_failValid(vcpu, launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS @@ -10728,8 +10799,7 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); } - if (nested_cpu_has_ept(vmcs12)) - vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); if (nested_cpu_has_vid(vmcs12)) vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); @@ -10754,8 +10824,6 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); if (kvm_mpx_supported()) vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); - if (nested_cpu_has_xsaves(vmcs12)) - vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); } /* @@ -10792,6 +10860,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { + vmcs12->launch_state = 1; + /* vm_entry_intr_info_field is cleared on exit. Emulate this * instead of reading the real value. */ vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; @@ -11152,7 +11222,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) vmx->hv_deadline_tsc = tscl + delta_tsc; vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_VMX_PREEMPTION_TIMER); - return 0; + + return delta_tsc == 0; } static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0e846f0cb83b..82a63c59f77b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -134,8 +134,6 @@ module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); static bool __read_mostly vector_hashing = true; module_param(vector_hashing, bool, S_IRUGO); -static bool __read_mostly backwards_tsc_observed = false; - #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -452,7 +450,12 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { ++vcpu->stat.pf_guest; - vcpu->arch.cr2 = fault->address; + vcpu->arch.exception.nested_apf = + is_guest_mode(vcpu) && fault->async_page_fault; + if (vcpu->arch.exception.nested_apf) + vcpu->arch.apf.nested_apf_token = fault->address; + else + vcpu->arch.cr2 = fault->address; kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); @@ -594,8 +597,8 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_avail)) return true; - gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT; - offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1); + gfn = (kvm_read_cr3(vcpu) & ~31ul) >> PAGE_SHIFT; + offset = (kvm_read_cr3(vcpu) & ~31ul) & (PAGE_SIZE - 1); r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), PFERR_USER_MASK | PFERR_WRITE_MASK); if (r < 0) @@ -1719,7 +1722,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_cycle_now); ka->use_master_clock = host_tsc_clocksource && vcpus_matched - && !backwards_tsc_observed + && !ka->backwards_tsc_observed && !ka->boot_vcpu_runs_old_kvmclock; if (ka->use_master_clock) @@ -2060,8 +2063,8 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; - /* Bits 2:5 are reserved, Should be zero */ - if (data & 0x3c) + /* Bits 3:5 are reserved, Should be zero */ + if (data & 0x38) return 1; vcpu->arch.apf.msr_val = data; @@ -2077,6 +2080,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 1; vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); + vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; kvm_async_pf_wakeup_all(vcpu); return 0; } @@ -2661,6 +2665,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_VAPIC: case KVM_CAP_HYPERV_SPIN: case KVM_CAP_HYPERV_SYNIC: + case KVM_CAP_HYPERV_SYNIC2: + case KVM_CAP_HYPERV_VP_INDEX: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -2841,10 +2847,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_vcpu_write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } - if (kvm_lapic_hv_timer_in_use(vcpu) && - kvm_x86_ops->set_hv_timer(vcpu, - kvm_get_lapic_target_expiration_tsc(vcpu))) - kvm_lapic_switch_to_sw_timer(vcpu); + + if (kvm_lapic_hv_timer_in_use(vcpu)) + kvm_lapic_restart_hv_timer(vcpu); + /* * On a host with synchronized TSC, there is no need to update * kvmclock on vcpu->cpu migration @@ -3384,10 +3390,14 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return -EINVAL; switch (cap->cap) { + case KVM_CAP_HYPERV_SYNIC2: + if (cap->args[0]) + return -EINVAL; case KVM_CAP_HYPERV_SYNIC: if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; - return kvm_hv_activate_synic(vcpu); + return kvm_hv_activate_synic(vcpu, cap->cap == + KVM_CAP_HYPERV_SYNIC2); default: return -EINVAL; } @@ -4188,9 +4198,15 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; + /* + * TODO: userspace has to take care of races with VCPU_RUN, so + * kvm_gen_update_masterclock() can be cut down to locked + * pvclock_update_vm_gtod_copy(). + */ + kvm_gen_update_masterclock(kvm); now_ns = get_kvmclock_ns(kvm); kvm->arch.kvmclock_offset += user_ns.clock - now_ns; - kvm_gen_update_masterclock(kvm); + kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); break; } case KVM_GET_CLOCK: { @@ -6011,7 +6027,7 @@ static void kvm_set_mmio_spte_mask(void) mask &= ~1ull; #endif - kvm_mmu_set_mmio_spte_mask(mask); + kvm_mmu_set_mmio_spte_mask(mask, mask); } #ifdef CONFIG_X86_64 @@ -6347,10 +6363,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) kvm_update_dr7(vcpu); } - kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code, - vcpu->arch.exception.reinject); + kvm_x86_ops->queue_exception(vcpu); return 0; } @@ -6733,7 +6746,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; - if (vcpu->requests) { + if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) @@ -6897,7 +6910,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->sync_pir_to_irr(vcpu); } - if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests + if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || need_resched() || signal_pending(current)) { vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); @@ -7676,6 +7689,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) struct msr_data msr; struct kvm *kvm = vcpu->kvm; + kvm_hv_vcpu_postcreate(vcpu); + if (vcpu_load(vcpu)) return; msr.data = 0x0; @@ -7829,8 +7844,8 @@ int kvm_arch_hardware_enable(void) */ if (backwards_tsc) { u64 delta_cyc = max_tsc - local_tsc; - backwards_tsc_observed = true; list_for_each_entry(kvm, &vm_list, vm_list) { + kvm->arch.backwards_tsc_observed = true; kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.tsc_offset_adjustment += delta_cyc; vcpu->arch.last_host_tsc = local_tsc; @@ -8576,6 +8591,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, fault.error_code = 0; fault.nested_page_fault = false; fault.address = work->arch.token; + fault.async_page_fault = true; kvm_inject_page_fault(vcpu, &fault); } } @@ -8598,6 +8614,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, fault.error_code = 0; fault.nested_page_fault = false; fault.address = work->arch.token; + fault.async_page_fault = true; kvm_inject_page_fault(vcpu, &fault); } vcpu->arch.apf.halted = false; diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index cad12634d6bd..2eab7d0bfedd 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -6,7 +6,7 @@ __visible void *memcpy(void *to, const void *from, size_t n) { -#ifdef CONFIG_X86_USE_3DNOW +#if defined(CONFIG_X86_USE_3DNOW) && !defined(CONFIG_FORTIFY_SOURCE) return __memcpy3d(to, from, n); #else return __memcpy(to, from, n); diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 3b7c40a2e3e1..75d3776123cc 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -7,6 +7,7 @@ */ #include <linux/export.h> #include <linux/uaccess.h> +#include <linux/highmem.h> /* * Zero Userspace @@ -73,3 +74,136 @@ copy_user_handle_tail(char *to, char *from, unsigned len) clac(); return len; } + +#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE +/** + * clean_cache_range - write back a cache range with CLWB + * @vaddr: virtual start address + * @size: number of bytes to write back + * + * Write back a cache range using the CLWB (cache line write back) + * instruction. Note that @size is internally rounded up to be cache + * line size aligned. + */ +static void clean_cache_range(void *addr, size_t size) +{ + u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; + unsigned long clflush_mask = x86_clflush_size - 1; + void *vend = addr + size; + void *p; + + for (p = (void *)((unsigned long)addr & ~clflush_mask); + p < vend; p += x86_clflush_size) + clwb(p); +} + +void arch_wb_cache_pmem(void *addr, size_t size) +{ + clean_cache_range(addr, size); +} +EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); + +long __copy_user_flushcache(void *dst, const void __user *src, unsigned size) +{ + unsigned long flushed, dest = (unsigned long) dst; + long rc = __copy_user_nocache(dst, src, size, 0); + + /* + * __copy_user_nocache() uses non-temporal stores for the bulk + * of the transfer, but we need to manually flush if the + * transfer is unaligned. A cached memory copy is used when + * destination or size is not naturally aligned. That is: + * - Require 8-byte alignment when size is 8 bytes or larger. + * - Require 4-byte alignment when size is 4 bytes. + */ + if (size < 8) { + if (!IS_ALIGNED(dest, 4) || size != 4) + clean_cache_range(dst, 1); + } else { + if (!IS_ALIGNED(dest, 8)) { + dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); + clean_cache_range(dst, 1); + } + + flushed = dest - (unsigned long) dst; + if (size > flushed && !IS_ALIGNED(size - flushed, 8)) + clean_cache_range(dst + size - 1, 1); + } + + return rc; +} + +void memcpy_flushcache(void *_dst, const void *_src, size_t size) +{ + unsigned long dest = (unsigned long) _dst; + unsigned long source = (unsigned long) _src; + + /* cache copy and flush to align dest */ + if (!IS_ALIGNED(dest, 8)) { + unsigned len = min_t(unsigned, size, ALIGN(dest, 8) - dest); + + memcpy((void *) dest, (void *) source, len); + clean_cache_range((void *) dest, len); + dest += len; + source += len; + size -= len; + if (!size) + return; + } + + /* 4x8 movnti loop */ + while (size >= 32) { + asm("movq (%0), %%r8\n" + "movq 8(%0), %%r9\n" + "movq 16(%0), %%r10\n" + "movq 24(%0), %%r11\n" + "movnti %%r8, (%1)\n" + "movnti %%r9, 8(%1)\n" + "movnti %%r10, 16(%1)\n" + "movnti %%r11, 24(%1)\n" + :: "r" (source), "r" (dest) + : "memory", "r8", "r9", "r10", "r11"); + dest += 32; + source += 32; + size -= 32; + } + + /* 1x8 movnti loop */ + while (size >= 8) { + asm("movq (%0), %%r8\n" + "movnti %%r8, (%1)\n" + :: "r" (source), "r" (dest) + : "memory", "r8"); + dest += 8; + source += 8; + size -= 8; + } + + /* 1x4 movnti loop */ + while (size >= 4) { + asm("movl (%0), %%r8d\n" + "movnti %%r8d, (%1)\n" + :: "r" (source), "r" (dest) + : "memory", "r8"); + dest += 4; + source += 4; + size -= 4; + } + + /* cache copy for remaining bytes */ + if (size) { + memcpy((void *) dest, (void *) source, size); + clean_cache_range((void *) dest, size); + } +} +EXPORT_SYMBOL_GPL(memcpy_flushcache); + +void memcpy_page_flushcache(char *to, struct page *page, size_t offset, + size_t len) +{ + char *from = kmap_atomic(page); + + memcpy_flushcache(to, from + offset, len); + kunmap_atomic(from); +} +#endif diff --git a/arch/x86/math-emu/Makefile b/arch/x86/math-emu/Makefile index 9b0c63b60302..1b2dac174321 100644 --- a/arch/x86/math-emu/Makefile +++ b/arch/x86/math-emu/Makefile @@ -5,8 +5,8 @@ #DEBUG = -DDEBUGGING DEBUG = PARANOID = -DPARANOID -EXTRA_CFLAGS := $(PARANOID) $(DEBUG) -fno-builtin $(MATH_EMULATION) -EXTRA_AFLAGS := $(PARANOID) +ccflags-y += $(PARANOID) $(DEBUG) -fno-builtin $(MATH_EMULATION) +asflags-y += $(PARANOID) # From 'C' language sources: C_OBJS =fpu_entry.o errors.o \ diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h index afbc4d805d66..c9c320dccca1 100644 --- a/arch/x86/math-emu/fpu_emu.h +++ b/arch/x86/math-emu/fpu_emu.h @@ -157,7 +157,7 @@ extern u_char const data_sizes_16[32]; #define signbyte(a) (((u_char *)(a))[9]) #define getsign(a) (signbyte(a) & 0x80) -#define setsign(a,b) { if (b) signbyte(a) |= 0x80; else signbyte(a) &= 0x7f; } +#define setsign(a,b) { if ((b) != 0) signbyte(a) |= 0x80; else signbyte(a) &= 0x7f; } #define copysign(a,b) { if (getsign(a)) signbyte(b) |= 0x80; \ else signbyte(b) &= 0x7f; } #define changesign(a) { signbyte(a) ^= 0x80; } diff --git a/arch/x86/math-emu/reg_compare.c b/arch/x86/math-emu/reg_compare.c index b77360fdbf4a..19b33b50adfa 100644 --- a/arch/x86/math-emu/reg_compare.c +++ b/arch/x86/math-emu/reg_compare.c @@ -168,7 +168,7 @@ static int compare(FPU_REG const *b, int tagb) /* This function requires that st(0) is not empty */ int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag) { - int f = 0, c; + int f, c; c = compare(loaded_data, loaded_tag); @@ -189,12 +189,12 @@ int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag) case COMP_No_Comp: f = SW_C3 | SW_C2 | SW_C0; break; -#ifdef PARANOID default: +#ifdef PARANOID EXCEPTION(EX_INTERNAL | 0x121); +#endif /* PARANOID */ f = SW_C3 | SW_C2 | SW_C0; break; -#endif /* PARANOID */ } setcc(f); if (c & COMP_Denormal) { @@ -205,7 +205,7 @@ int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag) static int compare_st_st(int nr) { - int f = 0, c; + int f, c; FPU_REG *st_ptr; if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) { @@ -235,12 +235,12 @@ static int compare_st_st(int nr) case COMP_No_Comp: f = SW_C3 | SW_C2 | SW_C0; break; -#ifdef PARANOID default: +#ifdef PARANOID EXCEPTION(EX_INTERNAL | 0x122); +#endif /* PARANOID */ f = SW_C3 | SW_C2 | SW_C0; break; -#endif /* PARANOID */ } setcc(f); if (c & COMP_Denormal) { @@ -283,12 +283,12 @@ static int compare_i_st_st(int nr) case COMP_No_Comp: f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF; break; -#ifdef PARANOID default: +#ifdef PARANOID EXCEPTION(EX_INTERNAL | 0x122); +#endif /* PARANOID */ f = 0; break; -#endif /* PARANOID */ } FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f; if (c & COMP_Denormal) { diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index adad702b39cd..2824607df108 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -33,7 +33,7 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) if (!vma || !is_vm_hugetlb_page(vma)) return ERR_PTR(-EINVAL); - pte = huge_pte_offset(mm, address); + pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); /* hugetlb should be locked, and hence, prefaulted */ WARN_ON(!pte || pte_none(*pte)); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 99fb83819a5f..8a64a6f2848d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -823,15 +823,12 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { - struct pglist_data *pgdata = NODE_DATA(nid); - struct zone *zone = pgdata->node_zones + - zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, zone, start_pfn, nr_pages); + return __add_pages(nid, start_pfn, nr_pages, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index dae6a5e5ad4a..136422d7d539 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -772,22 +772,15 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -/* - * Memory is added always to NORMAL zone. This means you will never get - * additional DMA/DMA32 memory. - */ -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; init_memory_mapping(start, start + size); - ret = __add_pages(nid, zone, start_pfn, nr_pages); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 88215ac16b24..02c9d7553409 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -23,12 +23,7 @@ static int __init map_range(struct range *range) start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); - /* - * end + 1 here is intentional. We check several shadow bytes in advance - * to slightly speed up fastpath. In some rare cases we could cross - * boundary of mapped shadow, so we just map some more here. - */ - return vmemmap_populate(start, end + 1, NUMA_NO_NODE); + return vmemmap_populate(start, end, NUMA_NO_NODE); } static void __init clear_pgds(unsigned long start, diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 797295e792b2..229d04a83f85 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -92,13 +92,18 @@ unsigned long arch_mmap_rnd(void) static unsigned long mmap_base(unsigned long rnd, unsigned long task_size) { unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; unsigned long gap_min, gap_max; + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; + /* * Top of mmap area (just below the process stack). * Leave an at least ~128 MB hole with possible stack randomization. */ - gap_min = SIZE_128M + stack_maxrandom_size(task_size); + gap_min = SIZE_128M; gap_max = (task_size / 6) * 5; if (gap < gap_min) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index c8520b2c62d2..757b0bcdf712 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -150,6 +150,12 @@ void clflush_cache_range(void *vaddr, unsigned int size) } EXPORT_SYMBOL_GPL(clflush_cache_range); +void arch_invalidate_pmem(void *addr, size_t size) +{ + clflush_cache_range(addr, size); +} +EXPORT_SYMBOL_GPL(arch_invalidate_pmem); + static void __cpa_flush_all(void *arg) { unsigned long cache = (unsigned long)arg; diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 9b78685b66e6..45979502f64b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -37,14 +37,14 @@ #undef pr_fmt #define pr_fmt(fmt) "" fmt -static bool boot_cpu_done; - -static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); -static void init_cache_modes(void); +static bool __read_mostly boot_cpu_done; +static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT); +static bool __read_mostly pat_initialized; +static bool __read_mostly init_cm_done; void pat_disable(const char *reason) { - if (!__pat_enabled) + if (pat_disabled) return; if (boot_cpu_done) { @@ -52,10 +52,8 @@ void pat_disable(const char *reason) return; } - __pat_enabled = 0; + pat_disabled = true; pr_info("x86/PAT: %s\n", reason); - - init_cache_modes(); } static int __init nopat(char *str) @@ -67,7 +65,7 @@ early_param("nopat", nopat); bool pat_enabled(void) { - return !!__pat_enabled; + return pat_initialized; } EXPORT_SYMBOL_GPL(pat_enabled); @@ -205,6 +203,8 @@ static void __init_cache_modes(u64 pat) update_cache_mode_entry(i, cache); } pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg); + + init_cm_done = true; } #define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) @@ -225,6 +225,7 @@ static void pat_bsp_init(u64 pat) } wrmsrl(MSR_IA32_CR_PAT, pat); + pat_initialized = true; __init_cache_modes(pat); } @@ -242,10 +243,9 @@ static void pat_ap_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); } -static void init_cache_modes(void) +void init_cache_modes(void) { u64 pat = 0; - static int init_cm_done; if (init_cm_done) return; @@ -287,8 +287,6 @@ static void init_cache_modes(void) } __init_cache_modes(pat); - - init_cm_done = 1; } /** @@ -306,10 +304,8 @@ void pat_init(void) u64 pat; struct cpuinfo_x86 *c = &boot_cpu_data; - if (!pat_enabled()) { - init_cache_modes(); + if (pat_disabled) return; - } if ((c->x86_vendor == X86_VENDOR_INTEL) && (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S index f2a7faf4706e..b33093f84528 100644 --- a/arch/x86/net/bpf_jit.S +++ b/arch/x86/net/bpf_jit.S @@ -19,9 +19,6 @@ */ #define SKBDATA %r10 #define SKF_MAX_NEG_OFF $(-0x200000) /* SKF_LL_OFF from filter.h */ -#define MAX_BPF_STACK (512 /* from filter.h */ + \ - 32 /* space for rbx,r13,r14,r15 */ + \ - 8 /* space for skb_copy_bits */) #define FUNC(name) \ .globl name; \ @@ -66,7 +63,7 @@ FUNC(sk_load_byte_positive_offset) /* rsi contains offset and can be scratched */ #define bpf_slow_path_common(LEN) \ - lea -MAX_BPF_STACK + 32(%rbp), %rdx;\ + lea 32(%rbp), %rdx;\ FRAME_BEGIN; \ mov %rbx, %rdi; /* arg1 == skb */ \ push %r9; \ @@ -83,14 +80,14 @@ FUNC(sk_load_byte_positive_offset) bpf_slow_path_word: bpf_slow_path_common(4) js bpf_error - mov - MAX_BPF_STACK + 32(%rbp),%eax + mov 32(%rbp),%eax bswap %eax ret bpf_slow_path_half: bpf_slow_path_common(2) js bpf_error - mov - MAX_BPF_STACK + 32(%rbp),%ax + mov 32(%rbp),%ax rol $8,%ax movzwl %ax,%eax ret @@ -98,7 +95,7 @@ bpf_slow_path_half: bpf_slow_path_byte: bpf_slow_path_common(1) js bpf_error - movzbl - MAX_BPF_STACK + 32(%rbp),%eax + movzbl 32(%rbp),%eax ret #define sk_negative_common(SIZE) \ @@ -148,9 +145,10 @@ FUNC(sk_load_byte_negative_offset) bpf_error: # force a return 0 from jit handler xor %eax,%eax - mov - MAX_BPF_STACK(%rbp),%rbx - mov - MAX_BPF_STACK + 8(%rbp),%r13 - mov - MAX_BPF_STACK + 16(%rbp),%r14 - mov - MAX_BPF_STACK + 24(%rbp),%r15 + mov (%rbp),%rbx + mov 8(%rbp),%r13 + mov 16(%rbp),%r14 + mov 24(%rbp),%r15 + add $40, %rbp leaveq ret diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f58939393eef..e1324f280e06 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -197,17 +197,16 @@ struct jit_context { #define BPF_MAX_INSN_SIZE 128 #define BPF_INSN_SAFETY 64 -#define STACKSIZE \ - (MAX_BPF_STACK + \ - 32 /* space for rbx, r13, r14, r15 */ + \ +#define AUX_STACK_SPACE \ + (32 /* space for rbx, r13, r14, r15 */ + \ 8 /* space for skb_copy_bits() buffer */) -#define PROLOGUE_SIZE 48 +#define PROLOGUE_SIZE 37 /* emit x64 prologue code for BPF program and check it's size. * bpf_tail_call helper will skip it while jumping into another program */ -static void emit_prologue(u8 **pprog) +static void emit_prologue(u8 **pprog, u32 stack_depth) { u8 *prog = *pprog; int cnt = 0; @@ -215,13 +214,17 @@ static void emit_prologue(u8 **pprog) EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */ - /* sub rsp, STACKSIZE */ - EMIT3_off32(0x48, 0x81, 0xEC, STACKSIZE); + /* sub rsp, rounded_stack_depth + AUX_STACK_SPACE */ + EMIT3_off32(0x48, 0x81, 0xEC, + round_up(stack_depth, 8) + AUX_STACK_SPACE); + + /* sub rbp, AUX_STACK_SPACE */ + EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE); /* all classic BPF filters use R6(rbx) save it */ - /* mov qword ptr [rbp-X],rbx */ - EMIT3_off32(0x48, 0x89, 0x9D, -STACKSIZE); + /* mov qword ptr [rbp+0],rbx */ + EMIT4(0x48, 0x89, 0x5D, 0); /* bpf_convert_filter() maps classic BPF register X to R7 and uses R8 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and @@ -231,12 +234,12 @@ static void emit_prologue(u8 **pprog) * than synthetic ones. Therefore not worth adding complexity. */ - /* mov qword ptr [rbp-X],r13 */ - EMIT3_off32(0x4C, 0x89, 0xAD, -STACKSIZE + 8); - /* mov qword ptr [rbp-X],r14 */ - EMIT3_off32(0x4C, 0x89, 0xB5, -STACKSIZE + 16); - /* mov qword ptr [rbp-X],r15 */ - EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24); + /* mov qword ptr [rbp+8],r13 */ + EMIT4(0x4C, 0x89, 0x6D, 8); + /* mov qword ptr [rbp+16],r14 */ + EMIT4(0x4C, 0x89, 0x75, 16); + /* mov qword ptr [rbp+24],r15 */ + EMIT4(0x4C, 0x89, 0x7D, 24); /* Clear the tail call counter (tail_call_cnt): for eBPF tail calls * we need to reset the counter to 0. It's done in two instructions, @@ -246,8 +249,8 @@ static void emit_prologue(u8 **pprog) /* xor eax, eax */ EMIT2(0x31, 0xc0); - /* mov qword ptr [rbp-X], rax */ - EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32); + /* mov qword ptr [rbp+32], rax */ + EMIT4(0x48, 0x89, 0x45, 32); BUILD_BUG_ON(cnt != PROLOGUE_SIZE); *pprog = prog; @@ -289,13 +292,13 @@ static void emit_bpf_tail_call(u8 **pprog) /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) * goto out; */ - EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */ + EMIT2_off32(0x8B, 0x85, 36); /* mov eax, dword ptr [rbp + 36] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ #define OFFSET2 36 EMIT2(X86_JA, OFFSET2); /* ja out */ label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ - EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ + EMIT2_off32(0x89, 0x85, 36); /* mov dword ptr [rbp + 36], eax */ /* prog = array->ptrs[index]; */ EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */ @@ -361,7 +364,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int proglen = 0; u8 *prog = temp; - emit_prologue(&prog); + emit_prologue(&prog, bpf_prog->aux->stack_depth); if (seen_ld_abs) emit_load_skb_data_hlen(&prog); @@ -877,7 +880,7 @@ xadd: if (is_imm8(insn->off)) } break; - case BPF_JMP | BPF_CALL | BPF_X: + case BPF_JMP | BPF_TAIL_CALL: emit_bpf_tail_call(&prog); break; @@ -1036,15 +1039,17 @@ common_load: seen_exit = true; /* update cleanup_addr */ ctx->cleanup_addr = proglen; - /* mov rbx, qword ptr [rbp-X] */ - EMIT3_off32(0x48, 0x8B, 0x9D, -STACKSIZE); - /* mov r13, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xAD, -STACKSIZE + 8); - /* mov r14, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xB5, -STACKSIZE + 16); - /* mov r15, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xBD, -STACKSIZE + 24); - + /* mov rbx, qword ptr [rbp+0] */ + EMIT4(0x48, 0x8B, 0x5D, 0); + /* mov r13, qword ptr [rbp+8] */ + EMIT4(0x4C, 0x8B, 0x6D, 8); + /* mov r14, qword ptr [rbp+16] */ + EMIT4(0x4C, 0x8B, 0x75, 16); + /* mov r15, qword ptr [rbp+24] */ + EMIT4(0x4C, 0x8B, 0x7D, 24); + + /* add rbp, AUX_STACK_SPACE */ + EMIT4(0x48, 0x83, 0xC5, AUX_STACK_SPACE); EMIT1(0xC9); /* leave */ EMIT1(0xC3); /* ret */ break; @@ -1162,6 +1167,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bpf_jit_binary_lock_ro(header); prog->bpf_func = (void *)image; prog->jited = 1; + prog->jited_len = proglen; } else { prog = orig_prog; } diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index cfd1a89fd04e..dbe2132b0ed4 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -24,7 +24,6 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | unsigned int pci_early_dump_regs; static int pci_bf_sort; -static int smbios_type_b1_flag; int pci_routeirq; int noioapicquirk; #ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS @@ -197,34 +196,18 @@ static int __init set_bf_sort(const struct dmi_system_id *d) static void __init read_dmi_type_b1(const struct dmi_header *dm, void *private_data) { - u8 *d = (u8 *)dm + 4; + u8 *data = (u8 *)dm + 4; if (dm->type != 0xB1) return; - switch (((*(u32 *)d) >> 9) & 0x03) { - case 0x00: - printk(KERN_INFO "dmi type 0xB1 record - unknown flag\n"); - break; - case 0x01: /* set pci=bfsort */ - smbios_type_b1_flag = 1; - break; - case 0x02: /* do not set pci=bfsort */ - smbios_type_b1_flag = 2; - break; - default: - break; - } + if ((((*(u32 *)data) >> 9) & 0x03) == 0x01) + set_bf_sort((const struct dmi_system_id *)private_data); } static int __init find_sort_method(const struct dmi_system_id *d) { - dmi_walk(read_dmi_type_b1, NULL); - - if (smbios_type_b1_flag == 1) { - set_bf_sort(d); - return 0; - } - return -1; + dmi_walk(read_dmi_type_b1, (void *)d); + return 0; } /* diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 6d52b94f4bb9..11e407489db0 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -571,3 +571,50 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_invalid_bar); + +/* + * Device [1022:7808] + * 23. USB Wake on Connect/Disconnect with Low Speed Devices + * https://support.amd.com/TechDocs/46837.pdf + * Appendix A2 + * https://support.amd.com/TechDocs/42413.pdf + */ +static void pci_fixup_amd_ehci_pme(struct pci_dev *dev) +{ + dev_info(&dev->dev, "PME# does not work under D3, disabling it\n"); + dev->pme_support &= ~((PCI_PM_CAP_PME_D3 | PCI_PM_CAP_PME_D3cold) + >> PCI_PM_CAP_PME_SHIFT); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x7808, pci_fixup_amd_ehci_pme); + +/* + * Apple MacBook Pro: Avoid [mem 0x7fa00000-0x7fbfffff] + * + * Using the [mem 0x7fa00000-0x7fbfffff] region, e.g., by assigning it to + * the 00:1c.0 Root Port, causes a conflict with [io 0x1804], which is used + * for soft poweroff and suspend-to-RAM. + * + * As far as we know, this is related to the address space, not to the Root + * Port itself. Attaching the quirk to the Root Port is a convenience, but + * it could probably also be a standalone DMI quirk. + * + * https://bugzilla.kernel.org/show_bug.cgi?id=103211 + */ +static void quirk_apple_mbp_poweroff(struct pci_dev *pdev) +{ + struct device *dev = &pdev->dev; + struct resource *res; + + if ((!dmi_match(DMI_PRODUCT_NAME, "MacBookPro11,4") && + !dmi_match(DMI_PRODUCT_NAME, "MacBookPro11,5")) || + pdev->bus->number != 0 || pdev->devfn != PCI_DEVFN(0x1c, 0)) + return; + + res = request_mem_region(0x7fa00000, 0x200000, + "MacBook Pro poweroff workaround"); + if (res) + dev_info(dev, "claimed %s %pR\n", res->name, res); + else + dev_info(dev, "can't work around MacBook Pro poweroff issue\n"); +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, quirk_apple_mbp_poweroff); diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index c1bdb9edcae7..76595408ff53 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -46,7 +46,7 @@ static inline void set_bios_x(void) pcibios_enabled = 1; set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT); if (__supported_pte_mask & _PAGE_NX) - printk(KERN_INFO "PCI : PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n"); + printk(KERN_INFO "PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n"); } /* diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index ec008e800b45..53d600217973 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -26,6 +26,7 @@ #include <linux/pci_ids.h> #include <linux/export.h> #include <linux/list.h> +#include <asm/iommu.h> #define STA2X11_SWIOTLB_SIZE (4*1024*1024) extern int swiotlb_late_init_with_default_size(size_t default_size); @@ -191,7 +192,7 @@ static const struct dma_map_ops sta2x11_dma_ops = { .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, .sync_sg_for_device = swiotlb_sync_sg_for_device, .mapping_error = swiotlb_dma_mapping_error, - .dma_supported = NULL, /* FIXME: we should use this instead! */ + .dma_supported = x86_dma_supported, }; /* At setup time, we use our own ops if the device is a ConneXt one */ diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c index 6e075afa7877..58337b2bc682 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c @@ -38,8 +38,10 @@ static void __init *max7315_platform_data(void *info) */ strcpy(i2c_info->type, "max7315"); if (nr++) { - sprintf(base_pin_name, "max7315_%d_base", nr); - sprintf(intr_pin_name, "max7315_%d_int", nr); + snprintf(base_pin_name, sizeof(base_pin_name), + "max7315_%d_base", nr); + snprintf(intr_pin_name, sizeof(intr_pin_name), + "max7315_%d_int", nr); } else { strcpy(base_pin_name, "max7315_base"); strcpy(intr_pin_name, "max7315_int"); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 2983faab5b18..3e4bdb442fbc 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -40,7 +40,6 @@ static int timeout_base_ns[] = { static int timeout_us; static bool nobau = true; static int nobau_perm; -static cycles_t congested_cycles; /* tunables: */ static int max_concurr = MAX_BAU_CONCURRENT; @@ -588,31 +587,11 @@ static unsigned long uv2_3_read_status(unsigned long offset, int rshft, int desc } /* - * Return whether the status of the descriptor that is normally used for this - * cpu (the one indexed by its hub-relative cpu number) is busy. - * The status of the original 32 descriptors is always reflected in the 64 - * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0. - * The bit provided by the activation_status_2 register is irrelevant to - * the status if it is only being tested for busy or not busy. - */ -int normal_busy(struct bau_control *bcp) -{ - int cpu = bcp->uvhub_cpu; - int mmr_offset; - int right_shift; - - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; - right_shift = cpu * UV_ACT_STATUS_SIZE; - return (((((read_lmmr(mmr_offset) >> right_shift) & - UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY); -} - -/* * Entered when a bau descriptor has gone into a permanent busy wait because * of a hardware bug. * Workaround the bug. */ -int handle_uv2_busy(struct bau_control *bcp) +static int handle_uv2_busy(struct bau_control *bcp) { struct ptc_stats *stat = bcp->statp; @@ -849,10 +828,10 @@ static void record_send_stats(cycles_t time1, cycles_t time2, if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { bcp->period_requests++; bcp->period_time += elapsed; - if ((elapsed > congested_cycles) && + if ((elapsed > usec_2_cycles(bcp->cong_response_us)) && (bcp->period_requests > bcp->cong_reps) && ((bcp->period_time / bcp->period_requests) > - congested_cycles)) { + usec_2_cycles(bcp->cong_response_us))) { stat->s_congested++; disable_for_period(bcp, stat); } @@ -917,8 +896,9 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc, * Returns 1 if it gives up entirely and the original cpu mask is to be * returned to the kernel. */ -int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp, - struct bau_desc *bau_desc) +static int uv_flush_send_and_wait(struct cpumask *flush_mask, + struct bau_control *bcp, + struct bau_desc *bau_desc) { int seq_number = 0; int completion_stat = 0; @@ -1212,8 +1192,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, * Search the message queue for any 'other' unprocessed message with the * same software acknowledge resource bit vector as the 'msg' message. */ -struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, - struct bau_control *bcp) +static struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, + struct bau_control *bcp) { struct bau_pq_entry *msg_next = msg + 1; unsigned char swack_vec = msg->swack_vec; @@ -2241,14 +2221,17 @@ static int __init uv_bau_init(void) else if (is_uv1_hub()) ops = uv1_bau_ops; + nuvhubs = uv_num_possible_blades(); + if (nuvhubs < 2) { + pr_crit("UV: BAU disabled - insufficient hub count\n"); + goto err_bau_disable; + } + for_each_possible_cpu(cur_cpu) { mask = &per_cpu(uv_flush_tlb_mask, cur_cpu); zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); } - nuvhubs = uv_num_possible_blades(); - congested_cycles = usec_2_cycles(congested_respns_us); - uv_base_pnode = 0x7fffffff; for (uvhub = 0; uvhub < nuvhubs; uvhub++) { cpus = uv_blade_nr_possible_cpus(uvhub); @@ -2261,9 +2244,8 @@ static int __init uv_bau_init(void) enable_timeouts(); if (init_per_cpu(nuvhubs, uv_base_pnode)) { - set_bau_off(); - nobau_perm = 1; - return 0; + pr_crit("UV: BAU disabled - per CPU init failed\n"); + goto err_bau_disable; } vector = UV_BAU_MESSAGE; @@ -2289,6 +2271,16 @@ static int __init uv_bau_init(void) } return 0; + +err_bau_disable: + + for_each_possible_cpu(cur_cpu) + free_cpumask_var(per_cpu(uv_flush_tlb_mask, cur_cpu)); + + set_bau_off(); + nobau_perm = 1; + + return -EINVAL; } core_initcall(uv_bau_init); fs_initcall(uv_ptc_init); diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index e3e62c8a8e70..f2598d81cd55 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -147,7 +147,7 @@ static int relocate_restore_code(void) if (!relocated_restore_code) return -ENOMEM; - memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE); + memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); /* Make the page containing the relocated code executable */ pgd = (pgd_t *)__va(read_cr3_pa()) + @@ -293,8 +293,8 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) if (max_size < sizeof(struct restore_data_record)) return -EOVERFLOW; - rdr->jump_address = (unsigned long)&restore_registers; - rdr->jump_address_phys = __pa_symbol(&restore_registers); + rdr->jump_address = (unsigned long)restore_registers; + rdr->jump_address_phys = __pa_symbol(restore_registers); rdr->cr3 = restore_cr3; rdr->magic = RESTORE_MAGIC; diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c index 00f54a91bb4b..28775f55bde2 100644 --- a/arch/x86/um/os-Linux/registers.c +++ b/arch/x86/um/os-Linux/registers.c @@ -26,6 +26,7 @@ int save_i387_registers(int pid, unsigned long *fp_regs) int save_fp_registers(int pid, unsigned long *fp_regs) { +#ifdef PTRACE_GETREGSET struct iovec iov; if (have_xstate_support) { @@ -34,9 +35,9 @@ int save_fp_registers(int pid, unsigned long *fp_regs) if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) < 0) return -errno; return 0; - } else { + } else +#endif return save_i387_registers(pid, fp_regs); - } } int restore_i387_registers(int pid, unsigned long *fp_regs) @@ -48,6 +49,7 @@ int restore_i387_registers(int pid, unsigned long *fp_regs) int restore_fp_registers(int pid, unsigned long *fp_regs) { +#ifdef PTRACE_SETREGSET struct iovec iov; if (have_xstate_support) { @@ -56,9 +58,9 @@ int restore_fp_registers(int pid, unsigned long *fp_regs) if (ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov) < 0) return -errno; return 0; - } else { + } else +#endif return restore_i387_registers(pid, fp_regs); - } } #ifdef __i386__ @@ -122,6 +124,7 @@ int put_fp_registers(int pid, unsigned long *regs) void arch_init_registers(int pid) { +#ifdef PTRACE_GETREGSET struct _xstate fp_regs; struct iovec iov; @@ -129,6 +132,7 @@ void arch_init_registers(int pid) iov.iov_len = sizeof(struct _xstate); if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) == 0) have_xstate_support = 1; +#endif } #endif diff --git a/arch/x86/um/setjmp_32.S b/arch/x86/um/setjmp_32.S index b766792c9933..39053192918d 100644 --- a/arch/x86/um/setjmp_32.S +++ b/arch/x86/um/setjmp_32.S @@ -16,9 +16,9 @@ .text .align 4 - .globl setjmp - .type setjmp, @function -setjmp: + .globl kernel_setjmp + .type kernel_setjmp, @function +kernel_setjmp: #ifdef _REGPARM movl %eax,%edx #else @@ -35,13 +35,13 @@ setjmp: movl %ecx,20(%edx) # Return address ret - .size setjmp,.-setjmp + .size kernel_setjmp,.-kernel_setjmp .text .align 4 - .globl longjmp - .type longjmp, @function -longjmp: + .globl kernel_longjmp + .type kernel_longjmp, @function +kernel_longjmp: #ifdef _REGPARM xchgl %eax,%edx #else @@ -55,4 +55,4 @@ longjmp: movl 16(%edx),%edi jmp *20(%edx) - .size longjmp,.-longjmp + .size kernel_longjmp,.-kernel_longjmp diff --git a/arch/x86/um/setjmp_64.S b/arch/x86/um/setjmp_64.S index 45f547b4043e..c56942e1a38c 100644 --- a/arch/x86/um/setjmp_64.S +++ b/arch/x86/um/setjmp_64.S @@ -18,9 +18,9 @@ .text .align 4 - .globl setjmp - .type setjmp, @function -setjmp: + .globl kernel_setjmp + .type kernel_setjmp, @function +kernel_setjmp: pop %rsi # Return address, and adjust the stack xorl %eax,%eax # Return value movq %rbx,(%rdi) @@ -34,13 +34,13 @@ setjmp: movq %rsi,56(%rdi) # Return address ret - .size setjmp,.-setjmp + .size kernel_setjmp,.-kernel_setjmp .text .align 4 - .globl longjmp - .type longjmp, @function -longjmp: + .globl kernel_longjmp + .type kernel_longjmp, @function +kernel_longjmp: movl %esi,%eax # Return value (int) movq (%rdi),%rbx movq 8(%rdi),%rsp @@ -51,4 +51,4 @@ longjmp: movq 48(%rdi),%r15 jmp *56(%rdi) - .size longjmp,.-longjmp + .size kernel_longjmp,.-kernel_longjmp diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index cb3c22370cf5..ae4cd58c0c7a 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -5,7 +5,7 @@ #include <sys/mman.h> #include <sys/user.h> #define __FRAME_OFFSETS -#include <asm/ptrace.h> +#include <linux/ptrace.h> #include <asm/types.h> #ifdef __i386__ @@ -50,7 +50,11 @@ void foo(void) DEFINE(HOST_GS, GS); DEFINE(HOST_ORIG_AX, ORIG_EAX); #else +#if defined(PTRACE_GETREGSET) && defined(PTRACE_SETREGSET) DEFINE(HOST_FP_SIZE, sizeof(struct _xstate) / sizeof(unsigned long)); +#else + DEFINE(HOST_FP_SIZE, sizeof(struct _fpstate) / sizeof(unsigned long)); +#endif DEFINE_LONGS(HOST_BX, RBX); DEFINE_LONGS(HOST_CX, RCX); DEFINE_LONGS(HOST_DI, RDI); diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile index d72dec406ccb..329406224330 100644 --- a/arch/x86/um/vdso/Makefile +++ b/arch/x86/um/vdso/Makefile @@ -53,7 +53,7 @@ CFLAGS_REMOVE_vdso-note.o = -pg -fprofile-arcs -ftest-coverage CFLAGS_REMOVE_um_vdso.o = -pg -fprofile-arcs -ftest-coverage targets += vdso-syms.lds -obj-$(VDSO64-y) += vdso-syms.lds +extra-$(VDSO64-y) += vdso-syms.lds # # Match symbols in the DSO that look like VDSO*; produce a file of constants. diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a5ffcbb20cc0..0e7ef69e8531 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -106,15 +106,83 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), return rc >= 0 ? 0 : rc; } -static void clamp_max_cpus(void) +static int xen_vcpu_setup_restore(int cpu) { -#ifdef CONFIG_SMP - if (setup_max_cpus > MAX_VIRT_CPUS) - setup_max_cpus = MAX_VIRT_CPUS; -#endif + int rc = 0; + + /* Any per_cpu(xen_vcpu) is stale, so reset it */ + xen_vcpu_info_reset(cpu); + + /* + * For PVH and PVHVM, setup online VCPUs only. The rest will + * be handled by hotplug. + */ + if (xen_pv_domain() || + (xen_hvm_domain() && cpu_online(cpu))) { + rc = xen_vcpu_setup(cpu); + } + + return rc; +} + +/* + * On restore, set the vcpu placement up again. + * If it fails, then we're in a bad state, since + * we can't back out from using it... + */ +void xen_vcpu_restore(void) +{ + int cpu, rc; + + for_each_possible_cpu(cpu) { + bool other_cpu = (cpu != smp_processor_id()); + bool is_up; + + if (xen_vcpu_nr(cpu) == XEN_VCPU_ID_INVALID) + continue; + + /* Only Xen 4.5 and higher support this. */ + is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, + xen_vcpu_nr(cpu), NULL) > 0; + + if (other_cpu && is_up && + HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL)) + BUG(); + + if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock)) + xen_setup_runstate_info(cpu); + + rc = xen_vcpu_setup_restore(cpu); + if (rc) + pr_emerg_once("vcpu restore failed for cpu=%d err=%d. " + "System will hang.\n", cpu, rc); + /* + * In case xen_vcpu_setup_restore() fails, do not bring up the + * VCPU. This helps us avoid the resulting OOPS when the VCPU + * accesses pvclock_vcpu_time via xen_vcpu (which is NULL.) + * Note that this does not improve the situation much -- now the + * VM hangs instead of OOPSing -- with the VCPUs that did not + * fail, spinning in stop_machine(), waiting for the failed + * VCPUs to come up. + */ + if (other_cpu && is_up && (rc == 0) && + HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) + BUG(); + } } -void xen_vcpu_setup(int cpu) +void xen_vcpu_info_reset(int cpu) +{ + if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) { + per_cpu(xen_vcpu, cpu) = + &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; + } else { + /* Set to NULL so that if somebody accesses it we get an OOPS */ + per_cpu(xen_vcpu, cpu) = NULL; + } +} + +int xen_vcpu_setup(int cpu) { struct vcpu_register_vcpu_info info; int err; @@ -123,11 +191,11 @@ void xen_vcpu_setup(int cpu) BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); /* - * This path is called twice on PVHVM - first during bootup via - * smp_init -> xen_hvm_cpu_notify, and then if the VCPU is being - * hotplugged: cpu_up -> xen_hvm_cpu_notify. - * As we can only do the VCPUOP_register_vcpu_info once lets - * not over-write its result. + * This path is called on PVHVM at bootup (xen_hvm_smp_prepare_boot_cpu) + * and at restore (xen_vcpu_restore). Also called for hotplugged + * VCPUs (cpu_init -> xen_hvm_cpu_prepare_hvm). + * However, the hypercall can only be done once (see below) so if a VCPU + * is offlined and comes back online then let's not redo the hypercall. * * For PV it is called during restore (xen_vcpu_restore) and bootup * (xen_setup_vcpu_info_placement). The hotplug mechanism does not @@ -135,42 +203,44 @@ void xen_vcpu_setup(int cpu) */ if (xen_hvm_domain()) { if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu)) - return; + return 0; } - if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) - per_cpu(xen_vcpu, cpu) = - &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; - if (!xen_have_vcpu_info_placement) { - if (cpu >= MAX_VIRT_CPUS) - clamp_max_cpus(); - return; + if (xen_have_vcpu_info_placement) { + vcpup = &per_cpu(xen_vcpu_info, cpu); + info.mfn = arbitrary_virt_to_mfn(vcpup); + info.offset = offset_in_page(vcpup); + + /* + * Check to see if the hypervisor will put the vcpu_info + * structure where we want it, which allows direct access via + * a percpu-variable. + * N.B. This hypercall can _only_ be called once per CPU. + * Subsequent calls will error out with -EINVAL. This is due to + * the fact that hypervisor has no unregister variant and this + * hypercall does not allow to over-write info.mfn and + * info.offset. + */ + err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, + xen_vcpu_nr(cpu), &info); + + if (err) { + pr_warn_once("register_vcpu_info failed: cpu=%d err=%d\n", + cpu, err); + xen_have_vcpu_info_placement = 0; + } else { + /* + * This cpu is using the registered vcpu info, even if + * later ones fail to. + */ + per_cpu(xen_vcpu, cpu) = vcpup; + } } - vcpup = &per_cpu(xen_vcpu_info, cpu); - info.mfn = arbitrary_virt_to_mfn(vcpup); - info.offset = offset_in_page(vcpup); - - /* Check to see if the hypervisor will put the vcpu_info - structure where we want it, which allows direct access via - a percpu-variable. - N.B. This hypercall can _only_ be called once per CPU. Subsequent - calls will error out with -EINVAL. This is due to the fact that - hypervisor has no unregister variant and this hypercall does not - allow to over-write info.mfn and info.offset. - */ - err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, xen_vcpu_nr(cpu), - &info); + if (!xen_have_vcpu_info_placement) + xen_vcpu_info_reset(cpu); - if (err) { - printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); - xen_have_vcpu_info_placement = 0; - clamp_max_cpus(); - } else { - /* This cpu is using the registered vcpu info, even if - later ones fail to. */ - per_cpu(xen_vcpu, cpu) = vcpup; - } + return ((per_cpu(xen_vcpu, cpu) == NULL) ? -ENODEV : 0); } void xen_reboot(int reason) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index a6d014f47e52..87d791356ea9 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -1,5 +1,6 @@ #include <linux/cpu.h> #include <linux/kexec.h> +#include <linux/memblock.h> #include <xen/features.h> #include <xen/events.h> @@ -10,9 +11,11 @@ #include <asm/reboot.h> #include <asm/setup.h> #include <asm/hypervisor.h> +#include <asm/e820/api.h> #include <asm/xen/cpuid.h> #include <asm/xen/hypervisor.h> +#include <asm/xen/page.h> #include "xen-ops.h" #include "mmu.h" @@ -20,37 +23,34 @@ void __ref xen_hvm_init_shared_info(void) { - int cpu; struct xen_add_to_physmap xatp; - static struct shared_info *shared_info_page; + u64 pa; + + if (HYPERVISOR_shared_info == &xen_dummy_shared_info) { + /* + * Search for a free page starting at 4kB physical address. + * Low memory is preferred to avoid an EPT large page split up + * by the mapping. + * Starting below X86_RESERVE_LOW (usually 64kB) is fine as + * the BIOS used for HVM guests is well behaved and won't + * clobber memory other than the first 4kB. + */ + for (pa = PAGE_SIZE; + !e820__mapped_all(pa, pa + PAGE_SIZE, E820_TYPE_RAM) || + memblock_is_reserved(pa); + pa += PAGE_SIZE) + ; + + memblock_reserve(pa, PAGE_SIZE); + HYPERVISOR_shared_info = __va(pa); + } - if (!shared_info_page) - shared_info_page = (struct shared_info *) - extend_brk(PAGE_SIZE, PAGE_SIZE); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + xatp.gpfn = virt_to_pfn(HYPERVISOR_shared_info); if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); - - HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; - - /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info - * page, we use it in the event channel upcall and in some pvclock - * related functions. We don't need the vcpu_info placement - * optimizations because we don't use any pv_mmu or pv_irq op on - * HVM. - * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is - * online but xen_hvm_init_shared_info is run at resume time too and - * in that case multiple vcpus might be online. */ - for_each_online_cpu(cpu) { - /* Leave it to be NULL. */ - if (xen_vcpu_nr(cpu) >= MAX_VIRT_CPUS) - continue; - per_cpu(xen_vcpu, cpu) = - &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; - } } static void __init init_hvm_pv_info(void) @@ -106,7 +106,7 @@ static void xen_hvm_crash_shutdown(struct pt_regs *regs) static int xen_cpu_up_prepare_hvm(unsigned int cpu) { - int rc; + int rc = 0; /* * This can happen if CPU was offlined earlier and @@ -121,7 +121,9 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu) per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu); else per_cpu(xen_vcpu_id, cpu) = cpu; - xen_vcpu_setup(cpu); + rc = xen_vcpu_setup(cpu); + if (rc) + return rc; if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock)) xen_setup_timer(cpu); @@ -130,9 +132,8 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu) if (rc) { WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n", cpu, rc); - return rc; } - return 0; + return rc; } static int xen_cpu_dead_hvm(unsigned int cpu) @@ -154,6 +155,13 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_shared_info(); + /* + * xen_vcpu is a pointer to the vcpu_info struct in the shared_info + * page, we use it in the event channel upcall and in some pvclock + * related functions. + */ + xen_vcpu_info_reset(0); + xen_panic_handler_init(); if (xen_feature(XENFEAT_hvm_callback_vector)) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index f33eef4ebd12..811e4ddb3f37 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -89,8 +89,6 @@ void *xen_initial_gdt; -RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); - static int xen_cpu_up_prepare_pv(unsigned int cpu); static int xen_cpu_dead_pv(unsigned int cpu); @@ -107,35 +105,6 @@ struct tls_descs { */ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); -/* - * On restore, set the vcpu placement up again. - * If it fails, then we're in a bad state, since - * we can't back out from using it... - */ -void xen_vcpu_restore(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - bool other_cpu = (cpu != smp_processor_id()); - bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), - NULL); - - if (other_cpu && is_up && - HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL)) - BUG(); - - xen_setup_runstate_info(cpu); - - if (xen_have_vcpu_info_placement) - xen_vcpu_setup(cpu); - - if (other_cpu && is_up && - HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) - BUG(); - } -} - static void __init xen_banner(void) { unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); @@ -960,30 +929,43 @@ void xen_setup_shared_info(void) HYPERVISOR_shared_info = (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); -#ifndef CONFIG_SMP - /* In UP this is as good a place as any to set up shared info */ - xen_setup_vcpu_info_placement(); -#endif - xen_setup_mfn_list_list(); - /* - * Now that shared info is set up we can start using routines that - * point to pvclock area. - */ - if (system_state == SYSTEM_BOOTING) + if (system_state == SYSTEM_BOOTING) { +#ifndef CONFIG_SMP + /* + * In UP this is as good a place as any to set up shared info. + * Limit this to boot only, at restore vcpu setup is done via + * xen_vcpu_restore(). + */ + xen_setup_vcpu_info_placement(); +#endif + /* + * Now that shared info is set up we can start using routines + * that point to pvclock area. + */ xen_init_time_ops(); + } } /* This is called once we have the cpu_possible_mask */ -void xen_setup_vcpu_info_placement(void) +void __ref xen_setup_vcpu_info_placement(void) { int cpu; for_each_possible_cpu(cpu) { /* Set up direct vCPU id mapping for PV guests. */ per_cpu(xen_vcpu_id, cpu) = cpu; - xen_vcpu_setup(cpu); + + /* + * xen_vcpu_setup(cpu) can fail -- in which case it + * falls back to the shared_info version for cpus + * where xen_vcpu_nr(cpu) < MAX_VIRT_CPUS. + * + * xen_cpu_up_prepare_pv() handles the rest by failing + * them in hotplug. + */ + (void) xen_vcpu_setup(cpu); } /* @@ -1332,9 +1314,17 @@ asmlinkage __visible void __init xen_start_kernel(void) */ acpi_numa = -1; #endif - /* Don't do the full vcpu_info placement stuff until we have a - possible map and a non-dummy shared_info. */ - per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; + /* Let's presume PV guests always boot on vCPU with id 0. */ + per_cpu(xen_vcpu_id, 0) = 0; + + /* + * Setup xen_vcpu early because start_kernel needs it for + * local_irq_disable(), irqs_disabled(). + * + * Don't do the full vcpu_info placement stuff until we have + * the cpu_possible_mask and a non-dummy shared_info. + */ + xen_vcpu_info_reset(0); WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv)); @@ -1431,9 +1421,7 @@ asmlinkage __visible void __init xen_start_kernel(void) #endif xen_raw_console_write("about to get started...\n"); - /* Let's presume PV guests always boot on vCPU with id 0. */ - per_cpu(xen_vcpu_id, 0) = 0; - + /* We need this for printk timestamps */ xen_setup_runstate_info(0); xen_efi_init(); @@ -1451,6 +1439,9 @@ static int xen_cpu_up_prepare_pv(unsigned int cpu) { int rc; + if (per_cpu(xen_vcpu, cpu) == NULL) + return -ENODEV; + xen_setup_timer(cpu); rc = xen_smp_intr_init(cpu); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 1d7a7213a310..cab28cf2cffb 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2693,8 +2693,8 @@ EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); phys_addr_t paddr_vmcoreinfo_note(void) { if (xen_pv_domain()) - return virt_to_machine(&vmcoreinfo_note).maddr; + return virt_to_machine(vmcoreinfo_note).maddr; else - return __pa_symbol(&vmcoreinfo_note); + return __pa(vmcoreinfo_note); } #endif /* CONFIG_KEXEC_CORE */ diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 42b08f8fc2ca..37c6056a7bba 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -18,20 +18,6 @@ int xen_swiotlb __read_mostly; -static const struct dma_map_ops xen_swiotlb_dma_ops = { - .alloc = xen_swiotlb_alloc_coherent, - .free = xen_swiotlb_free_coherent, - .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, - .sync_single_for_device = xen_swiotlb_sync_single_for_device, - .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, - .sync_sg_for_device = xen_swiotlb_sync_sg_for_device, - .map_sg = xen_swiotlb_map_sg_attrs, - .unmap_sg = xen_swiotlb_unmap_sg_attrs, - .map_page = xen_swiotlb_map_page, - .unmap_page = xen_swiotlb_unmap_page, - .dma_supported = xen_swiotlb_dma_supported, -}; - /* * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary * diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index a5bf7c451435..c81046323ebc 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -499,7 +499,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, void __init xen_remap_memory(void) { unsigned long buf = (unsigned long)&xen_remap_buf; - unsigned long mfn_save, mfn, pfn; + unsigned long mfn_save, pfn; unsigned long remapped = 0; unsigned int i; unsigned long pfn_s = ~0UL; @@ -515,8 +515,7 @@ void __init xen_remap_memory(void) pfn = xen_remap_buf.target_pfn; for (i = 0; i < xen_remap_buf.size; i++) { - mfn = xen_remap_buf.mfns[i]; - xen_update_mem_tables(pfn, mfn); + xen_update_mem_tables(pfn, xen_remap_buf.mfns[i]); remapped++; pfn++; } @@ -530,8 +529,6 @@ void __init xen_remap_memory(void) pfn_s = xen_remap_buf.target_pfn; len = xen_remap_buf.size; } - - mfn = xen_remap_mfn; xen_remap_mfn = xen_remap_buf.next_area_mfn; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 82ac611f2fc1..e7f02eb73727 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -1,4 +1,5 @@ #include <linux/smp.h> +#include <linux/cpu.h> #include <linux/slab.h> #include <linux/cpumask.h> #include <linux/percpu.h> @@ -114,6 +115,36 @@ int xen_smp_intr_init(unsigned int cpu) return rc; } +void __init xen_smp_cpus_done(unsigned int max_cpus) +{ + int cpu, rc, count = 0; + + if (xen_hvm_domain()) + native_smp_cpus_done(max_cpus); + + if (xen_have_vcpu_info_placement) + return; + + for_each_online_cpu(cpu) { + if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) + continue; + + rc = cpu_down(cpu); + + if (rc == 0) { + /* + * Reset vcpu_info so this cpu cannot be onlined again. + */ + xen_vcpu_info_reset(cpu); + count++; + } else { + pr_warn("%s: failed to bring CPU %d down, error %d\n", + __func__, cpu, rc); + } + } + WARN(count, "%s: brought %d CPUs offline\n", __func__, count); +} + void xen_smp_send_reschedule(int cpu) { xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h index 8ebb6acca64a..87d3c76cba37 100644 --- a/arch/x86/xen/smp.h +++ b/arch/x86/xen/smp.h @@ -14,6 +14,8 @@ extern void xen_smp_intr_free(unsigned int cpu); int xen_smp_intr_init_pv(unsigned int cpu); void xen_smp_intr_free_pv(unsigned int cpu); +void xen_smp_cpus_done(unsigned int max_cpus); + void xen_smp_send_reschedule(int cpu); void xen_smp_send_call_function_ipi(const struct cpumask *mask); void xen_smp_send_call_function_single_ipi(int cpu); diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c index f18561bbf5c9..fd60abedf658 100644 --- a/arch/x86/xen/smp_hvm.c +++ b/arch/x86/xen/smp_hvm.c @@ -12,7 +12,8 @@ static void __init xen_hvm_smp_prepare_boot_cpu(void) native_smp_prepare_boot_cpu(); /* - * Setup vcpu_info for boot CPU. + * Setup vcpu_info for boot CPU. Secondary CPUs get their vcpu_info + * in xen_cpu_up_prepare_hvm(). */ xen_vcpu_setup(0); @@ -27,10 +28,20 @@ static void __init xen_hvm_smp_prepare_boot_cpu(void) static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) { + int cpu; + native_smp_prepare_cpus(max_cpus); WARN_ON(xen_smp_intr_init(0)); xen_init_lock_cpu(0); + + for_each_possible_cpu(cpu) { + if (cpu == 0) + continue; + + /* Set default vcpu_id to make sure that we don't use cpu-0's */ + per_cpu(xen_vcpu_id, cpu) = XEN_VCPU_ID_INVALID; + } } #ifdef CONFIG_HOTPLUG_CPU @@ -60,4 +71,5 @@ void __init xen_hvm_smp_init(void) smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu; + smp_ops.smp_cpus_done = xen_smp_cpus_done; } diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index aae32535f4ec..51471408fdd1 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -19,6 +19,7 @@ #include <linux/irq_work.h> #include <linux/tick.h> #include <linux/nmi.h> +#include <linux/cpuhotplug.h> #include <asm/paravirt.h> #include <asm/desc.h> @@ -371,10 +372,6 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) return 0; } -static void xen_pv_smp_cpus_done(unsigned int max_cpus) -{ -} - #ifdef CONFIG_HOTPLUG_CPU static int xen_pv_cpu_disable(void) { @@ -417,7 +414,7 @@ static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */ */ tick_nohz_idle_enter(); - cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); + cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE); } #else /* !CONFIG_HOTPLUG_CPU */ @@ -469,7 +466,7 @@ static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) static const struct smp_ops xen_smp_ops __initconst = { .smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_pv_smp_prepare_cpus, - .smp_cpus_done = xen_pv_smp_cpus_done, + .smp_cpus_done = xen_smp_cpus_done, .cpu_up = xen_pv_cpu_up, .cpu_die = xen_pv_cpu_die, diff --git a/arch/x86/xen/suspend_hvm.c b/arch/x86/xen/suspend_hvm.c index 01afcadde50a..484999416d8b 100644 --- a/arch/x86/xen/suspend_hvm.c +++ b/arch/x86/xen/suspend_hvm.c @@ -8,15 +8,10 @@ void xen_hvm_post_suspend(int suspend_cancelled) { - int cpu; - - if (!suspend_cancelled) + if (!suspend_cancelled) { xen_hvm_init_shared_info(); + xen_vcpu_restore(); + } xen_callback_vector(); xen_unplug_emulated_devices(); - if (xen_feature(XENFEAT_hvm_safe_pvclock)) { - for_each_online_cpu(cpu) { - xen_setup_runstate_info(cpu); - } - } } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index a1895a8e85c1..1ecb05db3632 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -309,7 +309,6 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) void xen_teardown_timer(int cpu) { struct clock_event_device *evt; - BUG_ON(cpu == 0); evt = &per_cpu(xen_clock_events, cpu).evt; if (evt->irq >= 0) { diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9a440a42c618..0d5004477db6 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -78,7 +78,8 @@ bool xen_vcpu_stolen(int vcpu); extern int xen_have_vcpu_info_placement; -void xen_vcpu_setup(int cpu); +int xen_vcpu_setup(int cpu); +void xen_vcpu_info_reset(int cpu); void xen_setup_vcpu_info_placement(void); #ifdef CONFIG_SMP |