Diffstat (limited to 'arch/x86')
40 files changed, 351 insertions, 460 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b0312f8947ce..c5ff296bc5d1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -124,6 +124,7 @@ config X86 select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT + select HAVE_ARCH_PREL32_RELOCATIONS select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_TRACEHOOK @@ -179,7 +180,8 @@ config X86 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select HAVE_RCU_TABLE_FREE + select HAVE_RCU_TABLE_FREE if PARAVIRT + select HAVE_RCU_TABLE_INVALIDATE if HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 7e3c07d6ad42..94859241bc3e 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -219,7 +219,7 @@ sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA2 KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) -LDFLAGS := -m elf_$(UTS_MACHINE) +KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE) # # The 64-bit kernel must be aligned to 2MB. Pass -z max-page-size=0x200000 to @@ -227,7 +227,7 @@ LDFLAGS := -m elf_$(UTS_MACHINE) # by the linker. # ifdef CONFIG_X86_64 -LDFLAGS += $(call ld-option, -z max-page-size=0x200000) +KBUILD_LDFLAGS += $(call ld-option, -z max-page-size=0x200000) endif # Speed up the build diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index 5296f8c9e7f0..91085a08de6c 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -4,7 +4,7 @@ core-y += arch/x86/crypto/ ifeq ($(CONFIG_X86_32),y) START := 0x8048000 -LDFLAGS += -m elf_i386 +KBUILD_LDFLAGS += -m elf_i386 ELF_ARCH := i386 ELF_FORMAT := elf32-i386 CHECKFLAGS += -D__i386__ @@ -43,7 +43,7 @@ KBUILD_CFLAGS += -fno-builtin -m64 CHECKFLAGS += -m64 -D__x86_64__ KBUILD_AFLAGS += -m64 -LDFLAGS += -m elf_x86_64 +KBUILD_LDFLAGS += -m elf_x86_64 KBUILD_CPPFLAGS += -m64 ELF_ARCH := i386:x86-64 diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 169c2feda14a..28764dacf018 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -42,16 +42,16 @@ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n UBSAN_SANITIZE :=n -LDFLAGS := -m elf_$(UTS_MACHINE) +KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE) # Compressed kernel should be built as PIE since it may be loaded at any # address by the bootloader. ifeq ($(CONFIG_X86_32),y) -LDFLAGS += $(call ld-option, -pie) $(call ld-option, --no-dynamic-linker) +KBUILD_LDFLAGS += $(call ld-option, -pie) $(call ld-option, --no-dynamic-linker) else # To build 64-bit compressed kernel as PIE, we disable relocation # overflow check to avoid relocation overflow error with a new linker # command-line option, -z noreloc-overflow. 
-LDFLAGS += $(shell $(LD) --help 2>&1 | grep -q "\-z noreloc-overflow" \ +KBUILD_LDFLAGS += $(shell $(LD) --help 2>&1 | grep -q "\-z noreloc-overflow" \ && echo "-z noreloc-overflow -pie --no-dynamic-linker") endif LDFLAGS_vmlinux := -T diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 302517929932..d1e19f358b6e 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -23,11 +23,8 @@ * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h. * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL * which is meaningless and will cause compiling error in some cases. - * So do not include linux/export.h and define EXPORT_SYMBOL(sym) - * as empty. */ -#define _LINUX_EXPORT_H -#define EXPORT_SYMBOL(sym) +#define __DISABLE_EXPORTS #include "misc.h" #include "error.h" diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 9bd139569b41..cb2deb61c5d9 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -223,34 +223,34 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff pcmpeqd TWOONE(%rip), \TMP2 pand POLY(%rip), \TMP2 pxor \TMP2, \TMP3 - movdqa \TMP3, HashKey(%arg2) + movdqu \TMP3, HashKey(%arg2) movdqa \TMP3, \TMP5 pshufd $78, \TMP3, \TMP1 pxor \TMP3, \TMP1 - movdqa \TMP1, HashKey_k(%arg2) + movdqu \TMP1, HashKey_k(%arg2) GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 # TMP5 = HashKey^2<<1 (mod poly) - movdqa \TMP5, HashKey_2(%arg2) + movdqu \TMP5, HashKey_2(%arg2) # HashKey_2 = HashKey^2<<1 (mod poly) pshufd $78, \TMP5, \TMP1 pxor \TMP5, \TMP1 - movdqa \TMP1, HashKey_2_k(%arg2) + movdqu \TMP1, HashKey_2_k(%arg2) GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 # TMP5 = HashKey^3<<1 (mod poly) - movdqa \TMP5, HashKey_3(%arg2) + movdqu \TMP5, HashKey_3(%arg2) pshufd $78, \TMP5, \TMP1 pxor \TMP5, \TMP1 - movdqa \TMP1, HashKey_3_k(%arg2) + movdqu \TMP1, HashKey_3_k(%arg2) GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 # TMP5 = HashKey^3<<1 (mod poly) - movdqa \TMP5, HashKey_4(%arg2) + movdqu \TMP5, HashKey_4(%arg2) pshufd $78, \TMP5, \TMP1 pxor \TMP5, \TMP1 - movdqa \TMP1, HashKey_4_k(%arg2) + movdqu \TMP1, HashKey_4_k(%arg2) .endm # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding. 
@@ -271,7 +271,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - movdqa HashKey(%arg2), %xmm13 + movdqu HashKey(%arg2), %xmm13 CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \ %xmm4, %xmm5, %xmm6 @@ -997,7 +997,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pshufd $78, \XMM5, \TMP6 pxor \XMM5, \TMP6 paddd ONE(%rip), \XMM0 # INCR CNT - movdqa HashKey_4(%arg2), \TMP5 + movdqu HashKey_4(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 movdqa \XMM0, \XMM1 paddd ONE(%rip), \XMM0 # INCR CNT @@ -1016,7 +1016,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pxor (%arg1), \XMM2 pxor (%arg1), \XMM3 pxor (%arg1), \XMM4 - movdqa HashKey_4_k(%arg2), \TMP5 + movdqu HashKey_4_k(%arg2), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) movaps 0x10(%arg1), \TMP1 AESENC \TMP1, \XMM1 # Round 1 @@ -1031,7 +1031,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM6, \TMP1 pshufd $78, \XMM6, \TMP2 pxor \XMM6, \TMP2 - movdqa HashKey_3(%arg2), \TMP5 + movdqu HashKey_3(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 movaps 0x30(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 3 @@ -1044,7 +1044,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 - movdqa HashKey_3_k(%arg2), \TMP5 + movdqu HashKey_3_k(%arg2), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x50(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 5 @@ -1058,7 +1058,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM7, \TMP1 pshufd $78, \XMM7, \TMP2 pxor \XMM7, \TMP2 - movdqa HashKey_2(%arg2), \TMP5 + movdqu HashKey_2(%arg2), \TMP5 # Multiply TMP5 * HashKey using karatsuba @@ -1074,7 +1074,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 - movdqa HashKey_2_k(%arg2), \TMP5 + movdqu HashKey_2_k(%arg2), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x80(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 8 @@ -1092,7 +1092,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM8, \TMP1 pshufd $78, \XMM8, \TMP2 pxor \XMM8, \TMP2 - movdqa HashKey(%arg2), \TMP5 + movdqu HashKey(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x90(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 9 @@ -1121,7 +1121,7 @@ aes_loop_par_enc_done\@: AESENCLAST \TMP3, \XMM2 AESENCLAST \TMP3, \XMM3 AESENCLAST \TMP3, \XMM4 - movdqa HashKey_k(%arg2), \TMP5 + movdqu HashKey_k(%arg2), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movdqu (%arg4,%r11,1), \TMP3 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK @@ -1205,7 +1205,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pshufd $78, \XMM5, \TMP6 pxor \XMM5, \TMP6 paddd ONE(%rip), \XMM0 # INCR CNT - movdqa HashKey_4(%arg2), \TMP5 + movdqu HashKey_4(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 movdqa \XMM0, \XMM1 paddd ONE(%rip), \XMM0 # INCR CNT @@ -1224,7 +1224,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pxor (%arg1), \XMM2 pxor (%arg1), \XMM3 pxor (%arg1), \XMM4 - movdqa HashKey_4_k(%arg2), \TMP5 + movdqu HashKey_4_k(%arg2), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) movaps 0x10(%arg1), \TMP1 AESENC \TMP1, \XMM1 # Round 1 @@ -1239,7 +1239,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 
movdqa \XMM6, \TMP1 pshufd $78, \XMM6, \TMP2 pxor \XMM6, \TMP2 - movdqa HashKey_3(%arg2), \TMP5 + movdqu HashKey_3(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 movaps 0x30(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 3 @@ -1252,7 +1252,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 - movdqa HashKey_3_k(%arg2), \TMP5 + movdqu HashKey_3_k(%arg2), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x50(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 5 @@ -1266,7 +1266,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM7, \TMP1 pshufd $78, \XMM7, \TMP2 pxor \XMM7, \TMP2 - movdqa HashKey_2(%arg2), \TMP5 + movdqu HashKey_2(%arg2), \TMP5 # Multiply TMP5 * HashKey using karatsuba @@ -1282,7 +1282,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 - movdqa HashKey_2_k(%arg2), \TMP5 + movdqu HashKey_2_k(%arg2), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x80(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 8 @@ -1300,7 +1300,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM8, \TMP1 pshufd $78, \XMM8, \TMP2 pxor \XMM8, \TMP2 - movdqa HashKey(%arg2), \TMP5 + movdqu HashKey(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x90(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 9 @@ -1329,7 +1329,7 @@ aes_loop_par_dec_done\@: AESENCLAST \TMP3, \XMM2 AESENCLAST \TMP3, \XMM3 AESENCLAST \TMP3, \XMM4 - movdqa HashKey_k(%arg2), \TMP5 + movdqu HashKey_k(%arg2), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movdqu (%arg4,%r11,1), \TMP3 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK @@ -1405,10 +1405,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst movdqa \XMM1, \TMP6 pshufd $78, \XMM1, \TMP2 pxor \XMM1, \TMP2 - movdqa HashKey_4(%arg2), \TMP5 + movdqu HashKey_4(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 - movdqa HashKey_4_k(%arg2), \TMP4 + movdqu HashKey_4_k(%arg2), \TMP4 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movdqa \XMM1, \XMMDst movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 @@ -1418,10 +1418,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst movdqa \XMM2, \TMP1 pshufd $78, \XMM2, \TMP2 pxor \XMM2, \TMP2 - movdqa HashKey_3(%arg2), \TMP5 + movdqu HashKey_3(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 - movdqa HashKey_3_k(%arg2), \TMP4 + movdqu HashKey_3_k(%arg2), \TMP4 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM2, \XMMDst @@ -1433,10 +1433,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst movdqa \XMM3, \TMP1 pshufd $78, \XMM3, \TMP2 pxor \XMM3, \TMP2 - movdqa HashKey_2(%arg2), \TMP5 + movdqu HashKey_2(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 - movdqa HashKey_2_k(%arg2), \TMP4 + movdqu HashKey_2_k(%arg2), \TMP4 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM3, \XMMDst @@ -1446,10 +1446,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst movdqa \XMM4, \TMP1 pshufd $78, \XMM4, \TMP2 pxor \XMM4, \TMP2 - movdqa HashKey(%arg2), \TMP5 + movdqu HashKey(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 - movdqa HashKey_k(%arg2), \TMP4 + movdqu HashKey_k(%arg2), \TMP4 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM4, \XMMDst diff --git 
a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 9f695f517747..fa3f439f0a92 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -68,9 +68,9 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ -fno-omit-frame-pointer -foptimize-sibling-calls \ - -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO + -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO $(RETPOLINE_VDSO_CFLAGS) -$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) +$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) # # vDSO code runs in userspace and -pg doesn't help with profiling anyway. @@ -132,11 +132,13 @@ KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS) $(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) $(obj)/vdso32.so.dbg: FORCE \ diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index 1147e1fed7ff..ef5f29f913d7 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -9,6 +9,7 @@ #include <asm/mshyperv.h> #include <asm/msr.h> #include <asm/tlbflush.h> +#include <asm/tlb.h> #define CREATE_TRACE_POINTS #include <asm/trace/hyperv.h> @@ -231,4 +232,5 @@ void hyperv_setup_mmu_ops(void) pr_info("Using hypercall for remote TLB flush\n"); pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others; + pv_mmu_ops.tlb_remove_table = tlb_remove_table; } diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index de690c2d2e33..a0ab9ab61c75 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -8,5 +8,6 @@ generated-y += xen-hypercalls.h generic-y += dma-contiguous.h generic-y += early_ioremap.h +generic-y += export.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h diff --git a/arch/x86/include/asm/export.h b/arch/x86/include/asm/export.h deleted file mode 100644 index 2a51d66689c5..000000000000 --- a/arch/x86/include/asm/export.h +++ /dev/null @@ -1,5 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifdef CONFIG_64BIT -#define KSYM_ALIGN 16 -#endif -#include <asm-generic/export.h> diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 023b4a9fc846..5f26962eff42 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -33,6 +33,11 @@ enum irq_remap_cap { IRQ_POSTING_CAP = 0, }; +enum { + IRQ_REMAP_XAPIC_MODE, + IRQ_REMAP_X2APIC_MODE, +}; + struct vcpu_data { u64 pi_desc_addr; /* Physical address of PI Descriptor */ u32 vector; /* Guest vector of the interrupt */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 8c7b3e5a2d01..3a17107594c8 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -148,6 +148,7 @@ enum mce_notifier_prios { 
MCE_PRIO_LOWEST = 0, }; +struct notifier_block; extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index d49bbf4bb5c8..e375d4266b53 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -309,6 +309,11 @@ static inline void flush_tlb_others(const struct cpumask *cpumask, PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info); } +static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + PVOP_VCALL2(pv_mmu_ops.tlb_remove_table, tlb, table); +} + static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 180bc0bff0fb..4b75acc23b30 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -54,6 +54,7 @@ struct desc_struct; struct task_struct; struct cpumask; struct flush_tlb_info; +struct mmu_gather; /* * Wrapper type for pointers to code which uses the non-standard @@ -222,6 +223,8 @@ struct pv_mmu_ops { void (*flush_tlb_others)(const struct cpumask *cpus, const struct flush_tlb_info *info); + void (*tlb_remove_table)(struct mmu_gather *tlb, void *table); + /* Hooks for allocating and freeing a pagetable top-level */ int (*pgd_alloc)(struct mm_struct *mm); void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 682286aca881..c24297268ebc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -181,9 +181,9 @@ extern const struct seq_operations cpuinfo_op; extern void cpu_detect(struct cpuinfo_x86 *c); -static inline unsigned long l1tf_pfn_limit(void) +static inline unsigned long long l1tf_pfn_limit(void) { - return BIT(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT) - 1; + return BIT_ULL(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT); } extern void early_cpu_init(void); diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 34cffcef7375..07a25753e85c 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -89,4 +89,46 @@ extern int kernel_set_to_readonly; void set_kernel_text_rw(void); void set_kernel_text_ro(void); +#ifdef CONFIG_X86_64 +static inline int set_mce_nospec(unsigned long pfn) +{ + unsigned long decoy_addr; + int rc; + + /* + * Mark the linear address as UC to make sure we don't log more + * errors because of speculative access to the page. + * We would like to just call: + * set_memory_uc((unsigned long)pfn_to_kaddr(pfn), 1); + * but doing that would radically increase the odds of a + * speculative access to the poison page because we'd have + * the virtual address of the kernel 1:1 mapping sitting + * around in registers. + * Instead we get tricky. We create a non-canonical address + * that looks just like the one we want, but has bit 63 flipped. + * This relies on set_memory_uc() properly sanitizing any __pa() + * results with __PHYSICAL_MASK or PTE_PFN_MASK. + */ + decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); + + rc = set_memory_uc(decoy_addr, 1); + if (rc) + pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); + return rc; +} +#define set_mce_nospec set_mce_nospec + +/* Restore full speculative operation to the pfn. 
*/ +static inline int clear_mce_nospec(unsigned long pfn) +{ + return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1); +} +#define clear_mce_nospec clear_mce_nospec +#else +/* + * Few people would run a 32-bit kernel on a machine that supports + * recoverable errors because they have too much memory to boot 32-bit. + */ +#endif + #endif /* _ASM_X86_SET_MEMORY_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 511bf5fae8b8..29c9da6c62fc 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -148,6 +148,22 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr) #endif +static inline bool tlb_defer_switch_to_init_mm(void) +{ + /* + * If we have PCID, then switching to init_mm is reasonably + * fast. If we don't have PCID, then switching to init_mm is + * quite slow, so we try to defer it in the hopes that we can + * avoid it entirely. The latter approach runs the risk of + * receiving otherwise unnecessary IPIs. + * + * This choice is just a heuristic. The tlb code can handle this + * function returning true or false regardless of whether we have + * PCID. + */ + return !static_cpu_has(X86_FEATURE_PCID); +} + struct tlb_context { u64 ctx_id; u64 tlb_gen; @@ -536,11 +552,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); #ifndef CONFIG_PARAVIRT #define flush_tlb_others(mask, info) \ native_flush_tlb_others(mask, info) -#endif -extern void tlb_flush_remove_tables(struct mm_struct *mm); -extern void tlb_flush_remove_tables_local(void *arg); - -#define HAVE_TLB_FLUSH_REMOVE_TABLES +#define paravirt_tlb_remove_table(tlb, page) \ + tlb_remove_page(tlb, (void *)(page)) +#endif #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 95f9107449bf..9527ba5d62da 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -74,6 +74,7 @@ #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 +#define SECONDARY_EXEC_ENCLS_EXITING 0x00008000 #define SECONDARY_EXEC_RDSEED_EXITING 0x00010000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 @@ -213,6 +214,8 @@ enum vmcs_field { VMWRITE_BITMAP_HIGH = 0x00002029, XSS_EXIT_BITMAP = 0x0000202C, XSS_EXIT_BITMAP_HIGH = 0x0000202D, + ENCLS_EXITING_BITMAP = 0x0000202E, + ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, TSC_MULTIPLIER = 0x00002032, TSC_MULTIPLIER_HIGH = 0x00002033, GUEST_PHYSICAL_ADDRESS = 0x00002400, diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 6b2f90a0b149..ef05bea7010d 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -197,17 +197,6 @@ extern struct { char _entry[32]; } hypercall_page[]; (type)__res; \ }) -#define _hypercall5(type, name, a1, a2, a3, a4, a5) \ -({ \ - __HYPERCALL_DECLS; \ - __HYPERCALL_5ARG(a1, a2, a3, a4, a5); \ - asm volatile (__HYPERCALL \ - : __HYPERCALL_5PARAM \ - : __HYPERCALL_ENTRY(name) \ - : __HYPERCALL_CLOBBER5); \ - (type)__res; \ -}) - static inline long xen_single_call(unsigned int call, unsigned long a1, unsigned long a2, @@ -267,47 +256,12 @@ HYPERVISOR_set_gdt(unsigned long *frame_list, int entries) } static inline int -HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp) -{ - return _hypercall2(int, stack_switch, ss, esp); -} - -#ifdef CONFIG_X86_32 
-static inline int -HYPERVISOR_set_callbacks(unsigned long event_selector, - unsigned long event_address, - unsigned long failsafe_selector, - unsigned long failsafe_address) -{ - return _hypercall4(int, set_callbacks, - event_selector, event_address, - failsafe_selector, failsafe_address); -} -#else /* CONFIG_X86_64 */ -static inline int -HYPERVISOR_set_callbacks(unsigned long event_address, - unsigned long failsafe_address, - unsigned long syscall_address) -{ - return _hypercall3(int, set_callbacks, - event_address, failsafe_address, - syscall_address); -} -#endif /* CONFIG_X86_{32,64} */ - -static inline int HYPERVISOR_callback_op(int cmd, void *arg) { return _hypercall2(int, callback_op, cmd, arg); } static inline int -HYPERVISOR_fpu_taskswitch(int set) -{ - return _hypercall1(int, fpu_taskswitch, set); -} - -static inline int HYPERVISOR_sched_op(int cmd, void *arg) { return _hypercall2(int, sched_op, cmd, arg); @@ -419,19 +373,6 @@ HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count) } static inline int -HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val, - unsigned long flags, domid_t domid) -{ - if (sizeof(new_val) == sizeof(long)) - return _hypercall4(int, update_va_mapping_otherdomain, va, - new_val.pte, flags, domid); - else - return _hypercall5(int, update_va_mapping_otherdomain, va, - new_val.pte, new_val.pte >> 32, - flags, domid); -} - -static inline int HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type) { return _hypercall2(int, vm_assist, cmd, type); @@ -465,12 +406,6 @@ HYPERVISOR_suspend(unsigned long start_info_mfn) return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn); } -static inline int -HYPERVISOR_nmi_op(unsigned long op, unsigned long arg) -{ - return _hypercall2(int, nmi_op, op, arg); -} - static inline unsigned long __must_check HYPERVISOR_hvm_op(int op, void *arg) { @@ -529,39 +464,6 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, } static inline void -MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd, - void *uop, unsigned int count) -{ - mcl->op = __HYPERVISOR_grant_table_op; - mcl->args[0] = cmd; - mcl->args[1] = (unsigned long)uop; - mcl->args[2] = count; - - trace_xen_mc_entry(mcl, 3); -} - -static inline void -MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long va, - pte_t new_val, unsigned long flags, - domid_t domid) -{ - mcl->op = __HYPERVISOR_update_va_mapping_otherdomain; - mcl->args[0] = va; - if (sizeof(new_val) == sizeof(long)) { - mcl->args[1] = new_val.pte; - mcl->args[2] = flags; - mcl->args[3] = domid; - } else { - mcl->args[1] = new_val.pte; - mcl->args[2] = new_val.pte >> 32; - mcl->args[3] = flags; - mcl->args[4] = domid; - } - - trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 
4 : 5); -} - -static inline void MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, struct desc_struct desc) { @@ -582,16 +484,6 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, } static inline void -MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg) -{ - mcl->op = __HYPERVISOR_memory_op; - mcl->args[0] = cmd; - mcl->args[1] = (unsigned long)arg; - - trace_xen_mc_entry(mcl, 2); -} - -static inline void MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req, int count, int *success_count, domid_t domid) { @@ -618,16 +510,6 @@ MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count, } static inline void -MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries) -{ - mcl->op = __HYPERVISOR_set_gdt; - mcl->args[0] = (unsigned long)frames; - mcl->args[1] = entries; - - trace_xen_mc_entry(mcl, 2); -} - -static inline void MULTI_stack_switch(struct multicall_entry *mcl, unsigned long ss, unsigned long esp) { diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index dde437f5d14f..158ad1483c43 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -108,7 +108,7 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) cx->type); } snprintf(cx->desc, - ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x", + ACPI_CX_DESC_LEN, "ACPI FFH MWAIT 0x%x", cx->address); out: return retval; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index cb4a16292aa7..4c2313d0b9ca 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -702,6 +702,10 @@ static void __init l1tf_select_mitigation(void) half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; if (e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) { pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n"); + pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n", + half_pa); + pr_info("However, doing so will make a part of your RAM unusable.\n"); + pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html might help you decide.\n"); return; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 374d1aa66952..ceb67cd5918f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -113,21 +113,6 @@ static inline void mce_register_injector_chain(struct notifier_block *nb) { } static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } #endif -#ifndef CONFIG_X86_64 -/* - * On 32-bit systems it would be difficult to safely unmap a poison page - * from the kernel 1:1 map because there are no non-canonical addresses that - * we can use to refer to the address without risking a speculative access. - * However, this isn't much of an issue because: - * 1) Few unmappable pages are in the 1:1 map. Most are in HIGHMEM which - * are only mapped into the kernel as needed - * 2) Few people would run a 32-bit kernel on a machine that supports - * recoverable errors because they have too much memory to boot 32-bit. 
- */ -static inline void mce_unmap_kpfn(unsigned long pfn) {} -#define mce_unmap_kpfn mce_unmap_kpfn -#endif - struct mca_config { bool dont_log_ce; bool cmci_disabled; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4b767284b7f5..953b3ce92dcc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -42,6 +42,7 @@ #include <linux/irq_work.h> #include <linux/export.h> #include <linux/jump_label.h> +#include <linux/set_memory.h> #include <asm/intel-family.h> #include <asm/processor.h> @@ -50,7 +51,6 @@ #include <asm/mce.h> #include <asm/msr.h> #include <asm/reboot.h> -#include <asm/set_memory.h> #include "mce-internal.h" @@ -108,10 +108,6 @@ static struct irq_work mce_irq_work; static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); -#ifndef mce_unmap_kpfn -static void mce_unmap_kpfn(unsigned long pfn); -#endif - /* * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. @@ -602,7 +598,7 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) { pfn = mce->addr >> PAGE_SHIFT; if (!memory_failure(pfn, 0)) - mce_unmap_kpfn(pfn); + set_mce_nospec(pfn); } return NOTIFY_OK; @@ -1072,38 +1068,10 @@ static int do_memory_failure(struct mce *m) if (ret) pr_err("Memory error not recovered"); else - mce_unmap_kpfn(m->addr >> PAGE_SHIFT); + set_mce_nospec(m->addr >> PAGE_SHIFT); return ret; } -#ifndef mce_unmap_kpfn -static void mce_unmap_kpfn(unsigned long pfn) -{ - unsigned long decoy_addr; - - /* - * Unmap this page from the kernel 1:1 mappings to make sure - * we don't log more errors because of speculative access to - * the page. - * We would like to just call: - * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1); - * but doing that would radically increase the odds of a - * speculative access to the poison page because we'd have - * the virtual address of the kernel 1:1 mapping sitting - * around in registers. - * Instead we get tricky. We create a non-canonical address - * that looks just like the one we want, but has bit 63 flipped. - * This relies on set_memory_np() not checking whether we passed - * a legal address. 
- */ - - decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); - - if (set_memory_np(decoy_addr, 1)) - pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); -} -#endif - /* * Cases where we avoid rendezvous handler timeout: diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 0f471bd93417..d9b71924c23c 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -45,6 +45,7 @@ #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> +#include <asm/tlb.h> static int kvmapf = 1; @@ -636,8 +637,10 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && !kvm_para_has_hint(KVM_HINTS_REALTIME) && - kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others; + pv_mmu_ops.tlb_remove_table = tlb_remove_table; + } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 930c88341e4e..afdb303285f8 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -41,6 +41,7 @@ #include <asm/tlbflush.h> #include <asm/timer.h> #include <asm/special_insns.h> +#include <asm/tlb.h> /* * nop stub, which must not clobber anything *including the stack* to @@ -409,6 +410,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .flush_tlb_kernel = native_flush_tlb_global, .flush_tlb_one_user = native_flush_tlb_one_user, .flush_tlb_others = native_flush_tlb_others, + .tlb_remove_table = (void (*)(struct mmu_gather *, void *))tlb_remove_page, .pgd_alloc = __paravirt_pgd_alloc, .pgd_free = paravirt_nop, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index acfd04121da3..7ba73fe0d917 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -40,8 +40,14 @@ int iommu_detected __read_mostly = 0; * devices and allow every device to access to whole physical memory. This is * useful if a user wants to use an IOMMU only for KVM device assignment to * guests and not for driver dma translation. + * It is also possible to disable by default in kernel config, and enable with + * iommu=nopt at boot time. */ +#ifdef CONFIG_IOMMU_DEFAULT_PASSTHROUGH +int iommu_pass_through __read_mostly = 1; +#else int iommu_pass_through __read_mostly; +#endif extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; @@ -135,6 +141,8 @@ static __init int iommu_setup(char *p) #endif if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; + if (!strncmp(p, "nopt", 4)) + iommu_pass_through = 0; gart_parse_options(p); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 476e3ddf8890..a451bc374b9b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -384,6 +384,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) start_thread_common(regs, new_ip, new_sp, __USER_CS, __USER_DS, 0); } +EXPORT_SYMBOL_GPL(start_thread); #ifdef CONFIG_COMPAT void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 73e27a98456f..6276140044d0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5586,8 +5586,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); - local_irq_enable(); - /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if * it's non-zero. 
Since vmentry is serialising on affected CPUs, there @@ -5596,6 +5594,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); + local_irq_enable(); + asm volatile ( "push %%" _ASM_BP "; \n\t" "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" @@ -5718,12 +5718,12 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); - reload_tss(vcpu); local_irq_disable(); + x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); + vcpu->arch.cr2 = svm->vmcb->save.cr2; vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1519f030fd73..1d26f3c4985b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -198,12 +198,14 @@ static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_ static const struct { const char *option; - enum vmx_l1d_flush_state cmd; + bool for_parse; } vmentry_l1d_param[] = { - {"auto", VMENTER_L1D_FLUSH_AUTO}, - {"never", VMENTER_L1D_FLUSH_NEVER}, - {"cond", VMENTER_L1D_FLUSH_COND}, - {"always", VMENTER_L1D_FLUSH_ALWAYS}, + [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, + [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, + [VMENTER_L1D_FLUSH_COND] = {"cond", true}, + [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, + [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, + [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, }; #define L1D_CACHE_ORDER 4 @@ -219,15 +221,15 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) return 0; } - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { - u64 msr; + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { + u64 msr; - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); - if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { - l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; - return 0; - } - } + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); + if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; + } + } /* If set to auto use the default l1tf mitigation method */ if (l1tf == VMENTER_L1D_FLUSH_AUTO) { @@ -287,8 +289,9 @@ static int vmentry_l1d_flush_parse(const char *s) if (s) { for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { - if (sysfs_streq(s, vmentry_l1d_param[i].option)) - return vmentry_l1d_param[i].cmd; + if (vmentry_l1d_param[i].for_parse && + sysfs_streq(s, vmentry_l1d_param[i].option)) + return i; } } return -EINVAL; @@ -298,13 +301,13 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) { int l1tf, ret; - if (!boot_cpu_has(X86_BUG_L1TF)) - return 0; - l1tf = vmentry_l1d_flush_parse(s); if (l1tf < 0) return l1tf; + if (!boot_cpu_has(X86_BUG_L1TF)) + return 0; + /* * Has vmx_init() run already? If not then this is the pre init * parameter parsing. 
In that case just store the value and let @@ -324,6 +327,9 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) { + if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) + return sprintf(s, "???\n"); + return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); } @@ -1684,6 +1690,12 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; } +static inline bool cpu_has_vmx_encls_vmexit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENCLS_EXITING; +} + /* * Comment's format: document - errata name - stepping - processor name. * Refer from @@ -4551,7 +4563,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_PML | SECONDARY_EXEC_TSC_SCALING | - SECONDARY_EXEC_ENABLE_VMFUNC; + SECONDARY_EXEC_ENABLE_VMFUNC | + SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -6648,6 +6661,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } + + if (cpu_has_vmx_encls_vmexit()) + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -9314,6 +9330,17 @@ fail: return 1; } +static int handle_encls(struct kvm_vcpu *vcpu) +{ + /* + * SGX virtualization is not yet supported. There is no software + * enable bit for SGX, so we have to trap ENCLS and inject a #UD + * to prevent the guest from executing ENCLS. + */ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -9371,6 +9398,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmfunc, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, + [EXIT_REASON_ENCLS] = handle_encls, }; static const int kvm_vmx_max_exit_handlers = @@ -9741,6 +9769,9 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. */ return false; + case EXIT_REASON_ENCLS: + /* SGX is never exposed to L1 */ + return false; default: return true; } @@ -10100,9 +10131,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) * information but as all relevant affected CPUs have 32KiB L1D cache size * there is no point in doing so. */ -#define L1D_CACHE_ORDER 4 -static void *vmx_l1d_flush_pages; - static void vmx_l1d_flush(struct kvm_vcpu *vcpu) { int size = PAGE_SIZE << L1D_CACHE_ORDER; @@ -12101,6 +12129,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) vmcs_write64(APIC_ACCESS_ADDR, -1ull); + if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f7dff0457846..506bd2b4b8bb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6576,14 +6576,12 @@ static void kvm_set_mmio_spte_mask(void) /* Set the present bit. 
*/ mask |= 1ull; -#ifdef CONFIG_X86_64 /* * If reserved bit is not supported, clear the present bit to disable * mmio page fault. */ - if (maxphyaddr == 52) + if (IS_ENABLED(CONFIG_X86_64) && maxphyaddr == 52) mask &= ~1ull; -#endif kvm_mmu_set_mmio_spte_mask(mask, mask); } @@ -7305,8 +7303,9 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); } -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end) +int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end, + bool blockable) { unsigned long apic_address; @@ -7317,6 +7316,8 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (start <= apic_address && apic_address < end) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); + + return 0; } void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index b45f5aaefd74..076ebdce9bd4 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -2,6 +2,8 @@ #include <linux/spinlock.h> #include <linux/percpu.h> +#include <linux/kallsyms.h> +#include <linux/kcore.h> #include <asm/cpu_entry_area.h> #include <asm/pgtable.h> @@ -13,6 +15,7 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage) #ifdef CONFIG_X86_64 static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +static DEFINE_PER_CPU(struct kcore_list, kcore_entry_trampoline); #endif struct cpu_entry_area *get_cpu_entry_area(int cpu) @@ -146,10 +149,40 @@ static void __init setup_cpu_entry_area(int cpu) cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); + /* + * The cpu_entry_area alias addresses are not in the kernel binary + * so they do not show up in /proc/kcore normally. This adds entries + * for them manually. 
+ */ + kclist_add_remap(&per_cpu(kcore_entry_trampoline, cpu), + _entry_trampoline, + &get_cpu_entry_area(cpu)->entry_trampoline, PAGE_SIZE); #endif percpu_setup_debug_store(cpu); } +#ifdef CONFIG_X86_64 +int arch_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *name) +{ + unsigned int cpu, ncpu = 0; + + if (symnum >= num_possible_cpus()) + return -EINVAL; + + for_each_possible_cpu(cpu) { + if (ncpu++ >= symnum) + break; + } + + *value = (unsigned long)&get_cpu_entry_area(cpu)->entry_trampoline; + *type = 't'; + strlcpy(name, "__entry_SYSCALL_64_trampoline", KSYM_NAME_LEN); + + return 0; +} +#endif + static __init void setup_cpu_entry_area_ptes(void) { #ifdef CONFIG_X86_32 diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index acfab322fbe0..7a8fc26c1115 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -99,15 +99,22 @@ __ref void *alloc_low_pages(unsigned int num) } if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { - unsigned long ret; - if (min_pfn_mapped >= max_pfn_mapped) - panic("alloc_low_pages: ran out of memory"); - ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, + unsigned long ret = 0; + + if (min_pfn_mapped < max_pfn_mapped) { + ret = memblock_find_in_range( + min_pfn_mapped << PAGE_SHIFT, max_pfn_mapped << PAGE_SHIFT, PAGE_SIZE * num , PAGE_SIZE); + } + if (ret) + memblock_reserve(ret, PAGE_SIZE * num); + else if (can_use_brk_pgt) + ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE)); + if (!ret) panic("alloc_low_pages: can not alloc memory"); - memblock_reserve(ret, PAGE_SIZE * num); + pfn = ret >> PAGE_SHIFT; } else { pfn = pgt_buf_end; @@ -923,7 +930,7 @@ unsigned long max_swapfile_size(void) if (boot_cpu_has_bug(X86_BUG_L1TF)) { /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ - unsigned long l1tf_limit = l1tf_pfn_limit() + 1; + unsigned long long l1tf_limit = l1tf_pfn_limit(); /* * We encode swap offsets also with 3 bits below those for pfn * which makes the usable limit higher. @@ -931,7 +938,7 @@ unsigned long max_swapfile_size(void) #if CONFIG_PGTABLE_LEVELS > 2 l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT; #endif - pages = min_t(unsigned long, l1tf_limit, pages); + pages = min_t(unsigned long long, l1tf_limit, pages); } return pages; } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index f40ab8185d94..1e95d57760cf 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -257,7 +257,7 @@ bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) /* If it's real memory always allow */ if (pfn_valid(pfn)) return true; - if (pfn > l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN)) + if (pfn >= l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN)) return false; return true; } diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1555bd7d3449..3d0c83ef6aab 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -512,6 +512,17 @@ static int free_ram_pages_type(u64 start, u64 end) return 0; } +static u64 sanitize_phys(u64 address) +{ + /* + * When changing the memtype for pages containing poison allow + * for a "decoy" virtual address (bit 63 clear) passed to + * set_memory_X(). __pa() on a "decoy" address results in a + * physical address with bit 63 set. 
+ */ + return address & __PHYSICAL_MASK; +} + /* * req_type typically has one of the: * - _PAGE_CACHE_MODE_WB @@ -533,6 +544,8 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, int is_range_ram; int err = 0; + start = sanitize_phys(start); + end = sanitize_phys(end); BUG_ON(start >= end); /* end is exclusive */ if (!pat_enabled()) { @@ -609,6 +622,9 @@ int free_memtype(u64 start, u64 end) if (!pat_enabled()) return 0; + start = sanitize_phys(start); + end = sanitize_phys(end); + /* Low ISA region is always mapped WB. No need to track */ if (x86_platform.is_untracked_pat_range(start, end)) return 0; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3ef095c70ae3..e848a4811785 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -63,7 +63,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); - tlb_remove_table(tlb, pte); + paravirt_tlb_remove_table(tlb, pte); } #if CONFIG_PGTABLE_LEVELS > 2 @@ -79,21 +79,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) tlb->need_flush_all = 1; #endif pgtable_pmd_page_dtor(page); - tlb_remove_table(tlb, page); + paravirt_tlb_remove_table(tlb, page); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - tlb_remove_table(tlb, virt_to_page(pud)); + paravirt_tlb_remove_table(tlb, virt_to_page(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); - tlb_remove_table(tlb, virt_to_page(p4d)); + paravirt_tlb_remove_table(tlb, virt_to_page(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 752dbf4e0e50..9517d1b2a281 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -7,7 +7,6 @@ #include <linux/export.h> #include <linux/cpu.h> #include <linux/debugfs.h> -#include <linux/gfp.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -186,11 +185,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, { struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; - bool need_flush; - u16 new_asid; /* * NB: The scheduler will call us with prev == next when switching @@ -244,41 +240,20 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, next->context.ctx_id); /* - * Even in lazy TLB mode, the CPU should stay set in the - * mm_cpumask. The TLB shootdown code can figure out from - * from cpu_tlbstate.is_lazy whether or not to send an IPI. + * We don't currently support having a real mm loaded without + * our cpu set in mm_cpumask(). We have all the bookkeeping + * in place to figure out whether we would need to flush + * if our cpu were cleared in mm_cpumask(), but we don't + * currently use it. */ if (WARN_ON_ONCE(real_prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); - /* - * If the CPU is not in lazy TLB mode, we are just switching - * from one thread in a process to another thread in the same - * process. No TLB flush required. - */ - if (!was_lazy) - return; - - /* - * Read the tlb_gen to check whether a flush is needed. - * If the TLB is up to date, just use it. 
- * The barrier synchronizes with the tlb_gen increment in - * the TLB shootdown code. - */ - smp_mb(); - next_tlb_gen = atomic64_read(&next->context.tlb_gen); - if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) == - next_tlb_gen) - return; - - /* - * TLB contents went out of date while we were in lazy - * mode. Fall through to the TLB switching code below. - */ - new_asid = prev_asid; - need_flush = true; + return; } else { + u16 new_asid; + bool need_flush; u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); /* @@ -329,41 +304,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, next_tlb_gen = atomic64_read(&next->context.tlb_gen); choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); - } - if (need_flush) { - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, true); + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, new_asid, true); + + /* + * NB: This gets called via leave_mm() in the idle path + * where RCU functions differently. Tracing normally + * uses RCU, so we need to use the _rcuidle variant. + * + * (There is no good reason for this. The idle code should + * be rearranged to call this before rcu_idle_enter().) + */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + load_new_mm_cr3(next->pgd, new_asid, false); + + /* See above wrt _rcuidle. */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); + } /* - * NB: This gets called via leave_mm() in the idle path - * where RCU functions differently. Tracing normally - * uses RCU, so we need to use the _rcuidle variant. - * - * (There is no good reason for this. The idle code should - * be rearranged to call this before rcu_idle_enter().) + * Record last user mm's context id, so we can avoid + * flushing branch buffer with IBPB if we switch back + * to the same user. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - } else { - /* The new ASID is already up to date. */ - load_new_mm_cr3(next->pgd, new_asid, false); + if (next != &init_mm) + this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); - /* See above wrt _rcuidle. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); + this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); } - /* - * Record last user mm's context id, so we can avoid - * flushing branch buffer with IBPB if we switch back - * to the same user. - */ - if (next != &init_mm) - this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); - - this_cpu_write(cpu_tlbstate.loaded_mm, next); - this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); - load_mm_cr4(next); switch_ldt(real_prev, next); } @@ -386,7 +361,20 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) return; - this_cpu_write(cpu_tlbstate.is_lazy, true); + if (tlb_defer_switch_to_init_mm()) { + /* + * There's a significant optimization that may be possible + * here. We have accurate enough TLB flush tracking that we + * don't need to maintain coherence of TLB per se when we're + * lazy. We do, however, need to maintain coherence of + * paging-structure caches. 
We could, in principle, leave our + * old mm loaded and only switch to init_mm when + * tlb_remove_page() happens. + */ + this_cpu_write(cpu_tlbstate.is_lazy, true); + } else { + switch_mm(NULL, &init_mm, NULL); + } } /* @@ -473,9 +461,6 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * paging-structure cache to avoid speculatively reading * garbage into our TLB. Since switching to init_mm is barely * slower than a minimal flush, just switch to init_mm. - * - * This should be rare, with native_flush_tlb_others skipping - * IPIs to lazy TLB mode CPUs. */ switch_mm_irqs_off(NULL, &init_mm, NULL); return; @@ -582,9 +567,6 @@ static void flush_tlb_func_remote(void *info) void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info) { - cpumask_var_t lazymask; - unsigned int cpu; - count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); if (info->end == TLB_FLUSH_ALL) trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); @@ -608,6 +590,8 @@ void native_flush_tlb_others(const struct cpumask *cpumask, * that UV should be updated so that smp_call_function_many(), * etc, are optimal on UV. */ + unsigned int cpu; + cpu = smp_processor_id(); cpumask = uv_flush_tlb_others(cpumask, info); if (cpumask) @@ -615,29 +599,8 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (void *)info, 1); return; } - - /* - * A temporary cpumask is used in order to skip sending IPIs - * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm). - * If the allocation fails, simply IPI every CPU in mm_cpumask. - */ - if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) { - smp_call_function_many(cpumask, flush_tlb_func_remote, - (void *)info, 1); - return; - } - - cpumask_copy(lazymask, cpumask); - - for_each_cpu(cpu, lazymask) { - if (per_cpu(cpu_tlbstate.is_lazy, cpu)) - cpumask_clear_cpu(cpu, lazymask); - } - - smp_call_function_many(lazymask, flush_tlb_func_remote, + smp_call_function_many(cpumask, flush_tlb_func_remote, (void *)info, 1); - - free_cpumask_var(lazymask); } /* @@ -690,68 +653,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, put_cpu(); } -void tlb_flush_remove_tables_local(void *arg) -{ - struct mm_struct *mm = arg; - - if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm && - this_cpu_read(cpu_tlbstate.is_lazy)) { - /* - * We're in lazy mode. We need to at least flush our - * paging-structure cache to avoid speculatively reading - * garbage into our TLB. Since switching to init_mm is barely - * slower than a minimal flush, just switch to init_mm. - */ - switch_mm_irqs_off(NULL, &init_mm, NULL); - } -} - -static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm, - struct cpumask *lazy_cpus) -{ - int cpu; - - for_each_cpu(cpu, mm_cpumask(mm)) { - if (!per_cpu(cpu_tlbstate.is_lazy, cpu)) - cpumask_set_cpu(cpu, lazy_cpus); - } -} - -void tlb_flush_remove_tables(struct mm_struct *mm) -{ - int cpu = get_cpu(); - cpumask_var_t lazy_cpus; - - if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) { - put_cpu(); - return; - } - - if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) { - /* - * If the cpumask allocation fails, do a brute force flush - * on all the CPUs that have this mm loaded. - */ - smp_call_function_many(mm_cpumask(mm), - tlb_flush_remove_tables_local, (void *)mm, 1); - put_cpu(); - return; - } - - /* - * CPUs with !is_lazy either received a TLB flush IPI while the user - * pages in this address range were unmapped, or have context switched - * and reloaded %CR3 since then. 
- * - * Shootdown IPIs at page table freeing time only need to be sent to - * CPUs that may have out of date TLB contents. - */ - mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus); - smp_call_function_many(lazy_cpus, - tlb_flush_remove_tables_local, (void *)mm, 1); - free_cpumask_var(lazy_cpus); - put_cpu(); -} static void do_flush_tlb_all(void *info) { diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index ee3b00c7acda..52a7c3faee0c 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -122,6 +122,8 @@ static void __init xen_banner(void) static void __init xen_pv_init_platform(void) { + populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP)); + set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info); HYPERVISOR_shared_info = (void *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); @@ -1170,13 +1172,13 @@ static void __init xen_boot_params_init_edd(void) * we do this, we have to be careful not to call any stack-protected * function, which is most of the kernel. */ -static void xen_setup_gdt(int cpu) +static void __init xen_setup_gdt(int cpu) { pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; pv_cpu_ops.load_gdt = xen_load_gdt_boot; - setup_stack_canary_segment(0); - switch_to_new_gdt(0); + setup_stack_canary_segment(cpu); + switch_to_new_gdt(cpu); pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry; pv_cpu_ops.load_gdt = xen_load_gdt; @@ -1385,8 +1387,11 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_boot_params_init_edd(); } - add_preferred_console("tty", 0, NULL); + if (!boot_params.screen_info.orig_video_isVGA) + add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); + if (boot_params.screen_info.orig_video_isVGA) + add_preferred_console("tty", 0, NULL); #ifdef CONFIG_PCI /* PCI BIOS service won't work from a PV guest. */ diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 52206ad81e4b..45b700ac5fe7 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -67,6 +67,7 @@ #include <asm/init.h> #include <asm/pat.h> #include <asm/smp.h> +#include <asm/tlb.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> @@ -2171,6 +2172,8 @@ void __init xen_relocate_p2m(void) #else /* !CONFIG_X86_64 */ static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); +RESERVE_BRK(fixup_kernel_pmd, PAGE_SIZE); +RESERVE_BRK(fixup_kernel_pte, PAGE_SIZE); static void __init xen_write_cr3_init(unsigned long cr3) { @@ -2397,6 +2400,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .flush_tlb_kernel = xen_flush_tlb, .flush_tlb_one_user = xen_flush_tlb_one_user, .flush_tlb_others = xen_flush_tlb_others, + .tlb_remove_table = tlb_remove_table, .pgd_alloc = xen_pgd_alloc, .pgd_free = xen_pgd_free, diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 6e0d2086eacb..1163e33121fb 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -906,37 +906,6 @@ char * __init xen_memory_setup(void) } /* - * Machine specific memory setup for auto-translated guests. 
- */ -char * __init xen_auto_xlated_memory_setup(void) -{ - struct xen_memory_map memmap; - int i; - int rc; - - memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries); - set_xen_guest_handle(memmap.buffer, xen_e820_table.entries); - - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); - if (rc < 0) - panic("No memory map (%d)\n", rc); - - xen_e820_table.nr_entries = memmap.nr_entries; - - e820__update_table(&xen_e820_table); - - for (i = 0; i < xen_e820_table.nr_entries; i++) - e820__range_add(xen_e820_table.entries[i].addr, xen_e820_table.entries[i].size, xen_e820_table.entries[i].type); - - /* Remove p2m info, it is not needed. */ - xen_start_info->mfn_list = 0; - xen_start_info->first_p2m_pfn = 0; - xen_start_info->nr_p2m_frames = 0; - - return "Xen"; -} - -/* * Set the bit indicating "nosegneg" library variants should be used. * We only need to bother in pure 32-bit mode; compat 32-bit processes * can have un-truncated segments, so wrapping around is allowed. diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index e78684597f57..0e60bd918695 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -50,7 +50,6 @@ void __init xen_inv_extra_mem(void); void __init xen_remap_memory(void); phys_addr_t __init xen_find_free_area(phys_addr_t size); char * __init xen_memory_setup(void); -char * xen_auto_xlated_memory_setup(void); void __init xen_arch_setup(void); void xen_enable_sysenter(void); void xen_enable_syscall(void); |
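A recurring thread in the hunks above (Kconfig, paravirt_types.h, paravirt.h, mm/pgtable.c, hyperv/mmu.c, kernel/kvm.c, xen/mmu_pv.c) is that freeing of page-table pages is now routed through a new pv_mmu_ops.tlb_remove_table hook: bare metal keeps the plain tlb_remove_page() path, while guests whose remote TLB flushes go through a hypercall (Xen PV, Hyper-V, KVM with the paravirt TLB flush feature) install the RCU-deferred tlb_remove_table(), which is also why the Kconfig hunk makes HAVE_RCU_TABLE_FREE conditional on PARAVIRT. The stand-alone C sketch below only models that indirection; the names mirror the diff, but the function bodies are simplified stand-ins, not the real kernel implementations.

```c
/*
 * Stand-alone model of the indirection added by this merge.  NOT kernel
 * code: struct and function names mirror the diff, the bodies are
 * printf() stand-ins so the example compiles and runs on its own.
 */
#include <stdio.h>

struct mmu_gather { const char *who; };

/*
 * Native case: tlbflush.h maps paravirt_tlb_remove_table() straight to
 * tlb_remove_page(), i.e. the page-table page is batched and freed
 * after the ordinary IPI-based TLB flush.
 */
static void tlb_remove_page(struct mmu_gather *tlb, void *table)
{
	printf("[%s] free page table %p after the normal TLB flush\n",
	       tlb->who, table);
}

/*
 * Paravirt case: remote flushes are hypercalls rather than IPIs, so the
 * flush IPI can no longer be relied on to synchronize with lockless
 * page-table walkers; freeing is deferred through an RCU grace period
 * instead (HAVE_RCU_TABLE_FREE if PARAVIRT in the Kconfig hunk).
 */
static void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	printf("[%s] queue page table %p for RCU-deferred freeing\n",
	       tlb->who, table);
}

/* Mirrors the new pv_mmu_ops.tlb_remove_table member. */
struct pv_mmu_ops {
	void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
};

static struct pv_mmu_ops pv_mmu_ops = {
	/* Default installed in paravirt.c: plain tlb_remove_page(). */
	.tlb_remove_table = tlb_remove_page,
};

/*
 * Mirrors paravirt.h: ___pte_free_tlb() and friends in mm/pgtable.c now
 * call this wrapper instead of calling tlb_remove_table() directly.
 */
static void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	pv_mmu_ops.tlb_remove_table(tlb, table);
}

int main(void)
{
	struct mmu_gather native = { "native" }, guest = { "xen-pv" };
	int dummy_table;

	paravirt_tlb_remove_table(&native, &dummy_table);

	/*
	 * Xen PV, Hyper-V and KVM (with PV TLB flush) override the hook,
	 * as the hunks in mmu_pv.c, hyperv/mmu.c and kernel/kvm.c do.
	 */
	pv_mmu_ops.tlb_remove_table = tlb_remove_table;
	paravirt_tlb_remove_table(&guest, &dummy_table);

	return 0;
}
```

The point of this arrangement is that the call sites in mm/pgtable.c stay identical for every configuration; only the function pointer installed at boot differs, so the earlier lazy-TLB shootdown machinery removed from mm/tlb.c is no longer needed.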