diff options
Diffstat (limited to 'arch/x86')
220 files changed, 5653 insertions, 4559 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ccd5ff7d1879..64024c999531 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -9,28 +9,50 @@ config 64BIT config X86_32 def_bool y depends on !64BIT + # Options that are inherently 32-bit kernel only: + select ARCH_WANT_IPC_PARSE_VERSION + select CLKSRC_I8253 + select CLONE_BACKWARDS + select HAVE_AOUT + select HAVE_GENERIC_DMA_COHERENT + select MODULES_USE_ELF_REL + select OLD_SIGACTION config X86_64 def_bool y depends on 64BIT + # Options that are inherently 64-bit kernel only: + select ARCH_HAS_GIGANTIC_PAGE + select ARCH_SUPPORTS_INT128 + select ARCH_USE_CMPXCHG_LOCKREF + select HAVE_ARCH_SOFT_DIRTY + select MODULES_USE_ELF_RELA + select X86_DEV_DMA_OPS -### Arch settings +# +# Arch settings +# +# ( Note that options that are marked 'if X86_64' could in principle be +# ported to 32-bit as well. ) +# config X86 def_bool y + # + # Note: keep this list sorted alphabetically + # select ACPI_LEGACY_TABLES_LOOKUP if ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ANON_INODES select ARCH_CLOCKSOURCE_DATA select ARCH_DISCARD_MEMBLOCK - select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI + select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if X86_64 select ARCH_HAS_KCOV if X86_64 - select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_MMIO_FLUSH + select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_SG_CHAIN select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG @@ -39,23 +61,17 @@ config X86 select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT - select ARCH_SUPPORTS_INT128 if X86_64 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_CMPXCHG_LOCKREF if X86_64 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP - select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_FRAME_POINTERS - select ARCH_WANT_IPC_PARSE_VERSION if X86_32 + select ARCH_WANTS_DYNAMIC_TASK_STRUCT select BUILDTIME_EXTABLE_SORT select CLKEVT_I8253 - select CLKSRC_I8253 if X86_32 select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CLOCKSOURCE_WATCHDOG - select CLONE_BACKWARDS if X86_32 - select COMPAT_OLD_SIGACTION if IA32_EMULATION select DCACHE_WORD_ACCESS select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT @@ -77,7 +93,6 @@ config X86 select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI select HAVE_ALIGNED_STRUCT_PAGE if SLUB - select HAVE_AOUT if X86_32 select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE @@ -88,12 +103,10 @@ config X86 select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT select HAVE_ARCH_SECCOMP_FILTER - select HAVE_ARCH_SOFT_DIRTY if X86_64 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE - select HAVE_ARCH_WITHIN_STACK_FRAMES - select HAVE_EBPF_JIT if X86_64 select HAVE_ARCH_VMAP_STACK if X86_64 + select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL @@ -106,6 +119,7 @@ config X86 select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_EBPF_JIT if X86_64 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EXIT_THREAD select HAVE_FENTRY if X86_64 @@ -113,7 +127,6 @@ config X86 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS - select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_HW_BREAKPOINT select HAVE_IDE select HAVE_IOREMAP_PROT @@ -142,15 +155,11 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS - select HAVE_UID16 if X86_32 || IA32_EMULATION select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER select IRQ_FORCED_THREADING - select MODULES_USE_ELF_RELA if X86_64 - select MODULES_USE_ELF_REL if X86_32 - select OLD_SIGACTION if X86_32 - select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION select PERF_EVENTS select RTC_LIB select RTC_MC146818_LIB @@ -160,11 +169,7 @@ config X86 select THREAD_INFO_IN_TASK select USER_STACKTRACE_SUPPORT select VIRT_TO_BUS - select X86_DEV_DMA_OPS if X86_64 select X86_FEATURE_NAMES if PROC_FS - select HAVE_STACK_VALIDATION if X86_64 - select ARCH_USES_HIGH_VMA_FLAGS if X86_INTEL_MEMORY_PROTECTION_KEYS - select ARCH_HAS_PKEYS if X86_INTEL_MEMORY_PROTECTION_KEYS config INSTRUCTION_DECODER def_bool y @@ -927,6 +932,27 @@ config SCHED_MC making when dealing with multi-core CPU chips at a cost of slightly increased overhead in some places. If unsure say N here. +config SCHED_MC_PRIO + bool "CPU core priorities scheduler support" + depends on SCHED_MC && CPU_SUP_INTEL + select X86_INTEL_PSTATE + select CPU_FREQ + default y + ---help--- + Intel Turbo Boost Max Technology 3.0 enabled CPUs have a + core ordering determined at manufacturing time, which allows + certain cores to reach higher turbo frequencies (when running + single threaded workloads) than others. + + Enabling this kernel feature teaches the scheduler about + the TBM3 (aka ITMT) priority order of the CPU cores and adjusts the + scheduler's CPU selection logic accordingly, so that higher + overall system performance can be achieved. + + This feature will have no effect on CPUs without this feature. + + If unsure say Y here. + source "kernel/Kconfig.preempt" config UP_LATE_INIT @@ -1013,7 +1039,7 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC && AMD_NB ---help--- Additional support for AMD specific MCE features such as the DRAM Error Threshold. @@ -1513,7 +1539,7 @@ config X86_CHECK_BIOS_CORRUPTION line. By default it scans the low 64k of memory every 60 seconds; see the memory_corruption_check_size and memory_corruption_check_period parameters in - Documentation/kernel-parameters.txt to adjust this. + Documentation/admin-guide/kernel-parameters.rst to adjust this. When enabled with the default parameters, this option has almost no overhead, as it reserves a relatively small amount @@ -1725,6 +1751,8 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS def_bool y # Note: only available in 64-bit mode depends on CPU_SUP_INTEL && X86_64 + select ARCH_USES_HIGH_VMA_FLAGS + select ARCH_HAS_PKEYS ---help--- Memory Protection Keys provides a mechanism for enforcing page-based protections, but without requiring modification of the @@ -2080,7 +2108,7 @@ config DEBUG_HOTPLUG_CPU0 config COMPAT_VDSO def_bool n prompt "Disable the 32-bit vDSO (needed for glibc 2.3.3)" - depends on X86_32 || IA32_EMULATION + depends on COMPAT_32 ---help--- Certain buggy versions of glibc will crash if they are presented with a 32-bit vDSO that is not mapped at the address @@ -2682,9 +2710,10 @@ source "fs/Kconfig.binfmt" config IA32_EMULATION bool "IA32 Emulation" depends on X86_64 + select ARCH_WANT_OLD_COMPAT_IPC select BINFMT_ELF select COMPAT_BINFMT_ELF - select ARCH_WANT_OLD_COMPAT_IPC + select COMPAT_OLD_SIGACTION ---help--- Include code to run legacy 32-bit programs under a 64-bit kernel. You should likely turn this on, unless you're @@ -2709,6 +2738,12 @@ config X86_X32 elf32_x86_64 support enabled to compile a kernel with this option set. +config COMPAT_32 + def_bool y + depends on IA32_EMULATION || X86_32 + select HAVE_UID16 + select OLD_SIGSUSPEND3 + config COMPAT def_bool y depends on IA32_EMULATION || X86_X32 diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 12ea8f8384f4..0d810fb15eac 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -65,7 +65,7 @@ clean-files += cpustr.h # --------------------------------------------------------------------------- -KBUILD_CFLAGS := $(USERINCLUDE) $(REALMODE_CFLAGS) -D_SETUP +KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n UBSAN_SANITIZE := n diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 536ccfcc01c6..44163e8c3868 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -25,7 +25,7 @@ KCOV_INSTRUMENT := n targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 -KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 +KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ -O2 KBUILD_CFLAGS += -fno-strict-aliasing $(call cc-option, -fPIE, -fPIC) KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING cflags-$(CONFIG_X86_32) := -march=i386 @@ -40,8 +40,8 @@ GCOV_PROFILE := n UBSAN_SANITIZE :=n LDFLAGS := -m elf_$(UTS_MACHINE) -ifeq ($(CONFIG_RELOCATABLE),y) -# If kernel is relocatable, build compressed kernel as PIE. +# Compressed kernel should be built as PIE since it may be loaded at any +# address by the bootloader. ifeq ($(CONFIG_X86_32),y) LDFLAGS += $(call ld-option, -pie) $(call ld-option, --no-dynamic-linker) else @@ -51,7 +51,6 @@ else LDFLAGS += $(shell $(LD) --help 2>&1 | grep -q "\-z noreloc-overflow" \ && echo "-z noreloc-overflow -pie --no-dynamic-linker") endif -endif LDFLAGS_vmlinux := -T hostprogs-y := mkpiggy diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index cc69e37548db..ff01c8fc76f7 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -537,6 +537,69 @@ free_handle: efi_call_early(free_pool, pci_handle); } +static void retrieve_apple_device_properties(struct boot_params *boot_params) +{ + efi_guid_t guid = APPLE_PROPERTIES_PROTOCOL_GUID; + struct setup_data *data, *new; + efi_status_t status; + u32 size = 0; + void *p; + + status = efi_call_early(locate_protocol, &guid, NULL, &p); + if (status != EFI_SUCCESS) + return; + + if (efi_table_attr(apple_properties_protocol, version, p) != 0x10000) { + efi_printk(sys_table, "Unsupported properties proto version\n"); + return; + } + + efi_call_proto(apple_properties_protocol, get_all, p, NULL, &size); + if (!size) + return; + + do { + status = efi_call_early(allocate_pool, EFI_LOADER_DATA, + size + sizeof(struct setup_data), &new); + if (status != EFI_SUCCESS) { + efi_printk(sys_table, + "Failed to alloc mem for properties\n"); + return; + } + + status = efi_call_proto(apple_properties_protocol, get_all, p, + new->data, &size); + + if (status == EFI_BUFFER_TOO_SMALL) + efi_call_early(free_pool, new); + } while (status == EFI_BUFFER_TOO_SMALL); + + new->type = SETUP_APPLE_PROPERTIES; + new->len = size; + new->next = 0; + + data = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; + if (!data) + boot_params->hdr.setup_data = (unsigned long)new; + else { + while (data->next) + data = (struct setup_data *)(unsigned long)data->next; + data->next = (unsigned long)new; + } +} + +static void setup_quirks(struct boot_params *boot_params) +{ + efi_char16_t const apple[] = { 'A', 'p', 'p', 'l', 'e', 0 }; + efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long) + efi_table_attr(efi_system_table, fw_vendor, sys_table); + + if (!memcmp(fw_vendor, apple, sizeof(apple))) { + if (IS_ENABLED(CONFIG_APPLE_PROPERTIES)) + retrieve_apple_device_properties(boot_params); + } +} + static efi_status_t setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height) { @@ -1098,6 +1161,8 @@ struct boot_params *efi_main(struct efi_config *c, setup_efi_pci(boot_params); + setup_quirks(boot_params); + status = efi_call_early(allocate_pool, EFI_LOADER_DATA, sizeof(*gdt), (void **)&gdt); if (status != EFI_SUCCESS) { diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index efdfba21a5b2..4d85e600db78 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -119,8 +119,7 @@ ENTRY(startup_32) */ /* Load new GDT with the 64bit segments using 32bit descriptor */ - leal gdt(%ebp), %eax - movl %eax, gdt+2(%ebp) + addl %ebp, gdt+2(%ebp) lgdt gdt(%ebp) /* Enable PAE mode */ diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c index 26240dde081e..4224ede43b4e 100644 --- a/arch/x86/boot/cpu.c +++ b/arch/x86/boot/cpu.c @@ -87,6 +87,12 @@ int validate_cpu(void) return -1; } + if (CONFIG_X86_MINIMUM_CPU_FAMILY <= 4 && !IS_ENABLED(CONFIG_M486) && + !has_eflag(X86_EFLAGS_ID)) { + printf("This kernel requires a CPU with the CPUID instruction. Build with CONFIG_M486=y to run on this CPU.\n"); + return -1; + } + if (err_flags) { puts("This kernel requires the following features " "not present on the CPU:\n"); diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 0ab5ee1c26af..31c34ee131f3 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -21,7 +21,6 @@ #include <linux/hardirq.h> #include <linux/types.h> -#include <linux/crypto.h> #include <linux/module.h> #include <linux/err.h> #include <crypto/algapi.h> @@ -29,14 +28,14 @@ #include <crypto/cryptd.h> #include <crypto/ctr.h> #include <crypto/b128ops.h> -#include <crypto/lrw.h> #include <crypto/xts.h> #include <asm/cpu_device_id.h> #include <asm/fpu/api.h> #include <asm/crypto/aes.h> -#include <crypto/ablk_helper.h> #include <crypto/scatterwalk.h> #include <crypto/internal/aead.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/skcipher.h> #include <linux/workqueue.h> #include <linux/spinlock.h> #ifdef CONFIG_X86_64 @@ -45,28 +44,26 @@ #define AESNI_ALIGN 16 +#define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN))) #define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1)) #define RFC4106_HASH_SUBKEY_SIZE 16 +#define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1)) +#define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA) +#define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA) /* This data is stored at the end of the crypto_tfm struct. * It's a type of per "session" data storage location. * This needs to be 16 byte aligned. */ struct aesni_rfc4106_gcm_ctx { - u8 hash_subkey[16] __attribute__ ((__aligned__(AESNI_ALIGN))); - struct crypto_aes_ctx aes_key_expanded - __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 hash_subkey[16] AESNI_ALIGN_ATTR; + struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; u8 nonce[4]; }; -struct aesni_lrw_ctx { - struct lrw_table_ctx lrw_table; - u8 raw_aes_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; -}; - struct aesni_xts_ctx { - u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; - u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; + u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR; + u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR; }; asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, @@ -360,96 +357,95 @@ static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) aesni_dec(ctx, dst, src); } -static int ecb_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int aesni_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int len) +{ + return aes_set_key_common(crypto_skcipher_tfm(tfm), + crypto_skcipher_ctx(tfm), key, len); +} + +static int ecb_encrypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } kernel_fpu_end(); return err; } -static int ecb_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } kernel_fpu_end(); return err; } -static int cbc_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_encrypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } kernel_fpu_end(); return err; } -static int cbc_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int cbc_decrypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } kernel_fpu_end(); @@ -458,7 +454,7 @@ static int cbc_decrypt(struct blkcipher_desc *desc, #ifdef CONFIG_X86_64 static void ctr_crypt_final(struct crypto_aes_ctx *ctx, - struct blkcipher_walk *walk) + struct skcipher_walk *walk) { u8 *ctrblk = walk->iv; u8 keystream[AES_BLOCK_SIZE]; @@ -491,157 +487,53 @@ static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, } #endif -static int ctr_crypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int ctr_crypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, true); kernel_fpu_begin(); while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); nbytes &= AES_BLOCK_SIZE - 1; - err = blkcipher_walk_done(desc, &walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } if (walk.nbytes) { ctr_crypt_final(ctx, &walk); - err = blkcipher_walk_done(desc, &walk, 0); + err = skcipher_walk_done(&walk, 0); } kernel_fpu_end(); return err; } -#endif - -static int ablk_ecb_init(struct crypto_tfm *tfm) -{ - return ablk_init_common(tfm, "__driver-ecb-aes-aesni"); -} - -static int ablk_cbc_init(struct crypto_tfm *tfm) -{ - return ablk_init_common(tfm, "__driver-cbc-aes-aesni"); -} - -#ifdef CONFIG_X86_64 -static int ablk_ctr_init(struct crypto_tfm *tfm) -{ - return ablk_init_common(tfm, "__driver-ctr-aes-aesni"); -} - -#endif - -#if IS_ENABLED(CONFIG_CRYPTO_PCBC) -static int ablk_pcbc_init(struct crypto_tfm *tfm) -{ - return ablk_init_common(tfm, "fpu(pcbc(__driver-aes-aesni))"); -} -#endif - -static void lrw_xts_encrypt_callback(void *ctx, u8 *blks, unsigned int nbytes) -{ - aesni_ecb_enc(ctx, blks, blks, nbytes); -} -static void lrw_xts_decrypt_callback(void *ctx, u8 *blks, unsigned int nbytes) -{ - aesni_ecb_dec(ctx, blks, blks, nbytes); -} - -static int lrw_aesni_setkey(struct crypto_tfm *tfm, const u8 *key, +static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { - struct aesni_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); int err; - err = aes_set_key_common(tfm, ctx->raw_aes_ctx, key, - keylen - AES_BLOCK_SIZE); + err = xts_verify_key(tfm, key, keylen); if (err) return err; - return lrw_init_table(&ctx->lrw_table, key + keylen - AES_BLOCK_SIZE); -} - -static void lrw_aesni_exit_tfm(struct crypto_tfm *tfm) -{ - struct aesni_lrw_ctx *ctx = crypto_tfm_ctx(tfm); - - lrw_free_table(&ctx->lrw_table); -} - -static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct aesni_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[8]; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = aes_ctx(ctx->raw_aes_ctx), - .crypt_fn = lrw_xts_encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - kernel_fpu_begin(); - ret = lrw_crypt(desc, dst, src, nbytes, &req); - kernel_fpu_end(); - - return ret; -} - -static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct aesni_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[8]; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = aes_ctx(ctx->raw_aes_ctx), - .crypt_fn = lrw_xts_decrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - kernel_fpu_begin(); - ret = lrw_crypt(desc, dst, src, nbytes, &req); - kernel_fpu_end(); - - return ret; -} - -static int xts_aesni_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) -{ - struct aesni_xts_ctx *ctx = crypto_tfm_ctx(tfm); - int err; - - err = xts_check_key(tfm, key, keylen); - if (err) - return err; + keylen /= 2; /* first half of xts-key is for crypt */ - err = aes_set_key_common(tfm, ctx->raw_crypt_ctx, key, keylen / 2); + err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx, + key, keylen); if (err) return err; /* second half of xts-key is for tweak */ - return aes_set_key_common(tfm, ctx->raw_tweak_ctx, key + keylen / 2, - keylen / 2); + return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx, + key + keylen, keylen); } @@ -650,8 +542,6 @@ static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in) aesni_enc(ctx, out, in); } -#ifdef CONFIG_X86_64 - static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) { glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc)); @@ -698,83 +588,28 @@ static const struct common_glue_ctx aesni_dec_xts = { } } }; -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int xts_encrypt(struct skcipher_request *req) { - struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(aesni_xts_tweak), - aes_ctx(ctx->raw_tweak_ctx), - aes_ctx(ctx->raw_crypt_ctx)); + return glue_xts_req_128bit(&aesni_enc_xts, req, + XTS_TWEAK_CAST(aesni_xts_tweak), + aes_ctx(ctx->raw_tweak_ctx), + aes_ctx(ctx->raw_crypt_ctx)); } -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int xts_decrypt(struct skcipher_request *req) { - struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - - return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(aesni_xts_tweak), - aes_ctx(ctx->raw_tweak_ctx), - aes_ctx(ctx->raw_crypt_ctx)); -} - -#else - -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[8]; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx), - .tweak_fn = aesni_xts_tweak, - .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx), - .crypt_fn = lrw_xts_encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - kernel_fpu_begin(); - ret = xts_crypt(desc, dst, src, nbytes, &req); - kernel_fpu_end(); - - return ret; -} - -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[8]; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx), - .tweak_fn = aesni_xts_tweak, - .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx), - .crypt_fn = lrw_xts_decrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - kernel_fpu_begin(); - ret = xts_crypt(desc, dst, src, nbytes, &req); - kernel_fpu_end(); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return ret; + return glue_xts_req_128bit(&aesni_dec_xts, req, + XTS_TWEAK_CAST(aesni_xts_tweak), + aes_ctx(ctx->raw_tweak_ctx), + aes_ctx(ctx->raw_crypt_ctx)); } -#endif - -#ifdef CONFIG_X86_64 static int rfc4106_init(struct crypto_aead *aead) { struct cryptd_aead *cryptd_tfm; @@ -888,7 +723,7 @@ static int helper_rfc4106_encrypt(struct aead_request *req) unsigned long auth_tag_len = crypto_aead_authsize(tfm); u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); struct scatter_walk src_sg_walk; - struct scatter_walk dst_sg_walk; + struct scatter_walk dst_sg_walk = {}; unsigned int i; /* Assuming we are supporting rfc4106 64-bit extended */ @@ -968,7 +803,7 @@ static int helper_rfc4106_decrypt(struct aead_request *req) u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); u8 authTag[16]; struct scatter_walk src_sg_walk; - struct scatter_walk dst_sg_walk; + struct scatter_walk dst_sg_walk = {}; unsigned int i; if (unlikely(req->assoclen != 16 && req->assoclen != 20)) @@ -1077,9 +912,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx) + - AESNI_ALIGN - 1, - .cra_alignmask = 0, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, .cra_u = { .cipher = { @@ -1091,14 +924,12 @@ static struct crypto_alg aesni_algs[] = { { } } }, { - .cra_name = "__aes-aesni", - .cra_driver_name = "__driver-aes-aesni", - .cra_priority = 0, + .cra_name = "__aes", + .cra_driver_name = "__aes-aesni", + .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx) + - AESNI_ALIGN - 1, - .cra_alignmask = 0, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, .cra_u = { .cipher = { @@ -1109,250 +940,94 @@ static struct crypto_alg aesni_algs[] = { { .cia_decrypt = __aes_decrypt } } -}, { - .cra_name = "__ecb-aes-aesni", - .cra_driver_name = "__driver-ecb-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx) + - AESNI_ALIGN - 1, - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = aes_set_key, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "__cbc-aes-aesni", - .cra_driver_name = "__driver-cbc-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx) + - AESNI_ALIGN - 1, - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = aes_set_key, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "ecb(aes)", - .cra_driver_name = "ecb-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_ecb_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, +} }; + +static struct skcipher_alg aesni_skciphers[] = { + { + .base = { + .cra_name = "__ecb(aes)", + .cra_driver_name = "__ecb-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, + .cra_module = THIS_MODULE, }, - }, -}, { - .cra_name = "cbc(aes)", - .cra_driver_name = "cbc-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_cbc_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = aesni_skcipher_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base = { + .cra_name = "__cbc(aes)", + .cra_driver_name = "__cbc-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, + .cra_module = THIS_MODULE, }, - }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aesni_skcipher_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, #ifdef CONFIG_X86_64 -}, { - .cra_name = "__ctr-aes-aesni", - .cra_driver_name = "__driver-ctr-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct crypto_aes_ctx) + - AESNI_ALIGN - 1, - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = aes_set_key, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, + }, { + .base = { + .cra_name = "__ctr(aes)", + .cra_driver_name = "__ctr-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, + .cra_module = THIS_MODULE, }, - }, -}, { - .cra_name = "ctr(aes)", - .cra_driver_name = "ctr-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_ctr_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_encrypt, - .geniv = "chainiv", + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, + .setkey = aesni_skcipher_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, { + .base = { + .cra_name = "__xts(aes)", + .cra_driver_name = "__xts-aes-aesni", + .cra_priority = 401, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = XTS_AES_CTX_SIZE, + .cra_module = THIS_MODULE, }, - }, + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = xts_aesni_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, #endif + } +}; + +struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)]; + +struct { + const char *algname; + const char *drvname; + const char *basename; + struct simd_skcipher_alg *simd; +} aesni_simd_skciphers2[] = { #if IS_ENABLED(CONFIG_CRYPTO_PCBC) -}, { - .cra_name = "pcbc(aes)", - .cra_driver_name = "pcbc-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_pcbc_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, + { + .algname = "pcbc(aes)", + .drvname = "pcbc-aes-aesni", + .basename = "fpu(pcbc(__aes-aesni))", }, #endif -}, { - .cra_name = "__lrw-aes-aesni", - .cra_driver_name = "__driver-lrw-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct aesni_lrw_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_exit = lrw_aesni_exit_tfm, - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, - .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = lrw_aesni_setkey, - .encrypt = lrw_encrypt, - .decrypt = lrw_decrypt, - }, - }, -}, { - .cra_name = "__xts-aes-aesni", - .cra_driver_name = "__driver-xts-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct aesni_xts_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = 2 * AES_MIN_KEY_SIZE, - .max_keysize = 2 * AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = xts_aesni_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, - }, -}, { - .cra_name = "lrw(aes)", - .cra_driver_name = "lrw-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, - .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "xts(aes)", - .cra_driver_name = "xts-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = 2 * AES_MIN_KEY_SIZE, - .max_keysize = 2 * AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -} }; +}; #ifdef CONFIG_X86_64 static struct aead_alg aesni_aead_algs[] = { { @@ -1401,9 +1076,27 @@ static const struct x86_cpu_id aesni_cpu_id[] = { }; MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); +static void aesni_free_simds(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers) && + aesni_simd_skciphers[i]; i++) + simd_skcipher_free(aesni_simd_skciphers[i]); + + for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2) && + aesni_simd_skciphers2[i].simd; i++) + simd_skcipher_free(aesni_simd_skciphers2[i].simd); +} + static int __init aesni_init(void) { + struct simd_skcipher_alg *simd; + const char *basename; + const char *algname; + const char *drvname; int err; + int i; if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; @@ -1445,13 +1138,48 @@ static int __init aesni_init(void) if (err) goto fpu_exit; + err = crypto_register_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); + if (err) + goto unregister_algs; + err = crypto_register_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); if (err) - goto unregister_algs; + goto unregister_skciphers; + + for (i = 0; i < ARRAY_SIZE(aesni_skciphers); i++) { + algname = aesni_skciphers[i].base.cra_name + 2; + drvname = aesni_skciphers[i].base.cra_driver_name + 2; + basename = aesni_skciphers[i].base.cra_driver_name; + simd = simd_skcipher_create_compat(algname, drvname, basename); + err = PTR_ERR(simd); + if (IS_ERR(simd)) + goto unregister_simds; + + aesni_simd_skciphers[i] = simd; + } - return err; + for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2); i++) { + algname = aesni_simd_skciphers2[i].algname; + drvname = aesni_simd_skciphers2[i].drvname; + basename = aesni_simd_skciphers2[i].basename; + simd = simd_skcipher_create_compat(algname, drvname, basename); + err = PTR_ERR(simd); + if (IS_ERR(simd)) + goto unregister_simds; + aesni_simd_skciphers2[i].simd = simd; + } + + return 0; + +unregister_simds: + aesni_free_simds(); + crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); +unregister_skciphers: + crypto_unregister_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); unregister_algs: crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); fpu_exit: @@ -1461,7 +1189,10 @@ fpu_exit: static void __exit aesni_exit(void) { + aesni_free_simds(); crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); + crypto_unregister_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); crypto_fpu_exit(); diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 0857b1a1de3b..c194d5717ae5 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -48,26 +48,13 @@ #ifdef CONFIG_X86_64 /* * use carryless multiply version of crc32c when buffer - * size is >= 512 (when eager fpu is enabled) or - * >= 1024 (when eager fpu is disabled) to account + * size is >= 512 to account * for fpu state save/restore overhead. */ -#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512 -#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU 1024 +#define CRC32C_PCL_BREAKEVEN 512 asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, unsigned int crc_init); -static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU; -#if defined(X86_FEATURE_EAGER_FPU) -#define set_pcl_breakeven_point() \ -do { \ - if (!use_eager_fpu()) \ - crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \ -} while (0) -#else -#define set_pcl_breakeven_point() \ - (crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU) -#endif #endif /* CONFIG_X86_64 */ static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) @@ -190,7 +177,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, * use faster PCL version if datasize is large enough to * overcome kernel fpu state save/restore overhead */ - if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { kernel_fpu_begin(); *crcp = crc_pcl(data, len, *crcp); kernel_fpu_end(); @@ -202,7 +189,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) { - if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { kernel_fpu_begin(); *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); kernel_fpu_end(); @@ -261,7 +248,6 @@ static int __init crc32c_intel_mod_init(void) alg.update = crc32c_pcl_intel_update; alg.finup = crc32c_pcl_intel_finup; alg.digest = crc32c_pcl_intel_digest; - set_pcl_breakeven_point(); } #endif return crypto_register_shash(&alg); diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c index e7d679e2a018..406680476c52 100644 --- a/arch/x86/crypto/fpu.c +++ b/arch/x86/crypto/fpu.c @@ -11,143 +11,186 @@ * */ -#include <crypto/algapi.h> +#include <crypto/internal/skcipher.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> -#include <linux/crypto.h> #include <asm/fpu/api.h> struct crypto_fpu_ctx { - struct crypto_blkcipher *child; + struct crypto_skcipher *child; }; -static int crypto_fpu_setkey(struct crypto_tfm *parent, const u8 *key, +static int crypto_fpu_setkey(struct crypto_skcipher *parent, const u8 *key, unsigned int keylen) { - struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(parent); - struct crypto_blkcipher *child = ctx->child; + struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(parent); + struct crypto_skcipher *child = ctx->child; int err; - crypto_blkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); - crypto_blkcipher_set_flags(child, crypto_tfm_get_flags(parent) & - CRYPTO_TFM_REQ_MASK); - err = crypto_blkcipher_setkey(child, key, keylen); - crypto_tfm_set_flags(parent, crypto_blkcipher_get_flags(child) & - CRYPTO_TFM_RES_MASK); + crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) & + CRYPTO_TFM_REQ_MASK); + err = crypto_skcipher_setkey(child, key, keylen); + crypto_skcipher_set_flags(parent, crypto_skcipher_get_flags(child) & + CRYPTO_TFM_RES_MASK); return err; } -static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int crypto_fpu_encrypt(struct skcipher_request *req) { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_skcipher *child = ctx->child; + SKCIPHER_REQUEST_ON_STACK(subreq, child); int err; - struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm); - struct crypto_blkcipher *child = ctx->child; - struct blkcipher_desc desc = { - .tfm = child, - .info = desc_in->info, - .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, - }; + + skcipher_request_set_tfm(subreq, child); + skcipher_request_set_callback(subreq, 0, NULL, NULL); + skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, + req->iv); kernel_fpu_begin(); - err = crypto_blkcipher_crt(desc.tfm)->encrypt(&desc, dst, src, nbytes); + err = crypto_skcipher_encrypt(subreq); kernel_fpu_end(); + + skcipher_request_zero(subreq); return err; } -static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) +static int crypto_fpu_decrypt(struct skcipher_request *req) { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_skcipher *child = ctx->child; + SKCIPHER_REQUEST_ON_STACK(subreq, child); int err; - struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm); - struct crypto_blkcipher *child = ctx->child; - struct blkcipher_desc desc = { - .tfm = child, - .info = desc_in->info, - .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, - }; + + skcipher_request_set_tfm(subreq, child); + skcipher_request_set_callback(subreq, 0, NULL, NULL); + skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, + req->iv); kernel_fpu_begin(); - err = crypto_blkcipher_crt(desc.tfm)->decrypt(&desc, dst, src, nbytes); + err = crypto_skcipher_decrypt(subreq); kernel_fpu_end(); + + skcipher_request_zero(subreq); return err; } -static int crypto_fpu_init_tfm(struct crypto_tfm *tfm) +static int crypto_fpu_init_tfm(struct crypto_skcipher *tfm) { - struct crypto_instance *inst = crypto_tfm_alg_instance(tfm); - struct crypto_spawn *spawn = crypto_instance_ctx(inst); - struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm); - struct crypto_blkcipher *cipher; + struct skcipher_instance *inst = skcipher_alg_instance(tfm); + struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_skcipher_spawn *spawn; + struct crypto_skcipher *cipher; - cipher = crypto_spawn_blkcipher(spawn); + spawn = skcipher_instance_ctx(inst); + cipher = crypto_spawn_skcipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; + return 0; } -static void crypto_fpu_exit_tfm(struct crypto_tfm *tfm) +static void crypto_fpu_exit_tfm(struct crypto_skcipher *tfm) +{ + struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm); + + crypto_free_skcipher(ctx->child); +} + +static void crypto_fpu_free(struct skcipher_instance *inst) { - struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm); - crypto_free_blkcipher(ctx->child); + crypto_drop_skcipher(skcipher_instance_ctx(inst)); + kfree(inst); } -static struct crypto_instance *crypto_fpu_alloc(struct rtattr **tb) +static int crypto_fpu_create(struct crypto_template *tmpl, struct rtattr **tb) { - struct crypto_instance *inst; - struct crypto_alg *alg; + struct crypto_skcipher_spawn *spawn; + struct skcipher_instance *inst; + struct crypto_attr_type *algt; + struct skcipher_alg *alg; + const char *cipher_name; int err; - err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER); + algt = crypto_get_attr_type(tb); + if (IS_ERR(algt)) + return PTR_ERR(algt); + + if ((algt->type ^ (CRYPTO_ALG_INTERNAL | CRYPTO_ALG_TYPE_SKCIPHER)) & + algt->mask) + return -EINVAL; + + if (!(algt->mask & CRYPTO_ALG_INTERNAL)) + return -EINVAL; + + cipher_name = crypto_attr_alg_name(tb[1]); + if (IS_ERR(cipher_name)) + return PTR_ERR(cipher_name); + + inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); + if (!inst) + return -ENOMEM; + + spawn = skcipher_instance_ctx(inst); + + crypto_set_skcipher_spawn(spawn, skcipher_crypto_instance(inst)); + err = crypto_grab_skcipher(spawn, cipher_name, CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL | CRYPTO_ALG_ASYNC); if (err) - return ERR_PTR(err); - - alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_BLKCIPHER, - CRYPTO_ALG_TYPE_MASK); - if (IS_ERR(alg)) - return ERR_CAST(alg); - - inst = crypto_alloc_instance("fpu", alg); - if (IS_ERR(inst)) - goto out_put_alg; - - inst->alg.cra_flags = alg->cra_flags; - inst->alg.cra_priority = alg->cra_priority; - inst->alg.cra_blocksize = alg->cra_blocksize; - inst->alg.cra_alignmask = alg->cra_alignmask; - inst->alg.cra_type = alg->cra_type; - inst->alg.cra_blkcipher.ivsize = alg->cra_blkcipher.ivsize; - inst->alg.cra_blkcipher.min_keysize = alg->cra_blkcipher.min_keysize; - inst->alg.cra_blkcipher.max_keysize = alg->cra_blkcipher.max_keysize; - inst->alg.cra_ctxsize = sizeof(struct crypto_fpu_ctx); - inst->alg.cra_init = crypto_fpu_init_tfm; - inst->alg.cra_exit = crypto_fpu_exit_tfm; - inst->alg.cra_blkcipher.setkey = crypto_fpu_setkey; - inst->alg.cra_blkcipher.encrypt = crypto_fpu_encrypt; - inst->alg.cra_blkcipher.decrypt = crypto_fpu_decrypt; - -out_put_alg: - crypto_mod_put(alg); - return inst; -} + goto out_free_inst; -static void crypto_fpu_free(struct crypto_instance *inst) -{ - crypto_drop_spawn(crypto_instance_ctx(inst)); + alg = crypto_skcipher_spawn_alg(spawn); + + err = crypto_inst_setname(skcipher_crypto_instance(inst), "fpu", + &alg->base); + if (err) + goto out_drop_skcipher; + + inst->alg.base.cra_flags = CRYPTO_ALG_INTERNAL; + inst->alg.base.cra_priority = alg->base.cra_priority; + inst->alg.base.cra_blocksize = alg->base.cra_blocksize; + inst->alg.base.cra_alignmask = alg->base.cra_alignmask; + + inst->alg.ivsize = crypto_skcipher_alg_ivsize(alg); + inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(alg); + inst->alg.max_keysize = crypto_skcipher_alg_max_keysize(alg); + + inst->alg.base.cra_ctxsize = sizeof(struct crypto_fpu_ctx); + + inst->alg.init = crypto_fpu_init_tfm; + inst->alg.exit = crypto_fpu_exit_tfm; + + inst->alg.setkey = crypto_fpu_setkey; + inst->alg.encrypt = crypto_fpu_encrypt; + inst->alg.decrypt = crypto_fpu_decrypt; + + inst->free = crypto_fpu_free; + + err = skcipher_register_instance(tmpl, inst); + if (err) + goto out_drop_skcipher; + +out: + return err; + +out_drop_skcipher: + crypto_drop_skcipher(spawn); +out_free_inst: kfree(inst); + goto out; } static struct crypto_template crypto_fpu_tmpl = { .name = "fpu", - .alloc = crypto_fpu_alloc, - .free = crypto_fpu_free, + .create = crypto_fpu_create, .module = THIS_MODULE, }; diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 6a85598931b5..260a060d7275 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -27,10 +27,10 @@ #include <linux/module.h> #include <crypto/b128ops.h> +#include <crypto/internal/skcipher.h> #include <crypto/lrw.h> #include <crypto/xts.h> #include <asm/crypto/glue_helper.h> -#include <crypto/scatterwalk.h> static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, struct blkcipher_desc *desc, @@ -339,6 +339,41 @@ done: return nbytes; } +static unsigned int __glue_xts_req_128bit(const struct common_glue_ctx *gctx, + void *ctx, + struct skcipher_walk *walk) +{ + const unsigned int bsize = 128 / 8; + unsigned int nbytes = walk->nbytes; + u128 *src = walk->src.virt.addr; + u128 *dst = walk->dst.virt.addr; + unsigned int num_blocks, func_bytes; + unsigned int i; + + /* Process multi-block batch */ + for (i = 0; i < gctx->num_funcs; i++) { + num_blocks = gctx->funcs[i].num_blocks; + func_bytes = bsize * num_blocks; + + if (nbytes >= func_bytes) { + do { + gctx->funcs[i].fn_u.xts(ctx, dst, src, + walk->iv); + + src += num_blocks; + dst += num_blocks; + nbytes -= func_bytes; + } while (nbytes >= func_bytes); + + if (nbytes < bsize) + goto done; + } + } + +done: + return nbytes; +} + /* for implementations implementing faster XTS IV generator */ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, struct blkcipher_desc *desc, struct scatterlist *dst, @@ -379,6 +414,43 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, } EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit); +int glue_xts_req_128bit(const struct common_glue_ctx *gctx, + struct skcipher_request *req, + common_glue_func_t tweak_fn, void *tweak_ctx, + void *crypt_ctx) +{ + const unsigned int bsize = 128 / 8; + struct skcipher_walk walk; + bool fpu_enabled = false; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + nbytes = walk.nbytes; + if (!nbytes) + return err; + + /* set minimum length to bsize, for tweak_fn */ + fpu_enabled = glue_skwalk_fpu_begin(bsize, gctx->fpu_blocks_limit, + &walk, fpu_enabled, + nbytes < bsize ? bsize : nbytes); + + /* calculate first value of T */ + tweak_fn(tweak_ctx, walk.iv, walk.iv); + + while (nbytes) { + nbytes = __glue_xts_req_128bit(gctx, crypt_ctx, &walk); + + err = skcipher_walk_done(&walk, nbytes); + nbytes = walk.nbytes; + } + + glue_fpu_end(fpu_enabled); + + return err; +} +EXPORT_SYMBOL_GPL(glue_xts_req_128bit); + void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, common_glue_func_t fn) { diff --git a/arch/x86/crypto/sha1-mb/sha1_mb.c b/arch/x86/crypto/sha1-mb/sha1_mb.c index 9e5b67127a09..acf9fdf01671 100644 --- a/arch/x86/crypto/sha1-mb/sha1_mb.c +++ b/arch/x86/crypto/sha1-mb/sha1_mb.c @@ -114,7 +114,7 @@ static inline void sha1_init_digest(uint32_t *digest) } static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], - uint32_t total_len) + uint64_t total_len) { uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1); diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h index 98a35bcc6f4a..13590ccf965c 100644 --- a/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h +++ b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h @@ -125,7 +125,7 @@ struct sha1_hash_ctx { /* error flag */ int error; - uint32_t total_length; + uint64_t total_length; const void *incoming_buffer; uint32_t incoming_buffer_length; uint8_t partial_block_buffer[SHA1_BLOCK_SIZE * 2]; diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c index 6f97fb33ae21..7926a226b120 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb.c +++ b/arch/x86/crypto/sha256-mb/sha256_mb.c @@ -115,7 +115,7 @@ inline void sha256_init_digest(uint32_t *digest) } inline uint32_t sha256_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], - uint32_t total_len) + uint64_t total_len) { uint32_t i = total_len & (SHA256_BLOCK_SIZE - 1); diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h index edd252b73206..aabb30320af0 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h +++ b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h @@ -125,7 +125,7 @@ struct sha256_hash_ctx { /* error flag */ int error; - uint32_t total_length; + uint64_t total_length; const void *incoming_buffer; uint32_t incoming_buffer_length; uint8_t partial_block_buffer[SHA256_BLOCK_SIZE * 2]; diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c index d210174a52b0..9c1bb6d58141 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb.c +++ b/arch/x86/crypto/sha512-mb/sha512_mb.c @@ -117,7 +117,7 @@ inline void sha512_init_digest(uint64_t *digest) } inline uint32_t sha512_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], - uint32_t total_len) + uint64_t total_len) { uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1); diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h index 9d4b2c8208d5..e4653f5eec3f 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h +++ b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h @@ -119,7 +119,7 @@ struct sha512_hash_ctx { /* error flag */ int error; - uint32_t total_length; + uint64_t total_length; const void *incoming_buffer; uint32_t incoming_buffer_length; uint8_t partial_block_buffer[SHA512_BLOCK_SIZE * 2]; diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 77f28ce9c646..9976fcecd17e 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -5,8 +5,8 @@ OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y -CFLAGS_syscall_64.o += -Wno-override-init -CFLAGS_syscall_32.o += -Wno-override-init +CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) +CFLAGS_syscall_32.o += $(call cc-option,-Wno-override-init,) obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += common.o diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 9a9e5884066c..05ed3d393da7 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -90,8 +90,8 @@ For 32-bit we have the following conventions - kernel is built with #define SIZEOF_PTREGS 21*8 - .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 - addq $-(15*8+\addskip), %rsp + .macro ALLOC_PT_GPREGS_ON_STACK + addq $-(15*8), %rsp .endm .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 @@ -147,15 +147,6 @@ For 32-bit we have the following conventions - kernel is built with movq 5*8+\offset(%rsp), %rbx .endm - .macro ZERO_EXTRA_REGS - xorl %r15d, %r15d - xorl %r14d, %r14d - xorl %r13d, %r13d - xorl %r12d, %r12d - xorl %ebp, %ebp - xorl %ebx, %ebx - .endm - .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 .if \rstor_r11 movq 6*8(%rsp), %r11 @@ -201,6 +192,26 @@ For 32-bit we have the following conventions - kernel is built with .byte 0xf1 .endm +/* + * This is a sneaky trick to help the unwinder find pt_regs on the stack. The + * frame pointer is replaced with an encoded pointer to pt_regs. The encoding + * is just setting the LSB, which makes it an invalid stack address and is also + * a signal to the unwinder that it's a pt_regs pointer in disguise. + * + * NOTE: This macro must be used *after* SAVE_EXTRA_REGS because it corrupts + * the original rbp. + */ +.macro ENCODE_FRAME_POINTER ptregs_offset=0 +#ifdef CONFIG_FRAME_POINTER + .if \ptregs_offset + leaq \ptregs_offset(%rsp), %rbp + .else + mov %rsp, %rbp + .endif + orq $0x1, %rbp +#endif +.endm + #endif /* CONFIG_X86_64 */ /* diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 21b352a11b49..701d29f8e4d3 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -45,6 +45,7 @@ #include <asm/asm.h> #include <asm/smap.h> #include <asm/export.h> +#include <asm/frame.h> .section .entry.text, "ax" @@ -175,6 +176,22 @@ SET_KERNEL_GS %edx .endm +/* + * This is a sneaky trick to help the unwinder find pt_regs on the stack. The + * frame pointer is replaced with an encoded pointer to pt_regs. The encoding + * is just setting the LSB, which makes it an invalid stack address and is also + * a signal to the unwinder that it's a pt_regs pointer in disguise. + * + * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the + * original rbp. + */ +.macro ENCODE_FRAME_POINTER +#ifdef CONFIG_FRAME_POINTER + mov %esp, %ebp + orl $0x1, %ebp +#endif +.endm + .macro RESTORE_INT_REGS popl %ebx popl %ecx @@ -238,6 +255,23 @@ ENTRY(__switch_to_asm) END(__switch_to_asm) /* + * The unwinder expects the last frame on the stack to always be at the same + * offset from the end of the page, which allows it to validate the stack. + * Calling schedule_tail() directly would break that convention because its an + * asmlinkage function so its argument has to be pushed on the stack. This + * wrapper creates a proper "end of stack" frame header before the call. + */ +ENTRY(schedule_tail_wrapper) + FRAME_BEGIN + + pushl %eax + call schedule_tail + popl %eax + + FRAME_END + ret +ENDPROC(schedule_tail_wrapper) +/* * A newly forked process directly context switches into this address. * * eax: prev task we switched from @@ -245,9 +279,7 @@ END(__switch_to_asm) * edi: kernel thread arg */ ENTRY(ret_from_fork) - pushl %eax - call schedule_tail - popl %eax + call schedule_tail_wrapper testl %ebx, %ebx jnz 1f /* kernel threads are uncommon */ @@ -307,13 +339,13 @@ END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) -need_resched: +.Lneed_resched: cmpl $0, PER_CPU_VAR(__preempt_count) jnz restore_all testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? jz restore_all call preempt_schedule_irq - jmp need_resched + jmp .Lneed_resched END(resume_kernel) #endif @@ -334,7 +366,7 @@ GLOBAL(__begin_SYSENTER_singlestep_region) */ ENTRY(xen_sysenter_target) addl $5*4, %esp /* remove xen-provided frame */ - jmp sysenter_past_esp + jmp .Lsysenter_past_esp #endif /* @@ -371,7 +403,7 @@ ENTRY(xen_sysenter_target) */ ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp), %esp -sysenter_past_esp: +.Lsysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ pushl %ebp /* pt_regs->sp (stashed in bp) */ pushfl /* pt_regs->flags (except IF = 0) */ @@ -504,9 +536,9 @@ ENTRY(entry_INT80_32) restore_all: TRACE_IRQS_IRET -restore_all_notrace: +.Lrestore_all_notrace: #ifdef CONFIG_X86_ESPFIX32 - ALTERNATIVE "jmp restore_nocheck", "", X86_BUG_ESPFIX + ALTERNATIVE "jmp .Lrestore_nocheck", "", X86_BUG_ESPFIX movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS /* @@ -518,22 +550,23 @@ restore_all_notrace: movb PT_CS(%esp), %al andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - je ldt_ss # returning to user-space with LDT SS + je .Lldt_ss # returning to user-space with LDT SS #endif -restore_nocheck: +.Lrestore_nocheck: RESTORE_REGS 4 # skip orig_eax/error_code -irq_return: +.Lirq_return: INTERRUPT_RETURN + .section .fixup, "ax" ENTRY(iret_exc ) pushl $0 # no error code pushl $do_iret_error - jmp error_code + jmp common_exception .previous - _ASM_EXTABLE(irq_return, iret_exc) + _ASM_EXTABLE(.Lirq_return, iret_exc) #ifdef CONFIG_X86_ESPFIX32 -ldt_ss: +.Lldt_ss: /* * Setup and switch to ESPFIX stack * @@ -562,7 +595,7 @@ ldt_ss: */ DISABLE_INTERRUPTS(CLBR_EAX) lss (%esp), %esp /* switch to espfix segment */ - jmp restore_nocheck + jmp .Lrestore_nocheck #endif ENDPROC(entry_INT80_32) @@ -624,6 +657,7 @@ common_interrupt: ASM_CLAC addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ SAVE_ALL + ENCODE_FRAME_POINTER TRACE_IRQS_OFF movl %esp, %eax call do_IRQ @@ -635,6 +669,7 @@ ENTRY(name) \ ASM_CLAC; \ pushl $~(nr); \ SAVE_ALL; \ + ENCODE_FRAME_POINTER; \ TRACE_IRQS_OFF \ movl %esp, %eax; \ call fn; \ @@ -659,7 +694,7 @@ ENTRY(coprocessor_error) ASM_CLAC pushl $0 pushl $do_coprocessor_error - jmp error_code + jmp common_exception END(coprocessor_error) ENTRY(simd_coprocessor_error) @@ -673,14 +708,14 @@ ENTRY(simd_coprocessor_error) #else pushl $do_simd_coprocessor_error #endif - jmp error_code + jmp common_exception END(simd_coprocessor_error) ENTRY(device_not_available) ASM_CLAC pushl $-1 # mark this as an int pushl $do_device_not_available - jmp error_code + jmp common_exception END(device_not_available) #ifdef CONFIG_PARAVIRT @@ -694,59 +729,59 @@ ENTRY(overflow) ASM_CLAC pushl $0 pushl $do_overflow - jmp error_code + jmp common_exception END(overflow) ENTRY(bounds) ASM_CLAC pushl $0 pushl $do_bounds - jmp error_code + jmp common_exception END(bounds) ENTRY(invalid_op) ASM_CLAC pushl $0 pushl $do_invalid_op - jmp error_code + jmp common_exception END(invalid_op) ENTRY(coprocessor_segment_overrun) ASM_CLAC pushl $0 pushl $do_coprocessor_segment_overrun - jmp error_code + jmp common_exception END(coprocessor_segment_overrun) ENTRY(invalid_TSS) ASM_CLAC pushl $do_invalid_TSS - jmp error_code + jmp common_exception END(invalid_TSS) ENTRY(segment_not_present) ASM_CLAC pushl $do_segment_not_present - jmp error_code + jmp common_exception END(segment_not_present) ENTRY(stack_segment) ASM_CLAC pushl $do_stack_segment - jmp error_code + jmp common_exception END(stack_segment) ENTRY(alignment_check) ASM_CLAC pushl $do_alignment_check - jmp error_code + jmp common_exception END(alignment_check) ENTRY(divide_error) ASM_CLAC pushl $0 # no error code pushl $do_divide_error - jmp error_code + jmp common_exception END(divide_error) #ifdef CONFIG_X86_MCE @@ -754,7 +789,7 @@ ENTRY(machine_check) ASM_CLAC pushl $0 pushl machine_check_vector - jmp error_code + jmp common_exception END(machine_check) #endif @@ -762,13 +797,14 @@ ENTRY(spurious_interrupt_bug) ASM_CLAC pushl $0 pushl $do_spurious_interrupt_bug - jmp error_code + jmp common_exception END(spurious_interrupt_bug) #ifdef CONFIG_XEN ENTRY(xen_hypervisor_callback) pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL + ENCODE_FRAME_POINTER TRACE_IRQS_OFF /* @@ -823,6 +859,7 @@ ENTRY(xen_failsafe_callback) jmp iret_exc 5: pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL + ENCODE_FRAME_POINTER jmp ret_from_exception .section .fixup, "ax" @@ -882,15 +919,15 @@ ftrace_call: popl %edx popl %ecx popl %eax -ftrace_ret: +.Lftrace_ret: #ifdef CONFIG_FUNCTION_GRAPH_TRACER .globl ftrace_graph_call ftrace_graph_call: jmp ftrace_stub #endif -.globl ftrace_stub -ftrace_stub: +/* This is weak to keep gas from relaxing the jumps */ +WEAK(ftrace_stub) ret END(ftrace_caller) @@ -952,7 +989,7 @@ GLOBAL(ftrace_regs_call) popl %gs addl $8, %esp /* Skip orig_ax and ip */ popf /* Pop flags at end (no addl to corrupt flags) */ - jmp ftrace_ret + jmp .Lftrace_ret popf jmp ftrace_stub @@ -963,7 +1000,7 @@ ENTRY(mcount) jb ftrace_stub /* Paging not enabled yet? */ cmpl $ftrace_stub, ftrace_trace_function - jnz trace + jnz .Ltrace #ifdef CONFIG_FUNCTION_GRAPH_TRACER cmpl $ftrace_stub, ftrace_graph_return jnz ftrace_graph_caller @@ -976,7 +1013,7 @@ ftrace_stub: ret /* taken from glibc */ -trace: +.Ltrace: pushl %eax pushl %ecx pushl %edx @@ -1027,7 +1064,7 @@ return_to_handler: ENTRY(trace_page_fault) ASM_CLAC pushl $trace_do_page_fault - jmp error_code + jmp common_exception END(trace_page_fault) #endif @@ -1035,7 +1072,10 @@ ENTRY(page_fault) ASM_CLAC pushl $do_page_fault ALIGN -error_code: + jmp common_exception +END(page_fault) + +common_exception: /* the function address is in %gs's slot on the stack */ pushl %fs pushl %es @@ -1047,6 +1087,7 @@ error_code: pushl %edx pushl %ecx pushl %ebx + ENCODE_FRAME_POINTER cld movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs @@ -1064,7 +1105,7 @@ error_code: movl %esp, %eax # pt_regs pointer call *%edi jmp ret_from_exception -END(page_fault) +END(common_exception) ENTRY(debug) /* @@ -1079,6 +1120,7 @@ ENTRY(debug) ASM_CLAC pushl $-1 # mark this as an int SAVE_ALL + ENCODE_FRAME_POINTER xorl %edx, %edx # error code 0 movl %esp, %eax # pt_regs pointer @@ -1094,11 +1136,11 @@ ENTRY(debug) .Ldebug_from_sysenter_stack: /* We're on the SYSENTER stack. Switch off. */ - movl %esp, %ebp + movl %esp, %ebx movl PER_CPU_VAR(cpu_current_top_of_stack), %esp TRACE_IRQS_OFF call do_debug - movl %ebp, %esp + movl %ebx, %esp jmp ret_from_exception END(debug) @@ -1116,11 +1158,12 @@ ENTRY(nmi) movl %ss, %eax cmpw $__ESPFIX_SS, %ax popl %eax - je nmi_espfix_stack + je .Lnmi_espfix_stack #endif pushl %eax # pt_regs->orig_ax SAVE_ALL + ENCODE_FRAME_POINTER xorl %edx, %edx # zero error code movl %esp, %eax # pt_regs pointer @@ -1132,21 +1175,21 @@ ENTRY(nmi) /* Not on SYSENTER stack. */ call do_nmi - jmp restore_all_notrace + jmp .Lrestore_all_notrace .Lnmi_from_sysenter_stack: /* * We're on the SYSENTER stack. Switch off. No one (not even debug) * is using the thread stack right now, so it's safe for us to use it. */ - movl %esp, %ebp + movl %esp, %ebx movl PER_CPU_VAR(cpu_current_top_of_stack), %esp call do_nmi - movl %ebp, %esp - jmp restore_all_notrace + movl %ebx, %esp + jmp .Lrestore_all_notrace #ifdef CONFIG_X86_ESPFIX32 -nmi_espfix_stack: +.Lnmi_espfix_stack: /* * create the pointer to lss back */ @@ -1159,12 +1202,13 @@ nmi_espfix_stack: .endr pushl %eax SAVE_ALL + ENCODE_FRAME_POINTER FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx, %edx # zero error code call do_nmi RESTORE_REGS lss 12+4(%esp), %esp # back to espfix stack - jmp irq_return + jmp .Lirq_return #endif END(nmi) @@ -1172,6 +1216,7 @@ ENTRY(int3) ASM_CLAC pushl $-1 # mark this as an int SAVE_ALL + ENCODE_FRAME_POINTER TRACE_IRQS_OFF xorl %edx, %edx # zero error code movl %esp, %eax # pt_regs pointer @@ -1181,14 +1226,14 @@ END(int3) ENTRY(general_protection) pushl $do_general_protection - jmp error_code + jmp common_exception END(general_protection) #ifdef CONFIG_KVM_GUEST ENTRY(async_page_fault) ASM_CLAC pushl $do_async_page_fault - jmp error_code + jmp common_exception END(async_page_fault) #endif diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ef766a358b37..5b219707c2f2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -38,12 +38,6 @@ #include <asm/export.h> #include <linux/err.h> -/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ -#include <linux/elf-em.h> -#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_64BIT 0x80000000 -#define __AUDIT_ARCH_LE 0x40000000 - .code64 .section .entry.text, "ax" @@ -469,6 +463,7 @@ END(irq_entries_start) ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS + ENCODE_FRAME_POINTER testb $3, CS(%rsp) jz 1f @@ -985,6 +980,7 @@ ENTRY(xen_failsafe_callback) ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS + ENCODE_FRAME_POINTER jmp error_exit END(xen_failsafe_callback) @@ -1028,6 +1024,7 @@ ENTRY(paranoid_entry) cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 + ENCODE_FRAME_POINTER 8 movl $1, %ebx movl $MSR_GS_BASE, %ecx rdmsr @@ -1075,6 +1072,7 @@ ENTRY(error_entry) cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 + ENCODE_FRAME_POINTER 8 xorl %ebx, %ebx testb $3, CS+8(%rsp) jz .Lerror_kernelspace @@ -1257,6 +1255,7 @@ ENTRY(nmi) pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + ENCODE_FRAME_POINTER /* * At this point we no longer need to worry about stack damage @@ -1270,11 +1269,10 @@ ENTRY(nmi) /* * Return back to user mode. We must *not* do the normal exit - * work, because we don't want to enable interrupts. Fortunately, - * do_nmi doesn't modify pt_regs. + * work, because we don't want to enable interrupts. */ SWAPGS - jmp restore_c_regs_and_iret + jmp restore_regs_and_iret .Lnmi_from_kernel: /* diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 23c881caabd1..40121d14d34d 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -109,7 +109,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, return VM_FAULT_SIGBUS; if (sym_offset == image->sym_vvar_page) { - ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, + ret = vm_insert_pfn(vma, vmf->address, __pa_symbol(&__vvar_page) >> PAGE_SHIFT); } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = @@ -117,7 +117,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { ret = vm_insert_pfn( vma, - (unsigned long)vmf->virtual_address, + vmf->address, __pa(pvti) >> PAGE_SHIFT); } } @@ -161,8 +161,6 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) } text_start = addr - image->sym_vvar_start; - current->mm->context.vdso = (void __user *)text_start; - current->mm->context.vdso_image = image; /* * MAYWRITE to allow gdb to COW and set breakpoints @@ -189,14 +187,12 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) if (IS_ERR(vma)) { ret = PTR_ERR(vma); do_munmap(mm, text_start, image->size); + } else { + current->mm->context.vdso = (void __user *)text_start; + current->mm->context.vdso_image = image; } up_fail: - if (ret) { - current->mm->context.vdso = NULL; - current->mm->context.vdso_image = NULL; - } - up_write(&mm->mmap_sem); return ret; } diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index f5f4b3fbbbc2..afb222b63cae 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -662,7 +662,13 @@ static int __init amd_core_pmu_init(void) pr_cont("Fam15h "); x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; break; - + case 0x17: + pr_cont("Fam17h "); + /* + * In family 17h, there are no event constraints in the PMC hardware. + * We fallback to using default amd_get_event_constraints. + */ + break; default: pr_err("core perfctr but no constraints; unknown hardware!\n"); return -ENODEV; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index d31735f37ed7..f1c22584a46f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -69,7 +69,7 @@ u64 x86_perf_event_update(struct perf_event *event) int shift = 64 - x86_pmu.cntval_bits; u64 prev_raw_count, new_raw_count; int idx = hwc->idx; - s64 delta; + u64 delta; if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; @@ -365,7 +365,11 @@ int x86_add_exclusive(unsigned int what) { int i; - if (x86_pmu.lbr_pt_coexist) + /* + * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS. + * LBR and BTS are still mutually exclusive. + */ + if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) return 0; if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { @@ -388,7 +392,7 @@ fail_unlock: void x86_del_exclusive(unsigned int what) { - if (x86_pmu.lbr_pt_coexist) + if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) return; atomic_dec(&x86_pmu.lbr_exclusive[what]); @@ -2299,7 +2303,7 @@ valid_user_frame(const void __user *fp, unsigned long size) static unsigned long get_segment_base(unsigned int segment) { struct desc_struct *desc; - int idx = segment >> 3; + unsigned int idx = segment >> 3; if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { #ifdef CONFIG_MODIFY_LDT_SYSCALL @@ -2352,7 +2356,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent frame.next_frame = 0; frame.return_address = 0; - if (!access_ok(VERIFY_READ, fp, 8)) + if (!valid_user_frame(fp, sizeof(frame))) break; bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4); @@ -2362,9 +2366,6 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent if (bytes != 0) break; - if (!valid_user_frame(fp, sizeof(frame))) - break; - perf_callchain_store(entry, cs_base + frame.return_address); fp = compat_ptr(ss_base + frame.next_frame); } @@ -2413,7 +2414,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs frame.next_frame = NULL; frame.return_address = 0; - if (!access_ok(VERIFY_READ, fp, sizeof(*fp) * 2)) + if (!valid_user_frame(fp, sizeof(frame))) break; bytes = __copy_from_user_nmi(&frame.next_frame, fp, sizeof(*fp)); @@ -2423,9 +2424,6 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs if (bytes != 0) break; - if (!valid_user_frame(fp, sizeof(frame))) - break; - perf_callchain_store(entry, frame.return_address); fp = (void __user *)frame.next_frame; } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index eab0915f5995..cb8522290e6a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3607,10 +3607,14 @@ __init int intel_pmu_init(void) /* * Quirk: v2 perfmon does not report fixed-purpose events, so - * assume at least 3 events: + * assume at least 3 events, when not running in a hypervisor: */ - if (version > 1) - x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); + if (version > 1) { + int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR); + + x86_pmu.num_counters_fixed = + max((int)edx.split.num_counters_fixed, assume); + } if (boot_cpu_has(X86_FEATURE_PDCM)) { u64 capabilities; @@ -4030,7 +4034,7 @@ __init int intel_pmu_init(void) /* Support full width counters using alternative MSR range */ if (x86_pmu.intel_cap.full_width_write) { - x86_pmu.max_period = x86_pmu.cntval_mask; + x86_pmu.max_period = x86_pmu.cntval_mask >> 1; x86_pmu.perfctr = MSR_IA32_PMC0; pr_cont("full-width counters, "); } diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 3ca87b5a8677..da51e5a3e2ff 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -48,7 +48,8 @@ * Scope: Core * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter * perf code: 0x02 - * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW + * SKL,KNL * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 @@ -56,15 +57,16 @@ * Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 - * Available model: SNB,IVB,HSW,BDW,SKL + * Available model: SNB,IVB,HSW,BDW,SKL,KNL * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 - * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL * Scope: Package (physical package) * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. * perf code: 0x02 - * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW + * SKL,KNL * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 @@ -118,6 +120,7 @@ struct cstate_model { /* Quirk flags */ #define SLM_PKG_C6_USE_C7_MSR (1UL << 0) +#define KNL_CORE_C6_MSR (1UL << 1) struct perf_cstate_msr { u64 msr; @@ -488,6 +491,18 @@ static const struct cstate_model slm_cstates __initconst = { .quirks = SLM_PKG_C6_USE_C7_MSR, }; + +static const struct cstate_model knl_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C6_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES), + .quirks = KNL_CORE_C6_MSR, +}; + + + #define X86_CSTATES_MODEL(model, states) \ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long) &(states) } @@ -523,6 +538,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_MOBILE, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates), + + X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNL, knl_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); @@ -558,6 +576,11 @@ static int __init cstate_probe(const struct cstate_model *cm) if (cm->quirks & SLM_PKG_C6_USE_C7_MSR) pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY; + /* KNL has different MSR for CORE C6 */ + if (cm->quirks & KNL_CORE_C6_MSR) + pkg_msr[PERF_CSTATE_CORE_C6_RES].msr = MSR_KNL_CORE_C6_RESIDENCY; + + has_cstate_core = cstate_probe_msr(cm->core_events, PERF_CSTATE_CORE_EVENT_MAX, core_msr, core_events_attrs); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 0319311dbdbb..be202390bbd3 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1108,20 +1108,20 @@ static void setup_pebs_sample_data(struct perf_event *event, } /* - * We use the interrupt regs as a base because the PEBS record - * does not contain a full regs set, specifically it seems to - * lack segment descriptors, which get used by things like - * user_mode(). + * We use the interrupt regs as a base because the PEBS record does not + * contain a full regs set, specifically it seems to lack segment + * descriptors, which get used by things like user_mode(). * - * In the simple case fix up only the IP and BP,SP regs, for - * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly. - * A possible PERF_SAMPLE_REGS will have to transfer all regs. + * In the simple case fix up only the IP for PERF_SAMPLE_IP. + * + * We must however always use BP,SP from iregs for the unwinder to stay + * sane; the record BP,SP can point into thin air when the record is + * from a previous PMI context or an (I)RET happend between the record + * and PMI. */ *regs = *iregs; regs->flags = pebs->flags; set_linear_ip(regs, pebs->ip); - regs->bp = pebs->bp; - regs->sp = pebs->sp; if (sample_type & PERF_SAMPLE_REGS_INTR) { regs->ax = pebs->ax; @@ -1130,10 +1130,21 @@ static void setup_pebs_sample_data(struct perf_event *event, regs->dx = pebs->dx; regs->si = pebs->si; regs->di = pebs->di; - regs->bp = pebs->bp; - regs->sp = pebs->sp; - regs->flags = pebs->flags; + /* + * Per the above; only set BP,SP if we don't need callchains. + * + * XXX: does this make sense? + */ + if (!(sample_type & PERF_SAMPLE_CALLCHAIN)) { + regs->bp = pebs->bp; + regs->sp = pebs->sp; + } + + /* + * Preserve PERF_EFLAGS_VM from set_linear_ip(). + */ + regs->flags = pebs->flags | (regs->flags & PERF_EFLAGS_VM); #ifndef CONFIG_X86_32 regs->r8 = pebs->r8; regs->r9 = pebs->r9; diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index c5047b8f777b..1c1b9fe705c8 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -36,13 +36,6 @@ static DEFINE_PER_CPU(struct pt, pt_ctx); static struct pt_pmu pt_pmu; -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - CR_EDX, - CR_EBX -}; - /* * Capabilities of Intel PT hardware, such as number of address bits or * supported output schemes, are cached and exported to userspace as "caps" @@ -64,21 +57,21 @@ static struct pt_cap_desc { u8 reg; u32 mask; } pt_caps[] = { - PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), - PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), - PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), - PT_CAP(ip_filtering, 0, CR_EBX, BIT(2)), - PT_CAP(mtc, 0, CR_EBX, BIT(3)), - PT_CAP(ptwrite, 0, CR_EBX, BIT(4)), - PT_CAP(power_event_trace, 0, CR_EBX, BIT(5)), - PT_CAP(topa_output, 0, CR_ECX, BIT(0)), - PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), - PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), - PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), - PT_CAP(num_address_ranges, 1, CR_EAX, 0x3), - PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000), - PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff), - PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000), + PT_CAP(max_subleaf, 0, CPUID_EAX, 0xffffffff), + PT_CAP(cr3_filtering, 0, CPUID_EBX, BIT(0)), + PT_CAP(psb_cyc, 0, CPUID_EBX, BIT(1)), + PT_CAP(ip_filtering, 0, CPUID_EBX, BIT(2)), + PT_CAP(mtc, 0, CPUID_EBX, BIT(3)), + PT_CAP(ptwrite, 0, CPUID_EBX, BIT(4)), + PT_CAP(power_event_trace, 0, CPUID_EBX, BIT(5)), + PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), + PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), + PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), + PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)), + PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3), + PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000), + PT_CAP(cycle_thresholds, 1, CPUID_EBX, 0xffff), + PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000), }; static u32 pt_cap_get(enum pt_capabilities cap) @@ -213,10 +206,10 @@ static int __init pt_pmu_hw_init(void) for (i = 0; i < PT_CPUID_LEAVES; i++) { cpuid_count(20, i, - &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]); + &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM]); } ret = -ENOMEM; diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index efca2685d876..dbaaf7dc8373 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -319,9 +319,9 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, */ static int uncore_pmu_event_init(struct perf_event *event); -static bool is_uncore_event(struct perf_event *event) +static bool is_box_event(struct intel_uncore_box *box, struct perf_event *event) { - return event->pmu->event_init == uncore_pmu_event_init; + return &box->pmu->pmu == event->pmu; } static int @@ -340,7 +340,7 @@ uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, n = box->n_events; - if (is_uncore_event(leader)) { + if (is_box_event(box, leader)) { box->event_list[n] = leader; n++; } @@ -349,7 +349,7 @@ uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, return n; list_for_each_entry(event, &leader->sibling_list, group_entry) { - if (!is_uncore_event(event) || + if (!is_box_event(box, event) || event->state <= PERF_EVENT_STATE_OFF) continue; diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 5f845eef9a4d..a3dcc12bef4a 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -8,8 +8,12 @@ #define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 #define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 -#define PCI_DEVICE_ID_INTEL_SKL_IMC 0x191f -#define PCI_DEVICE_ID_INTEL_SKL_U_IMC 0x190c +#define PCI_DEVICE_ID_INTEL_SKL_U_IMC 0x1904 +#define PCI_DEVICE_ID_INTEL_SKL_Y_IMC 0x190c +#define PCI_DEVICE_ID_INTEL_SKL_HD_IMC 0x1900 +#define PCI_DEVICE_ID_INTEL_SKL_HQ_IMC 0x1910 +#define PCI_DEVICE_ID_INTEL_SKL_SD_IMC 0x190f +#define PCI_DEVICE_ID_INTEL_SKL_SQ_IMC 0x191f /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -486,24 +490,12 @@ static int snb_uncore_imc_event_add(struct perf_event *event, int flags) snb_uncore_imc_event_start(event, 0); - box->n_events++; - return 0; } static void snb_uncore_imc_event_del(struct perf_event *event, int flags) { - struct intel_uncore_box *box = uncore_event_to_box(event); - int i; - snb_uncore_imc_event_stop(event, PERF_EF_UPDATE); - - for (i = 0; i < box->n_events; i++) { - if (event == box->event_list[i]) { - --box->n_events; - break; - } - } } int snb_pci2phy_map_init(int devid) @@ -616,13 +608,29 @@ static const struct pci_device_id bdw_uncore_pci_ids[] = { static const struct pci_device_id skl_uncore_pci_ids[] = { { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC), + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_Y_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, { /* IMC */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_U_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_HD_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_HQ_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SD_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SQ_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ }, }; @@ -666,8 +674,12 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */ - IMC_DEV(SKL_IMC, &skl_uncore_pci_driver), /* 6th Gen Core */ + IMC_DEV(SKL_Y_IMC, &skl_uncore_pci_driver), /* 6th Gen Core Y */ IMC_DEV(SKL_U_IMC, &skl_uncore_pci_driver), /* 6th Gen Core U */ + IMC_DEV(SKL_HD_IMC, &skl_uncore_pci_driver), /* 6th Gen Core H Dual Core */ + IMC_DEV(SKL_HQ_IMC, &skl_uncore_pci_driver), /* 6th Gen Core H Quad Core */ + IMC_DEV(SKL_SD_IMC, &skl_uncore_pci_driver), /* 6th Gen Core S Dual Core */ + IMC_DEV(SKL_SQ_IMC, &skl_uncore_pci_driver), /* 6th Gen Core S Quad Core */ { /* end marker */ } }; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 5874d8de1f8d..bcbb1d2ae10b 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -113,7 +113,7 @@ struct debug_store { * Per register state. */ struct er_account { - raw_spinlock_t lock; /* per-core: protect structure */ + raw_spinlock_t lock; /* per-core: protect structure */ u64 config; /* extra MSR config */ u64 reg; /* extra MSR number */ atomic_t ref; /* reference count */ @@ -604,7 +604,7 @@ struct x86_pmu { u64 lbr_sel_mask; /* LBR_SELECT valid bits */ const int *lbr_sel_map; /* lbr_select mappings */ bool lbr_double_abort; /* duplicated lbr aborts */ - bool lbr_pt_coexist; /* LBR may coexist with PT */ + bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ /* * Intel PT/LBR/BTS are exclusive diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 2cfed174e3c9..2b892e2313a9 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -6,10 +6,6 @@ generated-y += unistd_32_ia32.h generated-y += unistd_64_x32.h generated-y += xen-hypercalls.h -genhdr-y += unistd_32.h -genhdr-y += unistd_64.h -genhdr-y += unistd_x32.h - generic-y += clkdev.h generic-y += cputime.h generic-y += dma-contiguous.h diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 5391b0ae7cc3..395b69551fce 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -94,7 +94,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) boot_cpu_data.x86_model <= 0x05 && boot_cpu_data.x86_mask < 0x0A) return 1; - else if (amd_e400_c1e_detected) + else if (boot_cpu_has(X86_BUG_AMD_APIC_C1E)) return 1; else return max_cstate; diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 5e828da2e18f..00c88a01301d 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -21,6 +21,10 @@ extern int amd_numa_init(void); extern int amd_get_subcaches(int); extern int amd_set_subcaches(int, unsigned long); +extern int amd_smn_read(u16 node, u32 address, u32 *value); +extern int amd_smn_write(u16 node, u32 address, u32 value); +extern int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo); + struct amd_l3_cache { unsigned indices; u8 subcaches[4]; @@ -55,6 +59,7 @@ struct threshold_bank { }; struct amd_northbridge { + struct pci_dev *root; struct pci_dev *misc; struct pci_dev *link; struct amd_l3_cache l3_cache; @@ -66,7 +71,6 @@ struct amd_northbridge_info { u64 flags; struct amd_northbridge *nb; }; -extern struct amd_northbridge_info amd_northbridges; #define AMD_NB_GART BIT(0) #define AMD_NB_L3_INDEX_DISABLE BIT(1) @@ -74,20 +78,9 @@ extern struct amd_northbridge_info amd_northbridges; #ifdef CONFIG_AMD_NB -static inline u16 amd_nb_num(void) -{ - return amd_northbridges.num; -} - -static inline bool amd_nb_has_feature(unsigned feature) -{ - return ((amd_northbridges.flags & feature) == feature); -} - -static inline struct amd_northbridge *node_to_amd_nb(int node) -{ - return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL; -} +u16 amd_nb_num(void); +bool amd_nb_has_feature(unsigned int feature); +struct amd_northbridge *node_to_amd_nb(int node); static inline u16 amd_pci_dev_to_node_id(struct pci_dev *pdev) { diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index f5aaf6c83222..0c5fbc68e82d 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -11,7 +11,6 @@ #include <asm/fixmap.h> #include <asm/mpspec.h> #include <asm/msr.h> -#include <asm/idle.h> #define ARCH_APICTIMER_STOPS_ON_C3 1 @@ -196,7 +195,7 @@ static inline void native_apic_msr_write(u32 reg, u32 v) static inline void native_apic_msr_eoi_write(u32 reg, u32 v) { - wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); + wrmsr_notrace(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); } static inline u32 native_apic_msr_read(u32 reg) @@ -332,6 +331,7 @@ struct apic { * on write for EOI. */ void (*eoi_write)(u32 reg, u32 v); + void (*native_eoi_write)(u32 reg, u32 v); u64 (*icr_read)(void); void (*icr_write)(u32 low, u32 high); void (*wait_icr_idle)(void); @@ -639,7 +639,6 @@ extern void irq_exit(void); static inline void entering_irq(void) { irq_enter(); - exit_idle(); } static inline void entering_ack_irq(void) diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h new file mode 100644 index 000000000000..44b8762fa0c7 --- /dev/null +++ b/arch/x86/include/asm/asm-prototypes.h @@ -0,0 +1,16 @@ +#include <asm/ftrace.h> +#include <asm/uaccess.h> +#include <asm/string.h> +#include <asm/page.h> +#include <asm/checksum.h> + +#include <asm-generic/asm-prototypes.h> + +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/special_insns.h> +#include <asm/preempt.h> + +#ifndef CONFIG_X86_CMPXCHG64 +extern void cmpxchg8b_emu(void); +#endif diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 03d269bed941..24118c0b4640 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -272,7 +272,6 @@ struct compat_shmid64_ds { /* * The type of struct elf_prstatus.pr_reg in compatible core dumps. */ -#ifdef CONFIG_X86_X32_ABI typedef struct user_regs_struct compat_elf_gregset_t; /* Full regset -- prstatus on x32, otherwise on ia32 */ @@ -281,10 +280,9 @@ typedef struct user_regs_struct compat_elf_gregset_t; do { *(int *) (((void *) &((S)->pr_reg)) + R) = (V); } \ while (0) +#ifdef CONFIG_X86_X32_ABI #define COMPAT_USE_64BIT_TIME \ (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) -#else -typedef struct user_regs_struct32 compat_elf_gregset_t; #endif /* diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 1d2b69fc0ceb..d59c15c3defd 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -204,6 +204,7 @@ static __always_inline __pure bool _static_cpu_has(u16 bit) #define static_cpu_has_bug(bit) static_cpu_has((bit)) #define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) +#define boot_cpu_set_bug(bit) set_cpu_cap(&boot_cpu_data, (bit)) #define MAX_CPU_FEATURES (NCAPINTS * 32) #define cpu_have_feature boot_cpu_has diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 1188bc849ee3..6ccbf1aaa7ce 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -104,8 +104,8 @@ #define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ -#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ +#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ @@ -193,7 +193,10 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ +#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ @@ -224,6 +227,7 @@ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ #define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ @@ -277,8 +281,10 @@ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ +#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_RDPID (16*32+ 22) /* RDPID instruction */ /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ @@ -309,4 +315,6 @@ #define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ +#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ + #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index 03bb1065c335..29e53ea7d764 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -5,8 +5,8 @@ #ifndef _CRYPTO_GLUE_HELPER_H #define _CRYPTO_GLUE_HELPER_H +#include <crypto/internal/skcipher.h> #include <linux/kernel.h> -#include <linux/crypto.h> #include <asm/fpu/api.h> #include <crypto/b128ops.h> @@ -69,6 +69,31 @@ static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit, return true; } +static inline bool glue_skwalk_fpu_begin(unsigned int bsize, + int fpu_blocks_limit, + struct skcipher_walk *walk, + bool fpu_enabled, unsigned int nbytes) +{ + if (likely(fpu_blocks_limit < 0)) + return false; + + if (fpu_enabled) + return true; + + /* + * Vector-registers are only used when chunk to be processed is large + * enough, so do not enable FPU until it is necessary. + */ + if (nbytes < bsize * (unsigned int)fpu_blocks_limit) + return false; + + /* prevent sleeping if FPU is in use */ + skcipher_walk_atomise(walk); + + kernel_fpu_begin(); + return true; +} + static inline void glue_fpu_end(bool fpu_enabled) { if (fpu_enabled) @@ -139,6 +164,18 @@ extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, common_glue_func_t tweak_fn, void *tweak_ctx, void *crypt_ctx); +extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes, + common_glue_func_t tweak_fn, void *tweak_ctx, + void *crypt_ctx); + +extern int glue_xts_req_128bit(const struct common_glue_ctx *gctx, + struct skcipher_request *req, + common_glue_func_t tweak_fn, void *tweak_ctx, + void *crypt_ctx); + extern void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, common_glue_func_t fn); diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 476b574de99e..ec23d8e1297c 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -1,13 +1,17 @@ #ifndef _ASM_X86_E820_H #define _ASM_X86_E820_H -#ifdef CONFIG_EFI +/* + * E820_X_MAX is the maximum size of the extended E820 table. The extended + * table may contain up to 3 extra E820 entries per possible NUMA node, so we + * make room for 3 * MAX_NUMNODES possible entries, beyond the standard 128. + * Also note that E820_X_MAX *must* be defined before we include uapi/asm/e820.h. + */ #include <linux/numa.h> #define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES) -#else /* ! CONFIG_EFI */ -#define E820_X_MAX E820MAX -#endif + #include <uapi/asm/e820.h> + #ifndef __ASSEMBLY__ /* see comment in arch/x86/kernel/e820.c */ extern struct e820map *e820; diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 389d700b961e..e99675b9c861 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -210,12 +210,18 @@ static inline bool efi_is_64bit(void) return __efi_early()->is64; } +#define efi_table_attr(table, attr, instance) \ + (efi_is_64bit() ? \ + ((table##_64_t *)(unsigned long)instance)->attr : \ + ((table##_32_t *)(unsigned long)instance)->attr) + +#define efi_call_proto(protocol, f, instance, ...) \ + __efi_early()->call(efi_table_attr(protocol, f, instance), \ + instance, ##__VA_ARGS__) + #define efi_call_early(f, ...) \ - __efi_early()->call(efi_is_64bit() ? \ - ((efi_boot_services_64_t *)(unsigned long) \ - __efi_early()->boot_services)->f : \ - ((efi_boot_services_32_t *)(unsigned long) \ - __efi_early()->boot_services)->f, __VA_ARGS__) + __efi_early()->call(efi_table_attr(efi_boot_services, f, \ + __efi_early()->boot_services), __VA_ARGS__) #define __efi_call_early(f, ...) \ __efi_early()->call((unsigned long)f, __VA_ARGS__); diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h index 1c7eefe32502..7ec59edde154 100644 --- a/arch/x86/include/asm/floppy.h +++ b/arch/x86/include/asm/floppy.h @@ -229,18 +229,18 @@ static struct fd_routine_l { int (*_dma_setup)(char *addr, unsigned long size, int mode, int io); } fd_routine[] = { { - request_dma, - free_dma, - get_dma_residue, - dma_mem_alloc, - hard_dma_setup + ._request_dma = request_dma, + ._free_dma = free_dma, + ._get_dma_residue = get_dma_residue, + ._dma_mem_alloc = dma_mem_alloc, + ._dma_setup = hard_dma_setup }, { - vdma_request_dma, - vdma_nop, - vdma_get_dma_residue, - vdma_mem_alloc, - vdma_dma_setup + ._request_dma = vdma_request_dma, + ._free_dma = vdma_nop, + ._get_dma_residue = vdma_get_dma_residue, + ._dma_mem_alloc = vdma_mem_alloc, + ._dma_setup = vdma_dma_setup } }; diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 1429a7c736db..0877ae018fc9 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -27,16 +27,6 @@ extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); /* - * Some instructions like VIA's padlock instructions generate a spurious - * DNA fault but don't modify SSE registers. And these instructions - * get used from interrupt context as well. To prevent these kernel instructions - * in interrupt context interacting wrongly with other user/kernel fpu usage, we - * should use them only in the context of irq_ts_save/restore() - */ -extern int irq_ts_save(void); -extern void irq_ts_restore(int TS_state); - -/* * Query the presence of one or more xfeatures. Works on any legacy CPU as well. * * If 'feature_name' is set then put a human-readable description of diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 2737366ea583..d4a684997497 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -60,11 +60,6 @@ extern u64 fpu__get_supported_xfeatures_mask(void); /* * FPU related CPU feature flag helper routines: */ -static __always_inline __pure bool use_eager_fpu(void) -{ - return static_cpu_has(X86_FEATURE_EAGER_FPU); -} - static __always_inline __pure bool use_xsaveopt(void) { return static_cpu_has(X86_FEATURE_XSAVEOPT); @@ -484,42 +479,42 @@ extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size) DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* - * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx, - * on this CPU. + * The in-register FPU state for an FPU context on a CPU is assumed to be + * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx + * matches the FPU. * - * This will disable any lazy FPU state restore of the current FPU state, - * but if the current thread owns the FPU, it will still be saved by. + * If the FPU register state is valid, the kernel can skip restoring the + * FPU state from memory. + * + * Any code that clobbers the FPU registers or updates the in-memory + * FPU state for a task MUST let the rest of the kernel know that the + * FPU registers are no longer valid for this task. + * + * Either one of these invalidation functions is enough. Invalidate + * a resource you control: CPU if using the CPU for something else + * (with preemption disabled), FPU for the current task, or a task that + * is prevented from running by the current task. */ -static inline void __cpu_disable_lazy_restore(unsigned int cpu) +static inline void __cpu_invalidate_fpregs_state(void) { - per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } -static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) -{ - return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; -} - - -/* - * Wrap lazy FPU TS handling in a 'hw fpregs activation/deactivation' - * idiom, which is then paired with the sw-flag (fpregs_active) later on: - */ - -static inline void __fpregs_activate_hw(void) +static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) { - if (!use_eager_fpu()) - clts(); + fpu->last_cpu = -1; } -static inline void __fpregs_deactivate_hw(void) +static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) { - if (!use_eager_fpu()) - stts(); + return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } -/* Must be paired with an 'stts' (fpregs_deactivate_hw()) after! */ -static inline void __fpregs_deactivate(struct fpu *fpu) +/* + * These generally need preemption protection to work, + * do try to avoid using these on their own: + */ +static inline void fpregs_deactivate(struct fpu *fpu) { WARN_ON_FPU(!fpu->fpregs_active); @@ -528,8 +523,7 @@ static inline void __fpregs_deactivate(struct fpu *fpu) trace_x86_fpu_regs_deactivated(fpu); } -/* Must be paired with a 'clts' (fpregs_activate_hw()) before! */ -static inline void __fpregs_activate(struct fpu *fpu) +static inline void fpregs_activate(struct fpu *fpu) { WARN_ON_FPU(fpu->fpregs_active); @@ -554,51 +548,19 @@ static inline int fpregs_active(void) } /* - * Encapsulate the CR0.TS handling together with the - * software flag. - * - * These generally need preemption protection to work, - * do try to avoid using these on their own. - */ -static inline void fpregs_activate(struct fpu *fpu) -{ - __fpregs_activate_hw(); - __fpregs_activate(fpu); -} - -static inline void fpregs_deactivate(struct fpu *fpu) -{ - __fpregs_deactivate(fpu); - __fpregs_deactivate_hw(); -} - -/* * FPU state switching for scheduling. * * This is a two-stage process: * - * - switch_fpu_prepare() saves the old state and - * sets the new state of the CR0.TS bit. This is - * done within the context of the old process. + * - switch_fpu_prepare() saves the old state. + * This is done within the context of the old process. * * - switch_fpu_finish() restores the new state as * necessary. */ -typedef struct { int preload; } fpu_switch_t; - -static inline fpu_switch_t -switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) +static inline void +switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - fpu_switch_t fpu; - - /* - * If the task has used the math, pre-load the FPU on xsave processors - * or if the past 5 consecutive context-switches used math. - */ - fpu.preload = static_cpu_has(X86_FEATURE_FPU) && - new_fpu->fpstate_active && - (use_eager_fpu() || new_fpu->counter > 5); - if (old_fpu->fpregs_active) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; @@ -608,29 +570,8 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* But leave fpu_fpregs_owner_ctx! */ old_fpu->fpregs_active = 0; trace_x86_fpu_regs_deactivated(old_fpu); - - /* Don't change CR0.TS if we just switch! */ - if (fpu.preload) { - new_fpu->counter++; - __fpregs_activate(new_fpu); - trace_x86_fpu_regs_activated(new_fpu); - prefetch(&new_fpu->state); - } else { - __fpregs_deactivate_hw(); - } - } else { - old_fpu->counter = 0; + } else old_fpu->last_cpu = -1; - if (fpu.preload) { - new_fpu->counter++; - if (fpu_want_lazy_restore(new_fpu, cpu)) - fpu.preload = 0; - else - prefetch(&new_fpu->state); - fpregs_activate(new_fpu); - } - } - return fpu; } /* @@ -638,15 +579,19 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) */ /* - * By the time this gets called, we've already cleared CR0.TS and - * given the process the FPU if we are going to preload the FPU - * state - all we need to do is to conditionally restore the register - * state itself. + * Set up the userspace FPU context for the new task, if the task + * has used the FPU. */ -static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch) +static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) { - if (fpu_switch.preload) - copy_kernel_to_fpregs(&new_fpu->state); + bool preload = static_cpu_has(X86_FEATURE_FPU) && + new_fpu->fpstate_active; + + if (preload) { + if (!fpregs_state_valid(new_fpu, cpu)) + copy_kernel_to_fpregs(&new_fpu->state); + fpregs_activate(new_fpu); + } } /* diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 48df486b02f9..3c80f5b9c09d 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -322,17 +322,6 @@ struct fpu { unsigned char fpregs_active; /* - * @counter: - * - * This counter contains the number of consecutive context switches - * during which the FPU stays used. If this is over a threshold, the - * lazy FPU restore logic becomes eager, to save the trap overhead. - * This is an unsigned char so that after 256 iterations the counter - * wraps and the context switch behavior turns lazy again; this is to - * deal with bursty apps that only use the FPU for a short time: - */ - unsigned char counter; - /* * @state: * * In-memory copy of all FPU registers that we save/restore @@ -340,29 +329,6 @@ struct fpu { * the registers in the FPU are more recent than this state * copy. If the task context-switches away then they get * saved here and represent the FPU state. - * - * After context switches there may be a (short) time period - * during which the in-FPU hardware registers are unchanged - * and still perfectly match this state, if the tasks - * scheduled afterwards are not using the FPU. - * - * This is the 'lazy restore' window of optimization, which - * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'. - * - * We detect whether a subsequent task uses the FPU via setting - * CR0::TS to 1, which causes any FPU use to raise a #NM fault. - * - * During this window, if the task gets scheduled again, we - * might be able to skip having to do a restore from this - * memory buffer to the hardware registers - at the cost of - * incurring the overhead of #NM fault traps. - * - * Note that on modern CPUs that support the XSAVEOPT (or other - * optimized XSAVE instructions), we don't use #NM traps anymore, - * as the hardware can track whether FPU registers need saving - * or not. On such CPUs we activate the non-lazy ('eagerfpu') - * logic, which unconditionally saves/restores all FPU state - * across context switches. (if FPU state exists.) */ union fpregs_state state; /* diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 430bacf73074..1b2799e0699a 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -21,21 +21,16 @@ /* Supervisor features */ #define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT) -/* Supported features which support lazy state saving */ -#define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \ +/* All currently supported features */ +#define XCNTXT_MASK (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ XFEATURE_MASK_YMM | \ XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ - XFEATURE_MASK_Hi16_ZMM) - -/* Supported features which require eager state saving */ -#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR | \ - XFEATURE_MASK_PKRU) - -/* All currently supported features */ -#define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER) + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU | \ + XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR) #ifdef CONFIG_X86_64 #define REX_PREFIX "0x48, " diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h deleted file mode 100644 index c5d1785373ed..000000000000 --- a/arch/x86/include/asm/idle.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef _ASM_X86_IDLE_H -#define _ASM_X86_IDLE_H - -#define IDLE_START 1 -#define IDLE_END 2 - -struct notifier_block; -void idle_notifier_register(struct notifier_block *n); -void idle_notifier_unregister(struct notifier_block *n); - -#ifdef CONFIG_X86_64 -void enter_idle(void); -void exit_idle(void); -#else /* !CONFIG_X86_64 */ -static inline void enter_idle(void) { } -static inline void exit_idle(void) { } -static inline void __exit_idle(void) { } -#endif /* CONFIG_X86_64 */ - -void amd_e400_remove_cpu(int cpu); - -#endif /* _ASM_X86_IDLE_H */ diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index 5b6753d1f7f4..49da9f497b90 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -17,6 +17,7 @@ extern int intel_mid_pci_init(void); extern int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state); +extern pci_power_t intel_mid_pci_get_power_state(struct pci_dev *pdev); extern void intel_mid_pwr_power_off(void); diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index de25aad07853..d34bd370074b 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -351,4 +351,10 @@ extern void arch_phys_wc_del(int handle); #define arch_phys_wc_add arch_phys_wc_add #endif +#ifdef CONFIG_X86_PAT +extern int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size); +extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size); +#define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc +#endif + #endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index d31881188431..29a594a3b82a 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -21,7 +21,6 @@ enum die_val { DIE_NMIUNKNOWN, }; -extern void printk_address(unsigned long address); extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_stack_regs(struct pt_regs *regs); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4b20f7304b9c..7892530cbacf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -191,6 +191,8 @@ enum { #define PFERR_RSVD_BIT 3 #define PFERR_FETCH_BIT 4 #define PFERR_PK_BIT 5 +#define PFERR_GUEST_FINAL_BIT 32 +#define PFERR_GUEST_PAGE_BIT 33 #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) @@ -198,6 +200,13 @@ enum { #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) #define PFERR_PK_MASK (1U << PFERR_PK_BIT) +#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) +#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) + +#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ + PFERR_USER_MASK | \ + PFERR_WRITE_MASK | \ + PFERR_PRESENT_MASK) /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 @@ -948,7 +957,6 @@ struct kvm_x86_ops { int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); bool (*invpcid_supported)(void); - void (*adjust_tsc_offset_guest)(struct kvm_vcpu *vcpu, s64 adjustment); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -958,8 +966,6 @@ struct kvm_x86_ops { void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc); - void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); int (*check_intercept)(struct kvm_vcpu *vcpu, @@ -1065,6 +1071,7 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); +bool pdptrs_changed(struct kvm_vcpu *vcpu); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); @@ -1127,7 +1134,8 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); struct x86_emulate_ctxt; int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); +int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port); +int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int kvm_vcpu_halt(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); @@ -1206,7 +1214,7 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu); @@ -1361,7 +1369,8 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); -void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); +int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); +int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); int kvm_is_in_guest(void); diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index c2b8d24a235c..d74747b031ec 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -29,9 +29,20 @@ struct kvm_page_track_notifier_node { * @gpa: the physical address written by guest. * @new: the data was written to the address. * @bytes: the written length. + * @node: this node */ void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes); + int bytes, struct kvm_page_track_notifier_node *node); + /* + * It is called when memory slot is being moved or removed + * users can drop write-protection for the pages in that memory slot + * + * @kvm: the kvm where memory slot being moved or removed + * @slot: the memory slot being moved or removed + * @node: this node + */ + void (*track_flush_slot)(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_page_track_notifier_node *node); }; void kvm_page_track_init(struct kvm *kvm); @@ -58,4 +69,5 @@ kvm_page_track_unregister_notifier(struct kvm *kvm, struct kvm_page_track_notifier_node *n); void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes); +void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot); #endif diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index ef01fef3eebc..6c119cfae218 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -9,7 +9,6 @@ #define LHCALL_FLUSH_TLB 5 #define LHCALL_LOAD_IDT_ENTRY 6 #define LHCALL_SET_STACK 7 -#define LHCALL_TS 8 #define LHCALL_SET_CLOCKEVENT 9 #define LHCALL_HALT 10 #define LHCALL_SET_PMD 13 diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 9bd7ff5ffbcc..5132f2a6c0a2 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -252,8 +252,10 @@ static inline void cmci_recheck(void) {} #ifdef CONFIG_X86_MCE_AMD void mce_amd_feature_init(struct cpuinfo_x86 *c); +int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr); #else static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } +static inline int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; }; #endif int mce_available(struct cpuinfo_x86 *c); @@ -293,9 +295,7 @@ void do_machine_check(struct pt_regs *, long); /* * Threshold handler */ - extern void (*mce_threshold_vector)(void); -extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); /* Deferred error interrupt handler */ extern void (*deferred_error_int_vector)(void); @@ -356,27 +356,31 @@ enum smca_bank_types { N_SMCA_BANK_TYPES }; -struct smca_bank_name { - const char *name; /* Short name for sysfs */ - const char *long_name; /* Long name for pretty-printing */ -}; - -extern struct smca_bank_name smca_bank_names[N_SMCA_BANK_TYPES]; +#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype)) -#define HWID_MCATYPE(hwid, mcatype) ((hwid << 16) | mcatype) - -struct smca_hwid_mcatype { +struct smca_hwid { unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ u32 hwid_mcatype; /* (hwid,mcatype) tuple */ u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. */ }; -struct smca_bank_info { - struct smca_hwid_mcatype *type; - u32 type_instance; +struct smca_bank { + struct smca_hwid *hwid; + /* Instance ID */ + u32 id; }; -extern struct smca_bank_info smca_banks[MAX_NR_BANKS]; +extern struct smca_bank smca_banks[MAX_NR_BANKS]; + +extern const char *smca_get_long_name(enum smca_bank_types t); + +extern int mce_threshold_create_device(unsigned int cpu); +extern int mce_threshold_remove_device(unsigned int cpu); + +#else + +static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; +static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; #endif diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index da0d81fa0b54..38711df3bcb5 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -20,6 +20,15 @@ do { \ (u32)((u64)(val)), \ (u32)((u64)(val) >> 32)) +struct ucode_patch { + struct list_head plist; + void *data; /* Intel uses only this one */ + u32 patch_id; + u16 equiv_cpu; +}; + +extern struct list_head microcode_cache; + struct cpu_signature { unsigned int sig; unsigned int pf; @@ -55,12 +64,7 @@ struct ucode_cpu_info { void *mc; }; extern struct ucode_cpu_info ucode_cpu_info[]; - -#ifdef CONFIG_MICROCODE -int __init microcode_init(void); -#else -static inline int __init microcode_init(void) { return 0; }; -#endif +struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa); #ifdef CONFIG_MICROCODE_INTEL extern struct microcode_ops * __init init_intel_microcode(void); @@ -131,11 +135,13 @@ static inline unsigned int x86_cpuid_family(void) } #ifdef CONFIG_MICROCODE +int __init microcode_init(void); extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); void reload_early_microcode(void); extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); #else +static inline int __init microcode_init(void) { return 0; }; static inline void __init load_ucode_bsp(void) { } static inline void load_ucode_ap(void) { } static inline void reload_early_microcode(void) { } diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index 15eb75484cc0..3e3e20be829a 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -40,38 +40,18 @@ struct microcode_amd { unsigned int mpb[0]; }; -static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, - unsigned int sig) -{ - int i = 0; - - if (!equiv_cpu_table) - return 0; - - while (equiv_cpu_table[i].installed_cpu != 0) { - if (sig == equiv_cpu_table[i].installed_cpu) - return equiv_cpu_table[i].equiv_cpu; - - i++; - } - return 0; -} - -extern int __apply_microcode_amd(struct microcode_amd *mc_amd); -extern int apply_microcode_amd(int cpu); -extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); - #define PATCH_MAX_SIZE PAGE_SIZE #ifdef CONFIG_MICROCODE_AMD extern void __init load_ucode_amd_bsp(unsigned int family); -extern void load_ucode_amd_ap(void); -extern int __init save_microcode_in_initrd_amd(void); +extern void load_ucode_amd_ap(unsigned int family); +extern int __init save_microcode_in_initrd_amd(unsigned int family); void reload_ucode_amd(void); #else static inline void __init load_ucode_amd_bsp(unsigned int family) {} -static inline void load_ucode_amd_ap(void) {} -static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; } +static inline void load_ucode_amd_ap(unsigned int family) {} +static inline int __init +save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } void reload_ucode_amd(void) {} #endif diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 5e69154c9f07..195becc6f780 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -52,10 +52,6 @@ struct extended_sigtable { #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) -extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev); -extern int microcode_sanity_check(void *mc, int print_err); -extern int find_matching_signature(void *mc, unsigned int csig, int cpf); - #ifdef CONFIG_MICROCODE_INTEL extern void __init load_ucode_intel_bsp(void); extern void load_ucode_intel_ap(void); diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 72198c64e646..f9813b6d8b80 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -31,6 +31,10 @@ typedef struct { u16 pkey_allocation_map; s16 execute_only_pkey; #endif +#ifdef CONFIG_X86_INTEL_MPX + /* address of the bounds directory */ + void __user *bd_addr; +#endif } mm_context_t; #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 8e0a9fe86de4..306c7e12af55 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -47,7 +47,7 @@ struct ldt_struct { * allocations, but it's not worth trying to optimize. */ struct desc_struct *entries; - int size; + unsigned int size; }; /* diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index 7a35495275a9..0b416d4cf73b 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -59,7 +59,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs); int mpx_handle_bd_fault(void); static inline int kernel_managing_mpx_tables(struct mm_struct *mm) { - return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR); + return (mm->context.bd_addr != MPX_INVALID_BOUNDS_DIR); } static inline void mpx_mm_init(struct mm_struct *mm) { @@ -67,7 +67,7 @@ static inline void mpx_mm_init(struct mm_struct *mm) * NULL is theoretically a valid place to put the bounds * directory, so point this at an invalid address. */ - mm->bd_addr = MPX_INVALID_BOUNDS_DIR; + mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR; } void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start, unsigned long end); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 78f3760ca1f2..710273c617b8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -37,6 +37,10 @@ #define EFER_FFXSR (1<<_EFER_FFXSR) /* Intel MSRs. Some also available on other CPUs */ + +#define MSR_PPIN_CTL 0x0000004e +#define MSR_PPIN 0x0000004f + #define MSR_IA32_PERFCTR0 0x000000c1 #define MSR_IA32_PERFCTR1 0x000000c2 #define MSR_FSB_FREQ 0x000000cd diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index b5fee97813cd..db0b90c3b03e 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -70,14 +70,14 @@ extern struct tracepoint __tracepoint_read_msr; extern struct tracepoint __tracepoint_write_msr; extern struct tracepoint __tracepoint_rdpmc; #define msr_tracepoint_active(t) static_key_false(&(t).key) -extern void do_trace_write_msr(unsigned msr, u64 val, int failed); -extern void do_trace_read_msr(unsigned msr, u64 val, int failed); -extern void do_trace_rdpmc(unsigned msr, u64 val, int failed); +extern void do_trace_write_msr(unsigned int msr, u64 val, int failed); +extern void do_trace_read_msr(unsigned int msr, u64 val, int failed); +extern void do_trace_rdpmc(unsigned int msr, u64 val, int failed); #else #define msr_tracepoint_active(t) false -static inline void do_trace_write_msr(unsigned msr, u64 val, int failed) {} -static inline void do_trace_read_msr(unsigned msr, u64 val, int failed) {} -static inline void do_trace_rdpmc(unsigned msr, u64 val, int failed) {} +static inline void do_trace_write_msr(unsigned int msr, u64 val, int failed) {} +static inline void do_trace_read_msr(unsigned int msr, u64 val, int failed) {} +static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {} #endif static inline unsigned long long native_read_msr(unsigned int msr) @@ -115,22 +115,36 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, } /* Can be uninlined because referenced by paravirt */ -notrace static inline void native_write_msr(unsigned int msr, - unsigned low, unsigned high) +static inline void notrace +__native_write_msr_notrace(unsigned int msr, u32 low, u32 high) { asm volatile("1: wrmsr\n" "2:\n" _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) : : "c" (msr), "a"(low), "d" (high) : "memory"); +} + +/* Can be uninlined because referenced by paravirt */ +static inline void notrace +native_write_msr(unsigned int msr, u32 low, u32 high) +{ + __native_write_msr_notrace(msr, low, high); if (msr_tracepoint_active(__tracepoint_write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), 0); } +static inline void +wrmsr_notrace(unsigned int msr, u32 low, u32 high) +{ + __native_write_msr_notrace(msr, low, high); +} + /* Can be uninlined because referenced by paravirt */ -notrace static inline int native_write_msr_safe(unsigned int msr, - unsigned low, unsigned high) +static inline int notrace +native_write_msr_safe(unsigned int msr, u32 low, u32 high) { int err; + asm volatile("2: wrmsr ; xor %[err],%[err]\n" "1:\n\t" ".section .fixup,\"ax\"\n\t" @@ -223,7 +237,7 @@ do { \ (void)((high) = (u32)(__val >> 32)); \ } while (0) -static inline void wrmsr(unsigned msr, unsigned low, unsigned high) +static inline void wrmsr(unsigned int msr, u32 low, u32 high) { native_write_msr(msr, low, high); } @@ -231,13 +245,13 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high) #define rdmsrl(msr, val) \ ((val) = native_read_msr((msr))) -static inline void wrmsrl(unsigned msr, u64 val) +static inline void wrmsrl(unsigned int msr, u64 val) { native_write_msr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32)); } /* wrmsr with exception handling */ -static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high) +static inline int wrmsr_safe(unsigned int msr, u32 low, u32 high) { return native_write_msr_safe(msr, low, high); } @@ -252,7 +266,7 @@ static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high) __err; \ }) -static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) +static inline int rdmsrl_safe(unsigned int msr, unsigned long long *p) { int err; @@ -325,12 +339,12 @@ static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no, struct msr *msrs) { - rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h)); + rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h)); } static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no, struct msr *msrs) { - wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h); + wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h); } static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) diff --git a/arch/x86/include/asm/mutex.h b/arch/x86/include/asm/mutex.h deleted file mode 100644 index 7d3a48275394..000000000000 --- a/arch/x86/include/asm/mutex.h +++ /dev/null @@ -1,5 +0,0 @@ -#ifdef CONFIG_X86_32 -# include <asm/mutex_32.h> -#else -# include <asm/mutex_64.h> -#endif diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h deleted file mode 100644 index e9355a84fc67..000000000000 --- a/arch/x86/include/asm/mutex_32.h +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Assembly implementation of the mutex fastpath, based on atomic - * decrement/increment. - * - * started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - */ -#ifndef _ASM_X86_MUTEX_32_H -#define _ASM_X86_MUTEX_32_H - -#include <asm/alternative.h> - -/** - * __mutex_fastpath_lock - try to take the lock by moving the count - * from 1 to a 0 value - * @count: pointer of type atomic_t - * @fn: function to call if the original value was not 1 - * - * Change the count from 1 to a value lower than 1, and call <fn> if it - * wasn't 1 originally. This function MUST leave the value lower than 1 - * even when the "1" assertion wasn't true. - */ -#define __mutex_fastpath_lock(count, fail_fn) \ -do { \ - unsigned int dummy; \ - \ - typecheck(atomic_t *, count); \ - typecheck_fn(void (*)(atomic_t *), fail_fn); \ - \ - asm volatile(LOCK_PREFIX " decl (%%eax)\n" \ - " jns 1f \n" \ - " call " #fail_fn "\n" \ - "1:\n" \ - : "=a" (dummy) \ - : "a" (count) \ - : "memory", "ecx", "edx"); \ -} while (0) - - -/** - * __mutex_fastpath_lock_retval - try to take the lock by moving the count - * from 1 to a 0 value - * @count: pointer of type atomic_t - * - * Change the count from 1 to a value lower than 1. This function returns 0 - * if the fastpath succeeds, or -1 otherwise. - */ -static inline int __mutex_fastpath_lock_retval(atomic_t *count) -{ - if (unlikely(atomic_dec_return(count) < 0)) - return -1; - else - return 0; -} - -/** - * __mutex_fastpath_unlock - try to promote the mutex from 0 to 1 - * @count: pointer of type atomic_t - * @fail_fn: function to call if the original value was not 0 - * - * try to promote the mutex from 0 to 1. if it wasn't 0, call <fail_fn>. - * In the failure case, this function is allowed to either set the value - * to 1, or to set it to a value lower than 1. - * - * If the implementation sets it to a value of lower than 1, the - * __mutex_slowpath_needs_to_unlock() macro needs to return 1, it needs - * to return 0 otherwise. - */ -#define __mutex_fastpath_unlock(count, fail_fn) \ -do { \ - unsigned int dummy; \ - \ - typecheck(atomic_t *, count); \ - typecheck_fn(void (*)(atomic_t *), fail_fn); \ - \ - asm volatile(LOCK_PREFIX " incl (%%eax)\n" \ - " jg 1f\n" \ - " call " #fail_fn "\n" \ - "1:\n" \ - : "=a" (dummy) \ - : "a" (count) \ - : "memory", "ecx", "edx"); \ -} while (0) - -#define __mutex_slowpath_needs_to_unlock() 1 - -/** - * __mutex_fastpath_trylock - try to acquire the mutex, without waiting - * - * @count: pointer of type atomic_t - * @fail_fn: fallback function - * - * Change the count from 1 to a value lower than 1, and return 0 (failure) - * if it wasn't 1 originally, or return 1 (success) otherwise. This function - * MUST leave the value lower than 1 even when the "1" assertion wasn't true. - * Additionally, if the value was < 0 originally, this function must not leave - * it to 0 on failure. - */ -static inline int __mutex_fastpath_trylock(atomic_t *count, - int (*fail_fn)(atomic_t *)) -{ - /* cmpxchg because it never induces a false contention state. */ - if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1)) - return 1; - - return 0; -} - -#endif /* _ASM_X86_MUTEX_32_H */ diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h deleted file mode 100644 index d9850758464e..000000000000 --- a/arch/x86/include/asm/mutex_64.h +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Assembly implementation of the mutex fastpath, based on atomic - * decrement/increment. - * - * started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - */ -#ifndef _ASM_X86_MUTEX_64_H -#define _ASM_X86_MUTEX_64_H - -/** - * __mutex_fastpath_lock - decrement and call function if negative - * @v: pointer of type atomic_t - * @fail_fn: function to call if the result is negative - * - * Atomically decrements @v and calls <fail_fn> if the result is negative. - */ -#ifdef CC_HAVE_ASM_GOTO -static inline void __mutex_fastpath_lock(atomic_t *v, - void (*fail_fn)(atomic_t *)) -{ - asm_volatile_goto(LOCK_PREFIX " decl %0\n" - " jns %l[exit]\n" - : : "m" (v->counter) - : "memory", "cc" - : exit); - fail_fn(v); -exit: - return; -} -#else -#define __mutex_fastpath_lock(v, fail_fn) \ -do { \ - unsigned long dummy; \ - \ - typecheck(atomic_t *, v); \ - typecheck_fn(void (*)(atomic_t *), fail_fn); \ - \ - asm volatile(LOCK_PREFIX " decl (%%rdi)\n" \ - " jns 1f \n" \ - " call " #fail_fn "\n" \ - "1:" \ - : "=D" (dummy) \ - : "D" (v) \ - : "rax", "rsi", "rdx", "rcx", \ - "r8", "r9", "r10", "r11", "memory"); \ -} while (0) -#endif - -/** - * __mutex_fastpath_lock_retval - try to take the lock by moving the count - * from 1 to a 0 value - * @count: pointer of type atomic_t - * - * Change the count from 1 to a value lower than 1. This function returns 0 - * if the fastpath succeeds, or -1 otherwise. - */ -static inline int __mutex_fastpath_lock_retval(atomic_t *count) -{ - if (unlikely(atomic_dec_return(count) < 0)) - return -1; - else - return 0; -} - -/** - * __mutex_fastpath_unlock - increment and call function if nonpositive - * @v: pointer of type atomic_t - * @fail_fn: function to call if the result is nonpositive - * - * Atomically increments @v and calls <fail_fn> if the result is nonpositive. - */ -#ifdef CC_HAVE_ASM_GOTO -static inline void __mutex_fastpath_unlock(atomic_t *v, - void (*fail_fn)(atomic_t *)) -{ - asm_volatile_goto(LOCK_PREFIX " incl %0\n" - " jg %l[exit]\n" - : : "m" (v->counter) - : "memory", "cc" - : exit); - fail_fn(v); -exit: - return; -} -#else -#define __mutex_fastpath_unlock(v, fail_fn) \ -do { \ - unsigned long dummy; \ - \ - typecheck(atomic_t *, v); \ - typecheck_fn(void (*)(atomic_t *), fail_fn); \ - \ - asm volatile(LOCK_PREFIX " incl (%%rdi)\n" \ - " jg 1f\n" \ - " call " #fail_fn "\n" \ - "1:" \ - : "=D" (dummy) \ - : "D" (v) \ - : "rax", "rsi", "rdx", "rcx", \ - "r8", "r9", "r10", "r11", "memory"); \ -} while (0) -#endif - -#define __mutex_slowpath_needs_to_unlock() 1 - -/** - * __mutex_fastpath_trylock - try to acquire the mutex, without waiting - * - * @count: pointer of type atomic_t - * @fail_fn: fallback function - * - * Change the count from 1 to 0 and return 1 (success), or return 0 (failure) - * if it wasn't 1 originally. [the fallback function is never used on - * x86_64, because all x86_64 CPUs have a CMPXCHG instruction.] - */ -static inline int __mutex_fastpath_trylock(atomic_t *count, - int (*fail_fn)(atomic_t *)) -{ - if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1)) - return 1; - - return 0; -} - -#endif /* _ASM_X86_MUTEX_64_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index ce932812f142..1eea6ca40694 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -41,11 +41,6 @@ static inline void set_debugreg(unsigned long val, int reg) PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val); } -static inline void clts(void) -{ - PVOP_VCALL0(pv_cpu_ops.clts); -} - static inline unsigned long read_cr0(void) { return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0); @@ -678,6 +673,11 @@ static __always_inline void pv_kick(int cpu) PVOP_VCALL1(pv_lock_ops.kick, cpu); } +static __always_inline bool pv_vcpu_is_preempted(int cpu) +{ + return PVOP_CALLEE1(bool, pv_lock_ops.vcpu_is_preempted, cpu); +} + #endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 0f400c0e4979..bb2de45a60f2 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -103,8 +103,6 @@ struct pv_cpu_ops { unsigned long (*get_debugreg)(int regno); void (*set_debugreg)(int regno, unsigned long value); - void (*clts)(void); - unsigned long (*read_cr0)(void); void (*write_cr0)(unsigned long); @@ -310,6 +308,8 @@ struct pv_lock_ops { void (*wait)(u8 *ptr, u8 val); void (*kick)(int cpu); + + struct paravirt_callee_save vcpu_is_preempted; }; /* This contains all the paravirt structures: we get a convenient @@ -508,6 +508,18 @@ int paravirt_disable_iospace(void); #define PVOP_TEST_NULL(op) ((void)op) #endif +#define PVOP_RETMASK(rettype) \ + ({ unsigned long __mask = ~0UL; \ + switch (sizeof(rettype)) { \ + case 1: __mask = 0xffUL; break; \ + case 2: __mask = 0xffffUL; break; \ + case 4: __mask = 0xffffffffUL; break; \ + default: break; \ + } \ + __mask; \ + }) + + #define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ pre, post, ...) \ ({ \ @@ -535,7 +547,7 @@ int paravirt_disable_iospace(void); paravirt_clobber(clbr), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ - __ret = (rettype)__eax; \ + __ret = (rettype)(__eax & PVOP_RETMASK(rettype)); \ } \ __ret; \ }) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 84f58de08c2b..9fa03604b2b3 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -507,17 +507,6 @@ do { \ #endif -/* This is not atomic against other CPUs -- CPU preemption needs to be off */ -#define x86_test_and_clear_bit_percpu(bit, var) \ -({ \ - bool old__; \ - asm volatile("btr %2,"__percpu_arg(1)"\n\t" \ - CC_SET(c) \ - : CC_OUT(c) (old__), "+m" (var) \ - : "dIr" (bit)); \ - old__; \ -}) - static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr, const unsigned long __percpu *addr) { diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 1cc82ece9ac1..62b775926045 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -116,8 +116,7 @@ static inline void native_pgd_clear(pgd_t *pgd) native_set_pgd(pgd, native_make_pgd(0)); } -extern void sync_global_pgds(unsigned long start, unsigned long end, - int removed); +extern void sync_global_pgds(unsigned long start, unsigned long end); /* * Conversion functions: convert a page and protection to a page entry, diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 17f218645701..ec1f3c651150 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -24,7 +24,13 @@ static __always_inline int preempt_count(void) static __always_inline void preempt_count_set(int pc) { - raw_cpu_write_4(__preempt_count, pc); + int old, new; + + do { + old = raw_cpu_read_4(__preempt_count); + new = (old & PREEMPT_NEED_RESCHED) | + (pc & ~PREEMPT_NEED_RESCHED); + } while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old); } /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 984a7bf17f6a..6aa741fbe1df 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -137,6 +137,17 @@ struct cpuinfo_x86 { u32 microcode; }; +struct cpuid_regs { + u32 eax, ebx, ecx, edx; +}; + +enum cpuid_regs_idx { + CPUID_EAX = 0, + CPUID_EBX, + CPUID_ECX, + CPUID_EDX, +}; + #define X86_VENDOR_INTEL 0 #define X86_VENDOR_CYRIX 1 #define X86_VENDOR_AMD 2 @@ -178,6 +189,9 @@ extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); +extern u32 get_scattered_cpuid_leaf(unsigned int level, + unsigned int sub_leaf, + enum cpuid_regs_idx reg); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); @@ -588,8 +602,6 @@ static __always_inline void cpu_relax(void) rep_nop(); } -#define cpu_relax_lowlatency() cpu_relax() - /* Stop speculative execution and prefetching of modified code. */ static inline void sync_core(void) { @@ -621,10 +633,9 @@ static inline void sync_core(void) } extern void select_idle_routine(const struct cpuinfo_x86 *c); -extern void init_amd_e400_c1e_mask(void); +extern void amd_e400_c1e_apic_setup(void); extern unsigned long boot_option_idle_override; -extern bool amd_e400_c1e_detected; enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT, IDLE_POLL}; diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index eaba08076030..c343ab52579f 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -32,6 +32,12 @@ static inline void queued_spin_unlock(struct qspinlock *lock) { pv_queued_spin_unlock(lock); } + +#define vcpu_is_preempted vcpu_is_preempted +static inline bool vcpu_is_preempted(int cpu) +{ + return pv_vcpu_is_preempted(cpu); +} #else static inline void queued_spin_unlock(struct qspinlock *lock) { diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 19a2224f9e16..12af3e35edfa 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -6,11 +6,6 @@ #include <asm/nops.h> -static inline void native_clts(void) -{ - asm volatile("clts"); -} - /* * Volatile isn't enough to prevent the compiler from reordering the * read/write functions for the control registers and messing everything up. @@ -208,16 +203,8 @@ static inline void load_gs_index(unsigned selector) #endif -/* Clear the 'TS' bit */ -static inline void clts(void) -{ - native_clts(); -} - #endif/* CONFIG_PARAVIRT */ -#define stts() write_cr0(read_cr0() | X86_CR0_TS) - static inline void clflush(volatile void *__p) { asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 37f2e0b377ad..a3269c897ec5 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -30,8 +30,7 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); -void stack_type_str(enum stack_type type, const char **begin, - const char **end); +const char *stack_type_name(enum stack_type type); static inline bool on_stack(struct stack_info *info, void *addr, size_t len) { @@ -43,8 +42,6 @@ static inline bool on_stack(struct stack_info *info, void *addr, size_t len) addr + len > begin && addr + len <= end); } -extern int kstack_depth_to_print; - #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 #else @@ -86,9 +83,6 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs) void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, char *log_lvl); -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl); - extern unsigned int code_bytes; /* The form of the top of the frame on the stack */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 2aaca53c0974..ad6f5eb07a95 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -52,6 +52,15 @@ struct task_struct; #include <asm/cpufeature.h> #include <linux/atomic.h> +struct thread_info { + unsigned long flags; /* low level flags */ +}; + +#define INIT_THREAD_INFO(tsk) \ +{ \ + .flags = 0, \ +} + #define init_stack (init_thread_union.stack) #else /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index cf75871d2f81..6358a85e2270 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -146,4 +146,36 @@ struct pci_bus; int x86_pci_root_bus_node(int bus); void x86_pci_root_bus_resources(int bus, struct list_head *resources); +extern bool x86_topology_update; + +#ifdef CONFIG_SCHED_MC_PRIO +#include <asm/percpu.h> + +DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority); +extern unsigned int __read_mostly sysctl_sched_itmt_enabled; + +/* Interface to set priority of a cpu */ +void sched_set_itmt_core_prio(int prio, int core_cpu); + +/* Interface to notify scheduler that system supports ITMT */ +int sched_set_itmt_support(void); + +/* Interface to notify scheduler that system revokes ITMT support */ +void sched_clear_itmt_support(void); + +#else /* CONFIG_SCHED_MC_PRIO */ + +#define sysctl_sched_itmt_enabled 0 +static inline void sched_set_itmt_core_prio(int prio, int core_cpu) +{ +} +static inline int sched_set_itmt_support(void) +{ + return 0; +} +static inline void sched_clear_itmt_support(void) +{ +} +#endif /* CONFIG_SCHED_MC_PRIO */ + #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h index 2fbc66c7885b..2422b14c50a7 100644 --- a/arch/x86/include/asm/trace/exceptions.h +++ b/arch/x86/include/asm/trace/exceptions.h @@ -6,7 +6,7 @@ #include <linux/tracepoint.h> -extern void trace_irq_vector_regfunc(void); +extern int trace_irq_vector_regfunc(void); extern void trace_irq_vector_unregfunc(void); DECLARE_EVENT_CLASS(x86_exceptions, diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 9217ab1f5bf6..342e59789fcd 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -14,7 +14,6 @@ DECLARE_EVENT_CLASS(x86_fpu, __field(struct fpu *, fpu) __field(bool, fpregs_active) __field(bool, fpstate_active) - __field(int, counter) __field(u64, xfeatures) __field(u64, xcomp_bv) ), @@ -23,17 +22,15 @@ DECLARE_EVENT_CLASS(x86_fpu, __entry->fpu = fpu; __entry->fpregs_active = fpu->fpregs_active; __entry->fpstate_active = fpu->fpstate_active; - __entry->counter = fpu->counter; if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d counter: %d xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, __entry->fpregs_active, __entry->fpstate_active, - __entry->counter, __entry->xfeatures, __entry->xcomp_bv ) diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 38a09a13a9bc..32dd6a9e343c 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -6,7 +6,7 @@ #include <linux/tracepoint.h> -extern void trace_irq_vector_regfunc(void); +extern int trace_irq_vector_regfunc(void); extern void trace_irq_vector_unregfunc(void); DECLARE_EVENT_CLASS(x86_irq_vector, diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 33b6365c22fe..abb1fdcc545a 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -45,8 +45,17 @@ extern int tsc_clocksource_reliable; * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: */ +#ifdef CONFIG_X86_TSC +extern bool tsc_store_and_check_tsc_adjust(bool bootcpu); +extern void tsc_verify_tsc_adjust(bool resume); extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); +#else +static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return false; } +static inline void tsc_verify_tsc_adjust(bool resume) { } +static inline void check_tsc_sync_source(int cpu) { } +static inline void check_tsc_sync_target(void) { } +#endif extern int notsc_setup(char *); extern void tsc_save_sched_clock_state(void); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index faf3687f1035..ea148313570f 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -68,6 +68,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un __chk_range_not_ok((unsigned long __force)(addr), size, limit); \ }) +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP +# define WARN_ON_IN_IRQ() WARN_ON_ONCE(!in_task()) +#else +# define WARN_ON_IN_IRQ() +#endif + /** * access_ok: - Checks if a user space pointer is valid * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that @@ -88,8 +94,11 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un * checks that the pointer is in the user space range - after calling * this function, memory access functions may still return -EFAULT. */ -#define access_ok(type, addr, size) \ - likely(!__range_not_ok(addr, size, user_addr_max())) +#define access_ok(type, addr, size) \ +({ \ + WARN_ON_IN_IRQ(); \ + likely(!__range_not_ok(addr, size, user_addr_max())); \ +}) /* * These are the main single-value transfer routines. They automatically diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 46de9ac4b990..c5a7f3a930dd 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -13,6 +13,7 @@ struct unwind_state { int graph_idx; #ifdef CONFIG_FRAME_POINTER unsigned long *bp; + struct pt_regs *regs; #else unsigned long *sp; #endif @@ -47,7 +48,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) if (unwind_done(state)) return NULL; - return state->bp + 1; + return state->regs ? &state->regs->ip : state->bp + 1; +} + +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + return state->regs; } #else /* !CONFIG_FRAME_POINTER */ @@ -58,6 +67,11 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) return NULL; } +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +{ + return NULL; +} + #endif /* CONFIG_FRAME_POINTER */ #endif /* _ASM_X86_UNWIND_H */ diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index e728699db774..3a01996db58f 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -89,8 +89,13 @@ static inline unsigned int __getcpu(void) * works on all CPUs. This is volatile so that it orders * correctly wrt barrier() and to keep gcc from cleverly * hoisting it out of the calling function. + * + * If RDPID is available, use it. */ - asm volatile ("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + alternative_io ("lsl %[p],%[seg]", + ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ + X86_FEATURE_RDPID, + [p] "=a" (p), [seg] "r" (__PER_CPU_SEG)); return p; } diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index a002b07a7099..2b5b2d4b924e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -25,6 +25,7 @@ #define VMX_H +#include <linux/bitops.h> #include <linux/types.h> #include <uapi/asm/vmx.h> @@ -60,6 +61,7 @@ */ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 +#define SECONDARY_EXEC_DESC 0x00000004 #define SECONDARY_EXEC_RDTSCP 0x00000008 #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 @@ -110,6 +112,36 @@ #define VMX_MISC_SAVE_EFER_LMA 0x00000020 #define VMX_MISC_ACTIVITY_HLT 0x00000040 +static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) +{ + return vmx_basic & GENMASK_ULL(30, 0); +} + +static inline u32 vmx_basic_vmcs_size(u64 vmx_basic) +{ + return (vmx_basic & GENMASK_ULL(44, 32)) >> 32; +} + +static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) +{ + return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; +} + +static inline int vmx_misc_cr3_count(u64 vmx_misc) +{ + return (vmx_misc & GENMASK_ULL(24, 16)) >> 16; +} + +static inline int vmx_misc_max_msr(u64 vmx_misc) +{ + return (vmx_misc & GENMASK_ULL(27, 25)) >> 25; +} + +static inline int vmx_misc_mseg_revid(u64 vmx_misc) +{ + return (vmx_misc & GENMASK_ULL(63, 32)) >> 32; +} + /* VMCS Encodings */ enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, @@ -399,10 +431,11 @@ enum vmcs_field { #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 2) #define VMX_NR_VPIDS (1 << 16) +#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 #define VMX_VPID_EXTENT_ALL_CONTEXT 2 +#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL 3 -#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_EPT_EXTENT_CONTEXT 1 #define VMX_EPT_EXTENT_GLOBAL 2 #define VMX_EPT_EXTENT_SHIFT 24 @@ -419,8 +452,10 @@ enum vmcs_field { #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) #define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */ +#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT (1ull << 8) /* (40 - 32) */ #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ +#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT (1ull << 11) /* (43 - 32) */ #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index c18ce67495fa..b10bf319ed20 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -7,6 +7,7 @@ #define SETUP_DTB 2 #define SETUP_PCI 3 #define SETUP_EFI 4 +#define SETUP_APPLE_PROPERTIES 5 /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 94dc8ca434e0..1421a6585126 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -45,7 +45,9 @@ struct kvm_steal_time { __u64 steal; __u32 version; __u32 flags; - __u32 pad[12]; + __u8 preempted; + __u8 u8_pad[3]; + __u32 pad[11]; }; #define KVM_STEAL_ALIGNMENT_BITS 5 diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index 69a6e07e3149..eb6247a7009b 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -28,6 +28,7 @@ struct mce { __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */ __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ + __u64 ppin; /* Protected Processor Inventory Number */ }; #define MCE_GET_RECORD_LEN _IOR('M', 1, int) diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index ae135de547f5..835aa51c7f6e 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -6,10 +6,8 @@ #define ARCH_GET_FS 0x1003 #define ARCH_GET_GS 0x1004 -#ifdef CONFIG_CHECKPOINT_RESTORE -# define ARCH_MAP_VDSO_X32 0x2001 -# define ARCH_MAP_VDSO_32 0x2002 -# define ARCH_MAP_VDSO_64 0x2003 -#endif +#define ARCH_MAP_VDSO_X32 0x2001 +#define ARCH_MAP_VDSO_32 0x2002 +#define ARCH_MAP_VDSO_64 0x2003 #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 37fee272618f..14458658e988 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -65,6 +65,8 @@ #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 #define EXIT_REASON_EOI_INDUCED 45 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_INVEPT 50 @@ -113,6 +115,8 @@ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ + { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \ + { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ { EXIT_REASON_INVEPT, "INVEPT" }, \ @@ -129,6 +133,7 @@ { EXIT_REASON_XRSTORS, "XRSTORS" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 +#define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 79076d75bdbf..581386c7e429 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -75,7 +75,7 @@ apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SMP) += smpboot.o -obj-$(CONFIG_SMP) += tsc_sync.o +obj-$(CONFIG_X86_TSC) += tsc_sync.o obj-$(CONFIG_SMP) += setup_percpu.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ @@ -123,6 +123,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o +obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o ifdef CONFIG_FRAME_POINTER obj-y += unwind_frame.o diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c index c280df6b2aa2..ea3046e0b0cf 100644 --- a/arch/x86/kernel/acpi/apei.c +++ b/arch/x86/kernel/acpi/apei.c @@ -24,9 +24,6 @@ int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data) struct acpi_hest_ia_corrected *cmc; struct acpi_hest_ia_error_bank *mc_bank; - if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) - return 0; - cmc = (struct acpi_hest_ia_corrected *)hest_hdr; if (!cmc->enabled) return 0; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8a5abaa7d453..6f65b0eed384 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -76,6 +76,7 @@ int acpi_fix_pin2_polarity __initdata; static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; #endif +#ifdef CONFIG_X86_IO_APIC /* * Locks related to IOAPIC hotplug * Hotplug side: @@ -88,6 +89,7 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; * ->ioapic_lock */ static DEFINE_MUTEX(acpi_ioapic_lock); +#endif /* -------------------------------------------------------------------------- Boot-time Configuration @@ -454,6 +456,7 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK; mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); + acpi_penalize_sci_irq(bus_irq, trigger, polarity); /* * stash over-ride to indicate we've been here @@ -712,7 +715,7 @@ int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) int nid; nid = acpi_get_node(handle); - if (nid != -1) { + if (nid != NUMA_NO_NODE) { set_apicid_to_node(physid, nid); numa_set_node(cpu, nid); } diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 169963f471bb..50b8ed0317a3 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -109,6 +109,15 @@ ENTRY(do_suspend_lowlevel) movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 +#ifdef CONFIG_KASAN + /* + * The suspend path may have poisoned some areas deeper in the stack, + * which we now need to unpoison. + */ + movq %rsp, %rdi + call kasan_unpoison_task_stack_below +#endif + xorl %eax, %eax addq $8, %rsp FRAME_END diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 4fdf6230d93c..458da8509b75 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -13,8 +13,20 @@ #include <linux/spinlock.h> #include <asm/amd_nb.h> +#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 +#define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463 +#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 + +/* Protect the PCI config register pairs used for SMN and DF indirect access. */ +static DEFINE_MUTEX(smn_mutex); + static u32 *flush_words; +static const struct pci_device_id amd_root_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) }, + {} +}; + const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, @@ -24,9 +36,10 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, {} }; -EXPORT_SYMBOL(amd_nb_misc_ids); +EXPORT_SYMBOL_GPL(amd_nb_misc_ids); static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, @@ -34,6 +47,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, {} }; @@ -44,8 +58,25 @@ const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { { } }; -struct amd_northbridge_info amd_northbridges; -EXPORT_SYMBOL(amd_northbridges); +static struct amd_northbridge_info amd_northbridges; + +u16 amd_nb_num(void) +{ + return amd_northbridges.num; +} +EXPORT_SYMBOL_GPL(amd_nb_num); + +bool amd_nb_has_feature(unsigned int feature) +{ + return ((amd_northbridges.flags & feature) == feature); +} +EXPORT_SYMBOL_GPL(amd_nb_has_feature); + +struct amd_northbridge *node_to_amd_nb(int node) +{ + return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL; +} +EXPORT_SYMBOL_GPL(node_to_amd_nb); static struct pci_dev *next_northbridge(struct pci_dev *dev, const struct pci_device_id *ids) @@ -58,13 +89,106 @@ static struct pci_dev *next_northbridge(struct pci_dev *dev, return dev; } +static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write) +{ + struct pci_dev *root; + int err = -ENODEV; + + if (node >= amd_northbridges.num) + goto out; + + root = node_to_amd_nb(node)->root; + if (!root) + goto out; + + mutex_lock(&smn_mutex); + + err = pci_write_config_dword(root, 0x60, address); + if (err) { + pr_warn("Error programming SMN address 0x%x.\n", address); + goto out_unlock; + } + + err = (write ? pci_write_config_dword(root, 0x64, *value) + : pci_read_config_dword(root, 0x64, value)); + if (err) + pr_warn("Error %s SMN address 0x%x.\n", + (write ? "writing to" : "reading from"), address); + +out_unlock: + mutex_unlock(&smn_mutex); + +out: + return err; +} + +int amd_smn_read(u16 node, u32 address, u32 *value) +{ + return __amd_smn_rw(node, address, value, false); +} +EXPORT_SYMBOL_GPL(amd_smn_read); + +int amd_smn_write(u16 node, u32 address, u32 value) +{ + return __amd_smn_rw(node, address, &value, true); +} +EXPORT_SYMBOL_GPL(amd_smn_write); + +/* + * Data Fabric Indirect Access uses FICAA/FICAD. + * + * Fabric Indirect Configuration Access Address (FICAA): Constructed based + * on the device's Instance Id and the PCI function and register offset of + * the desired register. + * + * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO + * and FICAD HI registers but so far we only need the LO register. + */ +int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) +{ + struct pci_dev *F4; + u32 ficaa; + int err = -ENODEV; + + if (node >= amd_northbridges.num) + goto out; + + F4 = node_to_amd_nb(node)->link; + if (!F4) + goto out; + + ficaa = 1; + ficaa |= reg & 0x3FC; + ficaa |= (func & 0x7) << 11; + ficaa |= instance_id << 16; + + mutex_lock(&smn_mutex); + + err = pci_write_config_dword(F4, 0x5C, ficaa); + if (err) { + pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa); + goto out_unlock; + } + + err = pci_read_config_dword(F4, 0x98, lo); + if (err) + pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa); + +out_unlock: + mutex_unlock(&smn_mutex); + +out: + return err; +} +EXPORT_SYMBOL_GPL(amd_df_indirect_read); + int amd_cache_northbridges(void) { u16 i = 0; struct amd_northbridge *nb; - struct pci_dev *misc, *link; + struct pci_dev *root, *misc, *link; - if (amd_nb_num()) + if (amd_northbridges.num) return 0; misc = NULL; @@ -74,15 +198,17 @@ int amd_cache_northbridges(void) if (!i) return -ENODEV; - nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL); + nb = kcalloc(i, sizeof(struct amd_northbridge), GFP_KERNEL); if (!nb) return -ENOMEM; amd_northbridges.nb = nb; amd_northbridges.num = i; - link = misc = NULL; - for (i = 0; i != amd_nb_num(); i++) { + link = misc = root = NULL; + for (i = 0; i != amd_northbridges.num; i++) { + node_to_amd_nb(i)->root = root = + next_northbridge(root, amd_root_ids); node_to_amd_nb(i)->misc = misc = next_northbridge(misc, amd_nb_misc_ids); node_to_amd_nb(i)->link = link = @@ -139,13 +265,13 @@ struct resource *amd_get_mmconfig_range(struct resource *res) { u32 address; u64 base, msr; - unsigned segn_busn_bits; + unsigned int segn_busn_bits; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return NULL; /* assume all cpus from fam10h have mmconfig */ - if (boot_cpu_data.x86 < 0x10) + if (boot_cpu_data.x86 < 0x10) return NULL; address = MSR_FAM10H_MMIO_CONF_BASE; @@ -226,14 +352,14 @@ static void amd_cache_gart(void) if (!amd_nb_has_feature(AMD_NB_GART)) return; - flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); + flush_words = kmalloc_array(amd_northbridges.num, sizeof(u32), GFP_KERNEL); if (!flush_words) { amd_northbridges.flags &= ~AMD_NB_GART; pr_notice("Cannot initialize GART flush words, GART support disabled\n"); return; } - for (i = 0; i != amd_nb_num(); i++) + for (i = 0; i != amd_northbridges.num; i++) pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &flush_words[i]); } @@ -246,18 +372,20 @@ void amd_flush_garts(void) if (!amd_nb_has_feature(AMD_NB_GART)) return; - /* Avoid races between AGP and IOMMU. In theory it's not needed - but I'm not sure if the hardware won't lose flush requests - when another is pending. This whole thing is so expensive anyways - that it doesn't matter to serialize more. -AK */ + /* + * Avoid races between AGP and IOMMU. In theory it's not needed + * but I'm not sure if the hardware won't lose flush requests + * when another is pending. This whole thing is so expensive anyways + * that it doesn't matter to serialize more. -AK + */ spin_lock_irqsave(&gart_lock, flags); flushed = 0; - for (i = 0; i < amd_nb_num(); i++) { + for (i = 0; i < amd_northbridges.num; i++) { pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c, flush_words[i] | 1); flushed++; } - for (i = 0; i < amd_nb_num(); i++) { + for (i = 0; i < amd_northbridges.num; i++) { u32 w; /* Make sure the hardware actually executed the flush*/ for (;;) { diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 88c657b057e2..5b7e43eff139 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -48,7 +48,6 @@ #include <asm/io_apic.h> #include <asm/desc.h> #include <asm/hpet.h> -#include <asm/idle.h> #include <asm/mtrr.h> #include <asm/time.h> #include <asm/smp.h> @@ -894,11 +893,13 @@ void __init setup_boot_APIC_clock(void) /* Setup the lapic or request the broadcast */ setup_APIC_timer(); + amd_e400_c1e_apic_setup(); } void setup_secondary_APIC_clock(void) { setup_APIC_timer(); + amd_e400_c1e_apic_setup(); } /* @@ -2159,21 +2160,6 @@ int __generic_processor_info(int apicid, int version, bool enabled) } /* - * This can happen on physical hotplug. The sanity check at boot time - * is done from native_smp_prepare_cpus() after num_possible_cpus() is - * established. - */ - if (topology_update_package_map(apicid, cpu) < 0) { - int thiscpu = max + disabled_cpus; - - pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n", - thiscpu, apicid); - - disabled_cpus++; - return -ENOSPC; - } - - /* * Validate version */ if (version == 0x0) { @@ -2263,6 +2249,7 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { /* Should happen once for each apic */ WARN_ON((*drv)->eoi_write == eoi_write); + (*drv)->native_eoi_write = (*drv)->eoi_write; (*drv)->eoi_write = eoi_write; } } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 48e6d84f173e..945e512a112a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -48,7 +48,6 @@ #include <linux/bootmem.h> #include <asm/irqdomain.h> -#include <asm/idle.h> #include <asm/io.h> #include <asm/smp.h> #include <asm/cpu.h> diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index aeef53ce93e1..35690a168cf7 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -815,9 +815,9 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) l = li; } addr1 = (base << shift) + - f * (unsigned long)(1 << m_io); + f * (1ULL << m_io); addr2 = (base << shift) + - (l + 1) * (unsigned long)(1 << m_io); + (l + 1) * (1ULL << m_io); pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", id, fi, li, lnasid, addr1, addr2); if (max_io < l) diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index c7364bd633e1..643818a7688b 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -906,14 +906,14 @@ static int apm_cpu_idle(struct cpuidle_device *dev, static int use_apm_idle; /* = 0 */ static unsigned int last_jiffies; /* = 0 */ static unsigned int last_stime; /* = 0 */ - cputime_t stime; + cputime_t stime, utime; int apm_idle_done = 0; unsigned int jiffies_since_last_check = jiffies - last_jiffies; unsigned int bucket; recalc: - task_cputime(current, NULL, &stime); + task_cputime(current, &utime, &stime); if (jiffies_since_last_check > IDLE_CALC_LIMIT) { use_apm_idle = 0; } else if (jiffies_since_last_check > idle_period) { @@ -1042,8 +1042,11 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) if (apm_info.get_power_status_broken) return APM_32_UNSUPPORTED; - if (apm_bios_call(&call)) + if (apm_bios_call(&call)) { + if (!call.err) + return APM_NO_ERROR; return call.err; + } *status = call.ebx; *bat = call.ecx; if (apm_info.get_power_status_swabinminutes) { diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4a8697f7d4ef..33b63670bf09 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -20,13 +20,11 @@ obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += common.o obj-y += rdrand.o obj-y += match.o +obj-y += bugs.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -obj-$(CONFIG_X86_32) += bugs.o -obj-$(CONFIG_X86_64) += bugs_64.o - obj-$(CONFIG_CPU_SUP_INTEL) += intel.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index b81fe2d63e15..71cae73a5076 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -20,6 +20,10 @@ #include "cpu.h" +static const int amd_erratum_383[]; +static const int amd_erratum_400[]; +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); + /* * nodes_per_socket: Stores the number of nodes per socket. * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX @@ -314,11 +318,30 @@ static void amd_get_topology(struct cpuinfo_x86 *c) smp_num_siblings = ((ebx >> 8) & 3) + 1; c->x86_max_cores /= smp_num_siblings; c->cpu_core_id = ebx & 0xff; + + /* + * We may have multiple LLCs if L3 caches exist, so check if we + * have an L3 cache by looking at the L3 cache CPUID leaf. + */ + if (cpuid_edx(0x80000006)) { + if (c->x86 == 0x17) { + /* + * LLC is at the core complex level. + * Core complex id is ApicId[3]. + */ + per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; + } else { + /* LLC is at the node level. */ + per_cpu(cpu_llc_id, cpu) = node_id; + } + } } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); node_id = value & 7; + + per_cpu(cpu_llc_id, cpu) = node_id; } else return; @@ -329,9 +352,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_AMD_DCM); cus_per_node = c->x86_max_cores / nodes_per_socket; - /* store NodeID, use llc_shared_map to store sibling info */ - per_cpu(cpu_llc_id, cpu) = node_id; - /* core id has to be in the [0 .. cores_per_node - 1] range */ c->cpu_core_id %= cus_per_node; } @@ -347,7 +367,6 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) #ifdef CONFIG_SMP unsigned bits; int cpu = smp_processor_id(); - unsigned int socket_id, core_complex_id; bits = c->x86_coreid_bits; /* Low order bits define the core id (index of core in socket) */ @@ -357,18 +376,6 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; amd_get_topology(c); - - /* - * Fix percpu cpu_llc_id here as LLC topology is different - * for Fam17h systems. - */ - if (c->x86 != 0x17 || !cpuid_edx(0x80000006)) - return; - - socket_id = (c->apicid >> bits) - 1; - core_complex_id = (c->apicid & ((1 << bits) - 1)) >> 3; - - per_cpu(cpu_llc_id, cpu) = (socket_id << 3) | core_complex_id; #endif } @@ -589,11 +596,16 @@ static void early_init_amd(struct cpuinfo_x86 *c) /* F16h erratum 793, CVE-2013-6885 */ if (c->x86 == 0x16 && c->x86_model <= 0xf) msr_set_bit(MSR_AMD64_LS_CFG, 15); -} -static const int amd_erratum_383[]; -static const int amd_erratum_400[]; -static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); + /* + * Check whether the machine is affected by erratum 400. This is + * used to select the proper idle routine and to enable the check + * whether the machine is affected in arch_post_acpi_init(), which + * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check. + */ + if (cpu_has_amd_erratum(c, amd_erratum_400)) + set_cpu_bug(c, X86_BUG_AMD_E400); +} static void init_amd_k8(struct cpuinfo_x86 *c) { @@ -774,9 +786,6 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 > 0x11) set_cpu_cap(c, X86_FEATURE_ARAT); - if (cpu_has_amd_erratum(c, amd_erratum_400)) - set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); - rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); /* 3DNow or LM implies PREFETCHW */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bd17db15a2c1..a44ef52184df 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -16,15 +16,19 @@ #include <asm/msr.h> #include <asm/paravirt.h> #include <asm/alternative.h> +#include <asm/pgtable.h> +#include <asm/cacheflush.h> void __init check_bugs(void) { identify_boot_cpu(); -#ifndef CONFIG_SMP - pr_info("CPU: "); - print_cpu_info(&boot_cpu_data); -#endif + if (!IS_ENABLED(CONFIG_SMP)) { + pr_info("CPU: "); + print_cpu_info(&boot_cpu_data); + } + +#ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. * @@ -40,4 +44,18 @@ void __init check_bugs(void) alternative_instructions(); fpu__init_check_bugs(); +#else /* CONFIG_X86_64 */ + alternative_instructions(); + + /* + * Make sure the first 2MB area is not mapped by huge pages + * There are typically fixed size MTRRs in there and overlapping + * MTRRs into large pages causes slow downs. + * + * Right now we don't do that with gbpages because there seems + * very little benefit for that case. + */ + if (!direct_gbpages) + set_memory_4k((unsigned long)__va(0), 1); +#endif } diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c deleted file mode 100644 index a972ac4c7e7d..000000000000 --- a/arch/x86/kernel/cpu/bugs_64.c +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * Copyright (C) 2000 SuSE - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <asm/alternative.h> -#include <asm/bugs.h> -#include <asm/processor.h> -#include <asm/mtrr.h> -#include <asm/cacheflush.h> - -void __init check_bugs(void) -{ - identify_boot_cpu(); -#if !defined(CONFIG_SMP) - pr_info("CPU: "); - print_cpu_info(&boot_cpu_data); -#endif - alternative_instructions(); - - /* - * Make sure the first 2MB area is not mapped by huge pages - * There are typically fixed size MTRRs in there and overlapping - * MTRRs into large pages causes slow downs. - * - * Right now we don't do that with gbpages because there seems - * very little benefit for that case. - */ - if (!direct_gbpages) - set_memory_4k((unsigned long)__va(0), 1); -} diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9bd910a7dd0a..1f6b50a449ab 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -979,6 +979,27 @@ static void x86_init_cache_qos(struct cpuinfo_x86 *c) } /* + * Validate that ACPI/mptables have the same information about the + * effective APIC id and update the package map. + */ +static void validate_apic_and_package_id(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int apicid, cpu = smp_processor_id(); + + apicid = apic->cpu_present_to_apicid(cpu); + + if (apicid != c->apicid) { + pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n", + cpu, apicid, c->initial_apicid); + } + BUG_ON(topology_update_package_map(c->phys_proc_id, cpu)); +#else + c->logical_proc_id = 0; +#endif +} + +/* * This does the hard work of actually picking apart the CPU stuff... */ static void identify_cpu(struct cpuinfo_x86 *c) @@ -1103,8 +1124,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif - /* The boot/hotplug time assigment got cleared, restore it */ - c->logical_proc_id = topology_phys_to_logical_pkg(c->phys_proc_id); } /* @@ -1144,7 +1163,6 @@ void enable_sep_cpu(void) void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); - init_amd_e400_c1e_mask(); #ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); @@ -1160,53 +1178,9 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) enable_sep_cpu(); #endif mtrr_ap_init(); + validate_apic_and_package_id(c); } -struct msr_range { - unsigned min; - unsigned max; -}; - -static const struct msr_range msr_range_array[] = { - { 0x00000000, 0x00000418}, - { 0xc0000000, 0xc000040b}, - { 0xc0010000, 0xc0010142}, - { 0xc0011000, 0xc001103b}, -}; - -static void __print_cpu_msr(void) -{ - unsigned index_min, index_max; - unsigned index; - u64 val; - int i; - - for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { - index_min = msr_range_array[i].min; - index_max = msr_range_array[i].max; - - for (index = index_min; index < index_max; index++) { - if (rdmsrl_safe(index, &val)) - continue; - pr_info(" MSR%08x: %016llx\n", index, val); - } - } -} - -static int show_msr; - -static __init int setup_show_msr(char *arg) -{ - int num; - - get_option(&arg, &num); - - if (num > 0) - show_msr = num; - return 1; -} -__setup("show_msr=", setup_show_msr); - static __init int setup_noclflush(char *arg) { setup_clear_cpu_cap(X86_FEATURE_CLFLUSH); @@ -1240,14 +1214,6 @@ void print_cpu_info(struct cpuinfo_x86 *c) pr_cont(", stepping: 0x%x)\n", c->x86_mask); else pr_cont(")\n"); - - print_cpu_msr(c); -} - -void print_cpu_msr(struct cpuinfo_x86 *c) -{ - if (c->cpu_index < show_msr) - __print_cpu_msr(); } static __init int setup_disablecpuid(char *arg) @@ -1462,11 +1428,8 @@ void cpu_init(void) */ cr4_init_shadow(); - /* - * Load microcode on this cpu if a valid microcode is available. - * This is early microcode loading procedure. - */ - load_ucode_ap(); + if (cpu) + load_ucode_ap(); t = &per_cpu(cpu_tss, cpu); oist = &per_cpu(orig_ist, cpu); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index de6626c18e42..be6337156502 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -934,6 +934,8 @@ static int __populate_cache_leaves(unsigned int cpu) ci_leaf_init(this_leaf++, &id4_regs); __cache_cpumap_setup(cpu, idx, &id4_regs); } + this_cpu_ci->cpu_map_populated = true; + return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 631356c8cca4..c7efbcfbeda6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -311,7 +311,7 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e *msg = s->msg; s->covered = 1; if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { - if (panic_on_oops || tolerant < 1) + if (tolerant < 1) return MCE_PANIC_SEVERITY; } return s->sev; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a7fdf453d895..00ef43233e03 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -43,6 +43,7 @@ #include <linux/export.h> #include <linux/jump_label.h> +#include <asm/intel-family.h> #include <asm/processor.h> #include <asm/traps.h> #include <asm/tlbflush.h> @@ -135,6 +136,9 @@ void mce_setup(struct mce *m) m->socketid = cpu_data(m->extcpu).phys_proc_id; m->apicid = cpu_data(m->extcpu).initial_apicid; rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); + + if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) + rdmsrl(MSR_PPIN, m->ppin); } DEFINE_PER_CPU(struct mce, injectm); @@ -207,8 +211,12 @@ EXPORT_SYMBOL_GPL(mce_inject_log); static struct notifier_block mce_srao_nb; +static atomic_t num_notifiers; + void mce_register_decode_chain(struct notifier_block *nb) { + atomic_inc(&num_notifiers); + /* Ensure SRAO notifier has the highest priority in the decode chain. */ if (nb != &mce_srao_nb && nb->priority == INT_MAX) nb->priority -= 1; @@ -219,6 +227,8 @@ EXPORT_SYMBOL_GPL(mce_register_decode_chain); void mce_unregister_decode_chain(struct notifier_block *nb) { + atomic_dec(&num_notifiers); + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); @@ -270,17 +280,17 @@ struct mca_msr_regs msr_ops = { .misc = misc_reg }; -static void print_mce(struct mce *m) +static void __print_mce(struct mce *m) { - int ret = 0; - - pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", - m->extcpu, m->mcgstatus, m->bank, m->status); + pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", + m->extcpu, + (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""), + m->mcgstatus, m->bank, m->status); if (m->ip) { pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->ip); + m->cs, m->ip); if (m->cs == __KERNEL_CS) print_symbol("{%s}", m->ip); @@ -308,6 +318,13 @@ static void print_mce(struct mce *m) pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, cpu_data(m->extcpu).microcode); +} + +static void print_mce(struct mce *m) +{ + int ret = 0; + + __print_mce(m); /* * Print out human-readable details about the MCE error, @@ -499,7 +516,7 @@ int mce_available(struct cpuinfo_x86 *c) static void mce_schedule_work(void) { - if (!mce_gen_pool_empty() && keventd_up()) + if (!mce_gen_pool_empty()) schedule_work(&mce_work); } @@ -569,6 +586,32 @@ static struct notifier_block mce_srao_nb = { .priority = INT_MAX, }; +static int mce_default_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + /* + * Run the default notifier if we have only the SRAO + * notifier and us registered. + */ + if (atomic_read(&num_notifiers) > 2) + return NOTIFY_DONE; + + __print_mce(m); + + return NOTIFY_DONE; +} + +static struct notifier_block mce_default_nb = { + .notifier_call = mce_default_notifier, + /* lowest prio, we want it to run last. */ + .priority = 0, +}; + /* * Read ADDR and MISC registers. */ @@ -667,6 +710,15 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_gather_info(&m, NULL); + /* + * m.tsc was set in mce_setup(). Clear it if not requested. + * + * FIXME: Propagate @flags to mce_gather_info/mce_setup() to avoid + * that dance. + */ + if (!(flags & MCP_TIMESTAMP)) + m.tsc = 0; + for (i = 0; i < mca_cfg.banks; i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; @@ -674,14 +726,12 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.misc = 0; m.addr = 0; m.bank = i; - m.tsc = 0; barrier(); m.status = mce_rdmsrl(msr_ops.status(i)); if (!(m.status & MCI_STATUS_VAL)) continue; - /* * Uncorrected or signalled events are handled by the exception * handler when it is enabled, so don't process those here. @@ -696,9 +746,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_read_aux(&m, i); - if (!(flags & MCP_TIMESTAMP)) - m.tsc = 0; - severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) @@ -1355,7 +1402,7 @@ static void mce_timer_fn(unsigned long data) iv = __this_cpu_read(mce_next_interval); if (mce_available(this_cpu_ptr(&cpu_info))) { - machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks)); + machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); if (mce_intel_cmci_poll()) { iv = mce_adjust_timer(iv); @@ -1745,6 +1792,14 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t) add_timer_on(t, cpu); } +static void __mcheck_cpu_setup_timer(void) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + unsigned int cpu = smp_processor_id(); + + setup_pinned_timer(t, mce_timer_fn, cpu); +} + static void __mcheck_cpu_init_timer(void) { struct timer_list *t = this_cpu_ptr(&mce_timer); @@ -1796,7 +1851,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_clear_banks(); - __mcheck_cpu_init_timer(); + __mcheck_cpu_setup_timer(); } /* @@ -2138,6 +2193,7 @@ int __init mcheck_init(void) { mcheck_intel_therm_init(); mce_register_decode_chain(&mce_srao_nb); + mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); INIT_WORK(&mce_work, mce_process_work); @@ -2255,8 +2311,6 @@ static struct bus_type mce_subsys = { DEFINE_PER_CPU(struct device *, mce_device); -void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); - static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) { return container_of(attr, struct mce_bank, attr); @@ -2409,6 +2463,10 @@ static int mce_device_create(unsigned int cpu) if (!mce_available(&boot_cpu_data)) return -EIO; + dev = per_cpu(mce_device, cpu); + if (dev) + return 0; + dev = kzalloc(sizeof *dev, GFP_KERNEL); if (!dev) return -ENOMEM; @@ -2468,28 +2526,25 @@ static void mce_device_remove(unsigned int cpu) } /* Make sure there are no machine checks on offlined CPUs. */ -static void mce_disable_cpu(void *h) +static void mce_disable_cpu(void) { - unsigned long action = *(unsigned long *)h; - if (!mce_available(raw_cpu_ptr(&cpu_info))) return; - if (!(action & CPU_TASKS_FROZEN)) + if (!cpuhp_tasks_frozen) cmci_clear(); vendor_disable_error_reporting(); } -static void mce_reenable_cpu(void *h) +static void mce_reenable_cpu(void) { - unsigned long action = *(unsigned long *)h; int i; if (!mce_available(raw_cpu_ptr(&cpu_info))) return; - if (!(action & CPU_TASKS_FROZEN)) + if (!cpuhp_tasks_frozen) cmci_reenable(); for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; @@ -2499,45 +2554,43 @@ static void mce_reenable_cpu(void *h) } } -/* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int -mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +static int mce_cpu_dead(unsigned int cpu) +{ + mce_intel_hcpu_update(cpu); + + /* intentionally ignoring frozen here */ + if (!cpuhp_tasks_frozen) + cmci_rediscover(); + return 0; +} + +static int mce_cpu_online(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; struct timer_list *t = &per_cpu(mce_timer, cpu); + int ret; - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - mce_device_create(cpu); - if (threshold_cpu_callback) - threshold_cpu_callback(action, cpu); - break; - case CPU_DEAD: - if (threshold_cpu_callback) - threshold_cpu_callback(action, cpu); - mce_device_remove(cpu); - mce_intel_hcpu_update(cpu); + mce_device_create(cpu); - /* intentionally ignoring frozen here */ - if (!(action & CPU_TASKS_FROZEN)) - cmci_rediscover(); - break; - case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, mce_disable_cpu, &action, 1); - del_timer_sync(t); - break; - case CPU_DOWN_FAILED: - smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); - mce_start_timer(cpu, t); - break; + ret = mce_threshold_create_device(cpu); + if (ret) { + mce_device_remove(cpu); + return ret; } - - return NOTIFY_OK; + mce_reenable_cpu(); + mce_start_timer(cpu, t); + return 0; } -static struct notifier_block mce_cpu_notifier = { - .notifier_call = mce_cpu_callback, -}; +static int mce_cpu_pre_down(unsigned int cpu) +{ + struct timer_list *t = &per_cpu(mce_timer, cpu); + + mce_disable_cpu(); + del_timer_sync(t); + mce_threshold_remove_device(cpu); + mce_device_remove(cpu); + return 0; +} static __init void mce_init_banks(void) { @@ -2559,8 +2612,8 @@ static __init void mce_init_banks(void) static __init int mcheck_init_device(void) { + enum cpuhp_state hp_online; int err; - int i = 0; if (!mce_available(&boot_cpu_data)) { err = -EIO; @@ -2578,23 +2631,16 @@ static __init int mcheck_init_device(void) if (err) goto err_out_mem; - cpu_notifier_register_begin(); - for_each_online_cpu(i) { - err = mce_device_create(i); - if (err) { - /* - * Register notifier anyway (and do not unreg it) so - * that we don't leave undeleted timers, see notifier - * callback above. - */ - __register_hotcpu_notifier(&mce_cpu_notifier); - cpu_notifier_register_done(); - goto err_device_create; - } - } + err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL, + mce_cpu_dead); + if (err) + goto err_out_mem; - __register_hotcpu_notifier(&mce_cpu_notifier); - cpu_notifier_register_done(); + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online", + mce_cpu_online, mce_cpu_pre_down); + if (err < 0) + goto err_out_online; + hp_online = err; register_syscore_ops(&mce_syscore_ops); @@ -2607,16 +2653,10 @@ static __init int mcheck_init_device(void) err_register: unregister_syscore_ops(&mce_syscore_ops); + cpuhp_remove_state(hp_online); -err_device_create: - /* - * We didn't keep track of which devices were created above, but - * even if we had, the set of online cpus might have changed. - * Play safe and remove for every possible cpu, since - * mce_device_remove() will do the right thing. - */ - for_each_possible_cpu(i) - mce_device_remove(i); +err_out_online: + cpuhp_remove_state(CPUHP_X86_MCE_DEAD); err_out_mem: free_cpumask_var(mce_device_initialized); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9b5403462936..ffacfdcacb85 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -24,7 +24,6 @@ #include <asm/amd_nb.h> #include <asm/apic.h> -#include <asm/idle.h> #include <asm/mce.h> #include <asm/msr.h> #include <asm/trace/irq_vectors.h> @@ -55,6 +54,8 @@ /* Threshold LVT offset is at MSR0xC0000410[15:12] */ #define SMCA_THR_LVT_OFF 0xF000 +static bool thresholding_en; + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -69,7 +70,12 @@ static const char * const smca_umc_block_names[] = { "misc_umc" }; -struct smca_bank_name smca_bank_names[] = { +struct smca_bank_name { + const char *name; /* Short name for sysfs */ + const char *long_name; /* Long name for pretty-printing */ +}; + +static struct smca_bank_name smca_names[] = { [SMCA_LS] = { "load_store", "Load Store Unit" }, [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, @@ -84,9 +90,25 @@ struct smca_bank_name smca_bank_names[] = { [SMCA_PSP] = { "psp", "Platform Security Processor" }, [SMCA_SMU] = { "smu", "System Management Unit" }, }; -EXPORT_SYMBOL_GPL(smca_bank_names); -static struct smca_hwid_mcatype smca_hwid_mcatypes[] = { +const char *smca_get_name(enum smca_bank_types t) +{ + if (t >= N_SMCA_BANK_TYPES) + return NULL; + + return smca_names[t].name; +} + +const char *smca_get_long_name(enum smca_bank_types t) +{ + if (t >= N_SMCA_BANK_TYPES) + return NULL; + + return smca_names[t].long_name; +} +EXPORT_SYMBOL_GPL(smca_get_long_name); + +static struct smca_hwid smca_hwid_mcatypes[] = { /* { bank_type, hwid_mcatype, xec_bitmap } */ /* ZN Core (HWID=0xB0) MCA types */ @@ -116,7 +138,7 @@ static struct smca_hwid_mcatype smca_hwid_mcatypes[] = { { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, }; -struct smca_bank_info smca_banks[MAX_NR_BANKS]; +struct smca_bank smca_banks[MAX_NR_BANKS]; EXPORT_SYMBOL_GPL(smca_banks); /* @@ -142,35 +164,34 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; -/* - * CPU Initialization - */ - static void get_smca_bank_info(unsigned int bank) { unsigned int i, hwid_mcatype, cpu = smp_processor_id(); - struct smca_hwid_mcatype *type; - u32 high, instanceId; - u16 hwid, mcatype; + struct smca_hwid *s_hwid; + u32 high, instance_id; /* Collect bank_info using CPU 0 for now. */ if (cpu) return; - if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) { + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; } - hwid = high & MCI_IPID_HWID; - mcatype = (high & MCI_IPID_MCATYPE) >> 16; - hwid_mcatype = HWID_MCATYPE(hwid, mcatype); + hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID, + (high & MCI_IPID_MCATYPE) >> 16); for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { - type = &smca_hwid_mcatypes[i]; - if (hwid_mcatype == type->hwid_mcatype) { - smca_banks[bank].type = type; - smca_banks[bank].type_instance = instanceId; + s_hwid = &smca_hwid_mcatypes[i]; + if (hwid_mcatype == s_hwid->hwid_mcatype) { + + WARN(smca_banks[bank].hwid, + "Bank %s already initialized!\n", + smca_get_name(s_hwid->bank_type)); + + smca_banks[bank].hwid = s_hwid; + smca_banks[bank].id = instance_id; break; } } @@ -533,6 +554,206 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } +int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) +{ + u64 dram_base_addr, dram_limit_addr, dram_hole_base; + /* We start from the normalized address */ + u64 ret_addr = norm_addr; + + u32 tmp; + + u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask; + u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets; + u8 intlv_addr_sel, intlv_addr_bit; + u8 num_intlv_bits, hashed_bit; + u8 lgcy_mmio_hole_en, base = 0; + u8 cs_mask, cs_id = 0; + bool hash_enabled = false; + + /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */ + if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp)) + goto out_err; + + /* Remove HiAddrOffset from normalized address, if enabled: */ + if (tmp & BIT(0)) { + u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8; + + if (norm_addr >= hi_addr_offset) { + ret_addr -= hi_addr_offset; + base = 1; + } + } + + /* Read D18F0x110 (DramBaseAddress). */ + if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp)) + goto out_err; + + /* Check if address range is valid. */ + if (!(tmp & BIT(0))) { + pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n", + __func__, tmp); + goto out_err; + } + + lgcy_mmio_hole_en = tmp & BIT(1); + intlv_num_chan = (tmp >> 4) & 0xF; + intlv_addr_sel = (tmp >> 8) & 0x7; + dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16; + + /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */ + if (intlv_addr_sel > 3) { + pr_err("%s: Invalid interleave address select %d.\n", + __func__, intlv_addr_sel); + goto out_err; + } + + /* Read D18F0x114 (DramLimitAddress). */ + if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp)) + goto out_err; + + intlv_num_sockets = (tmp >> 8) & 0x1; + intlv_num_dies = (tmp >> 10) & 0x3; + dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0); + + intlv_addr_bit = intlv_addr_sel + 8; + + /* Re-use intlv_num_chan by setting it equal to log2(#channels) */ + switch (intlv_num_chan) { + case 0: intlv_num_chan = 0; break; + case 1: intlv_num_chan = 1; break; + case 3: intlv_num_chan = 2; break; + case 5: intlv_num_chan = 3; break; + case 7: intlv_num_chan = 4; break; + + case 8: intlv_num_chan = 1; + hash_enabled = true; + break; + default: + pr_err("%s: Invalid number of interleaved channels %d.\n", + __func__, intlv_num_chan); + goto out_err; + } + + num_intlv_bits = intlv_num_chan; + + if (intlv_num_dies > 2) { + pr_err("%s: Invalid number of interleaved nodes/dies %d.\n", + __func__, intlv_num_dies); + goto out_err; + } + + num_intlv_bits += intlv_num_dies; + + /* Add a bit if sockets are interleaved. */ + num_intlv_bits += intlv_num_sockets; + + /* Assert num_intlv_bits <= 4 */ + if (num_intlv_bits > 4) { + pr_err("%s: Invalid interleave bits %d.\n", + __func__, num_intlv_bits); + goto out_err; + } + + if (num_intlv_bits > 0) { + u64 temp_addr_x, temp_addr_i, temp_addr_y; + u8 die_id_bit, sock_id_bit, cs_fabric_id; + + /* + * Read FabricBlockInstanceInformation3_CS[BlockFabricID]. + * This is the fabric id for this coherent slave. Use + * umc/channel# as instance id of the coherent slave + * for FICAA. + */ + if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp)) + goto out_err; + + cs_fabric_id = (tmp >> 8) & 0xFF; + die_id_bit = 0; + + /* If interleaved over more than 1 channel: */ + if (intlv_num_chan) { + die_id_bit = intlv_num_chan; + cs_mask = (1 << die_id_bit) - 1; + cs_id = cs_fabric_id & cs_mask; + } + + sock_id_bit = die_id_bit; + + /* Read D18F1x208 (SystemFabricIdMask). */ + if (intlv_num_dies || intlv_num_sockets) + if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp)) + goto out_err; + + /* If interleaved over more than 1 die. */ + if (intlv_num_dies) { + sock_id_bit = die_id_bit + intlv_num_dies; + die_id_shift = (tmp >> 24) & 0xF; + die_id_mask = (tmp >> 8) & 0xFF; + + cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit; + } + + /* If interleaved over more than 1 socket. */ + if (intlv_num_sockets) { + socket_id_shift = (tmp >> 28) & 0xF; + socket_id_mask = (tmp >> 16) & 0xFF; + + cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit; + } + + /* + * The pre-interleaved address consists of XXXXXXIIIYYYYY + * where III is the ID for this CS, and XXXXXXYYYYY are the + * address bits from the post-interleaved address. + * "num_intlv_bits" has been calculated to tell us how many "I" + * bits there are. "intlv_addr_bit" tells us how many "Y" bits + * there are (where "I" starts). + */ + temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0); + temp_addr_i = (cs_id << intlv_addr_bit); + temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits; + ret_addr = temp_addr_x | temp_addr_i | temp_addr_y; + } + + /* Add dram base address */ + ret_addr += dram_base_addr; + + /* If legacy MMIO hole enabled */ + if (lgcy_mmio_hole_en) { + if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp)) + goto out_err; + + dram_hole_base = tmp & GENMASK(31, 24); + if (ret_addr >= dram_hole_base) + ret_addr += (BIT_ULL(32) - dram_hole_base); + } + + if (hash_enabled) { + /* Save some parentheses and grab ls-bit at the end. */ + hashed_bit = (ret_addr >> 12) ^ + (ret_addr >> 18) ^ + (ret_addr >> 21) ^ + (ret_addr >> 30) ^ + cs_id; + + hashed_bit &= BIT(0); + + if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0))) + ret_addr ^= BIT(intlv_addr_bit); + } + + /* Is calculated system address is above DRAM limit address? */ + if (ret_addr > dram_limit_addr) + goto out_err; + + *sys_addr = ret_addr; + return 0; + +out_err: + return -EINVAL; +} +EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); + static void __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) { @@ -645,6 +866,7 @@ static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; unsigned int bank, block, cpu = smp_processor_id(); + struct thresh_restart tr; /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { @@ -681,6 +903,11 @@ static void amd_threshold_interrupt(void) log: __log_error(bank, false, true, ((u64)high << 32) | low); + + /* Reset threshold block after logging error. */ + memset(&tr, 0, sizeof(tr)); + tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block]; + threshold_restart_bank(&tr); } /* @@ -826,10 +1053,10 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return th_names[bank]; } - if (!smca_banks[bank].type) + if (!smca_banks[bank].hwid) return NULL; - bank_type = smca_banks[bank].type->bank_type; + bank_type = smca_banks[bank].hwid->bank_type; if (b && bank_type == SMCA_UMC) { if (b->block < ARRAY_SIZE(smca_umc_block_names)) @@ -838,8 +1065,8 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) } snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, - "%s_%x", smca_bank_names[bank_type].name, - smca_banks[bank].type_instance); + "%s_%x", smca_get_name(bank_type), + smca_banks[bank].id); return buf_mcatype; } @@ -1010,31 +1237,6 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) return err; } -/* create dir/files for all valid threshold banks */ -static int threshold_create_device(unsigned int cpu) -{ - unsigned int bank; - struct threshold_bank **bp; - int err = 0; - - bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks, - GFP_KERNEL); - if (!bp) - return -ENOMEM; - - per_cpu(threshold_banks, cpu) = bp; - - for (bank = 0; bank < mca_cfg.banks; ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) - continue; - err = threshold_create_bank(cpu, bank); - if (err) - return err; - } - - return err; -} - static void deallocate_threshold_block(unsigned int cpu, unsigned int bank) { @@ -1102,48 +1304,71 @@ free_out: per_cpu(threshold_banks, cpu)[bank] = NULL; } -static void threshold_remove_device(unsigned int cpu) +int mce_threshold_remove_device(unsigned int cpu) { unsigned int bank; + if (!thresholding_en) + return 0; + for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; threshold_remove_bank(cpu, bank); } kfree(per_cpu(threshold_banks, cpu)); + per_cpu(threshold_banks, cpu) = NULL; + return 0; } -/* get notified when a cpu comes on/off */ -static void -amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) +/* create dir/files for all valid threshold banks */ +int mce_threshold_create_device(unsigned int cpu) { - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - threshold_create_device(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - threshold_remove_device(cpu); - break; - default: - break; + unsigned int bank; + struct threshold_bank **bp; + int err = 0; + + if (!thresholding_en) + return 0; + + bp = per_cpu(threshold_banks, cpu); + if (bp) + return 0; + + bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks, + GFP_KERNEL); + if (!bp) + return -ENOMEM; + + per_cpu(threshold_banks, cpu) = bp; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (!(per_cpu(bank_map, cpu) & (1 << bank))) + continue; + err = threshold_create_bank(cpu, bank); + if (err) + goto err; } + return err; +err: + mce_threshold_remove_device(cpu); + return err; } static __init int threshold_init_device(void) { unsigned lcpu = 0; + if (mce_threshold_vector == amd_threshold_interrupt) + thresholding_en = true; + /* to hit CPUs online before the notifier is up */ for_each_online_cpu(lcpu) { - int err = threshold_create_device(lcpu); + int err = mce_threshold_create_device(lcpu); if (err) return err; } - threshold_cpu_callback = amd_64_threshold_cpu_callback; return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 1defb8ea882c..190b3e6cef4d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -11,6 +11,8 @@ #include <linux/sched.h> #include <linux/cpumask.h> #include <asm/apic.h> +#include <asm/cpufeature.h> +#include <asm/intel-family.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> @@ -130,7 +132,7 @@ bool mce_intel_cmci_poll(void) * Reset the counter if we've logged an error in the last poll * during the storm. */ - if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned))) + if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned))) this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); else this_cpu_dec(cmci_backoff_cnt); @@ -342,7 +344,7 @@ void cmci_recheck(void) return; local_irq_save(flags); - machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); + machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)); local_irq_restore(flags); } @@ -464,11 +466,46 @@ static void intel_clear_lmce(void) wrmsrl(MSR_IA32_MCG_EXT_CTL, val); } +static void intel_ppin_init(struct cpuinfo_x86 *c) +{ + unsigned long long val; + + /* + * Even if testing the presence of the MSR would be enough, we don't + * want to risk the situation where other models reuse this MSR for + * other purposes. + */ + switch (c->x86_model) { + case INTEL_FAM6_IVYBRIDGE_X: + case INTEL_FAM6_HASWELL_X: + case INTEL_FAM6_BROADWELL_XEON_D: + case INTEL_FAM6_BROADWELL_X: + case INTEL_FAM6_SKYLAKE_X: + if (rdmsrl_safe(MSR_PPIN_CTL, &val)) + return; + + if ((val & 3UL) == 1UL) { + /* PPIN available but disabled: */ + return; + } + + /* If PPIN is disabled, but not locked, try to enable: */ + if (!(val & 3UL)) { + wrmsrl_safe(MSR_PPIN_CTL, val | 2UL); + rdmsrl_safe(MSR_PPIN_CTL, &val); + } + + if ((val & 3UL) == 2UL) + set_cpu_cap(c, X86_FEATURE_INTEL_PPIN); + } +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); intel_init_cmci(); intel_init_lmce(); + intel_ppin_init(c); } void mce_intel_feature_clear(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6b9dc4d18ccc..465aca8be009 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -26,7 +26,6 @@ #include <asm/processor.h> #include <asm/apic.h> -#include <asm/idle.h> #include <asm/mce.h> #include <asm/msr.h> #include <asm/trace/irq_vectors.h> @@ -271,58 +270,32 @@ static void thermal_throttle_remove_dev(struct device *dev) } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int -thermal_throttle_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int thermal_throttle_online(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; - struct device *dev; - int err = 0; - - dev = get_cpu_device(cpu); - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - err = thermal_throttle_add_dev(dev, cpu); - WARN_ON(err); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - thermal_throttle_remove_dev(dev); - break; - } - return notifier_from_errno(err); + struct device *dev = get_cpu_device(cpu); + + return thermal_throttle_add_dev(dev, cpu); } -static struct notifier_block thermal_throttle_cpu_notifier = +static int thermal_throttle_offline(unsigned int cpu) { - .notifier_call = thermal_throttle_cpu_callback, -}; + struct device *dev = get_cpu_device(cpu); + + thermal_throttle_remove_dev(dev); + return 0; +} static __init int thermal_throttle_init_device(void) { - unsigned int cpu = 0; - int err; + int ret; if (!atomic_read(&therm_throt_en)) return 0; - cpu_notifier_register_begin(); - - /* connect live CPUs to sysfs */ - for_each_online_cpu(cpu) { - err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu); - WARN_ON(err); - } - - __register_hotcpu_notifier(&thermal_throttle_cpu_notifier); - cpu_notifier_register_done(); - - return 0; + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online", + thermal_throttle_online, + thermal_throttle_offline); + return ret < 0 ? ret : 0; } device_initcall(thermal_throttle_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index fcf9ae9384f4..9beb092d68a5 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -6,7 +6,6 @@ #include <asm/irq_vectors.h> #include <asm/apic.h> -#include <asm/idle.h> #include <asm/mce.h> #include <asm/trace/irq_vectors.h> diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile index 220b1a508513..ba12e8aa4a45 100644 --- a/arch/x86/kernel/cpu/microcode/Makefile +++ b/arch/x86/kernel/cpu/microcode/Makefile @@ -1,4 +1,4 @@ microcode-y := core.o obj-$(CONFIG_MICROCODE) += microcode.o -microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o +microcode-$(CONFIG_MICROCODE_INTEL) += intel.o microcode-$(CONFIG_MICROCODE_AMD) += amd.o diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 620ab06bcf45..6f353bdb3a25 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -5,6 +5,7 @@ * CPUs and later. * * Copyright (C) 2008-2011 Advanced Micro Devices Inc. + * 2013-2016 Borislav Petkov <bp@alien8.de> * * Author: Peter Oruba <peter.oruba@amd.com> * @@ -39,64 +40,25 @@ static struct equiv_cpu_entry *equiv_cpu_table; -struct ucode_patch { - struct list_head plist; - void *data; - u32 patch_id; - u16 equiv_cpu; -}; - -static LIST_HEAD(pcache); - /* * This points to the current valid container of microcode patches which we will - * save from the initrd before jettisoning its contents. + * save from the initrd/builtin before jettisoning its contents. */ -static u8 *container; -static size_t container_size; -static bool ucode_builtin; +struct container { + u8 *data; + size_t size; +} cont; static u32 ucode_new_rev; static u8 amd_ucode_patch[PATCH_MAX_SIZE]; static u16 this_equiv_id; -static struct cpio_data ucode_cpio; - -static struct cpio_data __init find_ucode_in_initrd(void) -{ -#ifdef CONFIG_BLK_DEV_INITRD - char *path; - void *start; - size_t size; - - /* - * Microcode patch container file is prepended to the initrd in cpio - * format. See Documentation/x86/early-microcode.txt - */ - static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; - -#ifdef CONFIG_X86_32 - struct boot_params *p; - - /* - * On 32-bit, early load occurs before paging is turned on so we need - * to use physical addresses. - */ - p = (struct boot_params *)__pa_nodebug(&boot_params); - path = (char *)__pa_nodebug(ucode_path); - start = (void *)p->hdr.ramdisk_image; - size = p->hdr.ramdisk_size; -#else - path = ucode_path; - start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); - size = boot_params.hdr.ramdisk_size; -#endif /* !CONFIG_X86_32 */ - - return find_cpio_data(path, start, size, NULL); -#else - return (struct cpio_data){ NULL, 0, "" }; -#endif -} +/* + * Microcode patch container file is prepended to the initrd in cpio + * format. See Documentation/x86/early-microcode.txt + */ +static const char +ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; static size_t compute_container_size(u8 *data, u32 total_size) { @@ -135,48 +97,48 @@ static size_t compute_container_size(u8 *data, u32 total_size) return size; } +static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, + unsigned int sig) +{ + int i = 0; + + if (!equiv_cpu_table) + return 0; + + while (equiv_cpu_table[i].installed_cpu != 0) { + if (sig == equiv_cpu_table[i].installed_cpu) + return equiv_cpu_table[i].equiv_cpu; + + i++; + } + return 0; +} + /* - * Early load occurs before we can vmalloc(). So we look for the microcode - * patch container file in initrd, traverse equivalent cpu table, look for a - * matching microcode patch, and update, all in initrd memory in place. - * When vmalloc() is available for use later -- on 64-bit during first AP load, - * and on 32-bit during save_microcode_in_initrd_amd() -- we can call - * load_microcode_amd() to save equivalent cpu table and microcode patches in - * kernel heap memory. + * This scans the ucode blob for the proper container as we can have multiple + * containers glued together. */ -static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) +static struct container +find_proper_container(u8 *ucode, size_t size, u16 *ret_id) { + struct container ret = { NULL, 0 }; + u32 eax, ebx, ecx, edx; struct equiv_cpu_entry *eq; - size_t *cont_sz; - u32 *header; - u8 *data, **cont; - u8 (*patch)[PATCH_MAX_SIZE]; - u16 eq_id = 0; int offset, left; - u32 rev, eax, ebx, ecx, edx; - u32 *new_rev; - -#ifdef CONFIG_X86_32 - new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); - cont_sz = (size_t *)__pa_nodebug(&container_size); - cont = (u8 **)__pa_nodebug(&container); - patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); -#else - new_rev = &ucode_new_rev; - cont_sz = &container_size; - cont = &container; - patch = &amd_ucode_patch; -#endif + u16 eq_id = 0; + u32 *header; + u8 *data; data = ucode; left = size; header = (u32 *)data; + /* find equiv cpu table */ if (header[0] != UCODE_MAGIC || header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ header[2] == 0) /* size */ - return; + return ret; eax = 0x00000001; ecx = 0; @@ -185,7 +147,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) while (left > 0) { eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); - *cont = data; + ret.data = data; /* Advance past the container header */ offset = header[2] + CONTAINER_HDR_SZ; @@ -194,15 +156,15 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) eq_id = find_equiv_id(eq, eax); if (eq_id) { - this_equiv_id = eq_id; - *cont_sz = compute_container_size(*cont, left + offset); + ret.size = compute_container_size(ret.data, left + offset); /* * truncate how much we need to iterate over in the * ucode update loop below */ - left = *cont_sz - offset; - break; + left = ret.size - offset; + *ret_id = eq_id; + return ret; } /* @@ -212,6 +174,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) */ while (left > 0) { header = (u32 *)data; + if (header[0] == UCODE_MAGIC && header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) break; @@ -226,14 +189,64 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) ucode = data; } - if (!eq_id) { - *cont = NULL; - *cont_sz = 0; - return; - } + return ret; +} + +static int __apply_microcode_amd(struct microcode_amd *mc_amd) +{ + u32 rev, dummy; + + native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); + + /* verify patch application was successful */ + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (rev != mc_amd->hdr.patch_id) + return -1; + + return 0; +} + +/* + * Early load occurs before we can vmalloc(). So we look for the microcode + * patch container file in initrd, traverse equivalent cpu table, look for a + * matching microcode patch, and update, all in initrd memory in place. + * When vmalloc() is available for use later -- on 64-bit during first AP load, + * and on 32-bit during save_microcode_in_initrd_amd() -- we can call + * load_microcode_amd() to save equivalent cpu table and microcode patches in + * kernel heap memory. + */ +static struct container +apply_microcode_early_amd(void *ucode, size_t size, bool save_patch) +{ + struct container ret = { NULL, 0 }; + u8 (*patch)[PATCH_MAX_SIZE]; + int offset, left; + u32 rev, *header; + u8 *data; + u16 eq_id = 0; + u32 *new_rev; + +#ifdef CONFIG_X86_32 + new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); + patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); +#else + new_rev = &ucode_new_rev; + patch = &amd_ucode_patch; +#endif if (check_current_patch_level(&rev, true)) - return; + return (struct container){ NULL, 0 }; + + ret = find_proper_container(ucode, size, &eq_id); + if (!eq_id) + return (struct container){ NULL, 0 }; + + this_equiv_id = eq_id; + header = (u32 *)ret.data; + + /* We're pointing to an equiv table, skip over it. */ + data = ret.data + header[2] + CONTAINER_HDR_SZ; + left = ret.size - (header[2] + CONTAINER_HDR_SZ); while (left > 0) { struct microcode_amd *mc; @@ -252,8 +265,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) *new_rev = rev; if (save_patch) - memcpy(patch, mc, - min_t(u32, header[1], PATCH_MAX_SIZE)); + memcpy(patch, mc, min_t(u32, header[1], PATCH_MAX_SIZE)); } } @@ -261,10 +273,10 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) data += offset; left -= offset; } + return ret; } -static bool __init load_builtin_amd_microcode(struct cpio_data *cp, - unsigned int family) +static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) { #ifdef CONFIG_X86_64 char fw_name[36] = "amd-ucode/microcode_amd.bin"; @@ -281,47 +293,45 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp, void __init load_ucode_amd_bsp(unsigned int family) { + struct ucode_cpu_info *uci; struct cpio_data cp; - bool *builtin; - void **data; - size_t *size; + const char *path; + bool use_pa; -#ifdef CONFIG_X86_32 - data = (void **)__pa_nodebug(&ucode_cpio.data); - size = (size_t *)__pa_nodebug(&ucode_cpio.size); - builtin = (bool *)__pa_nodebug(&ucode_builtin); -#else - data = &ucode_cpio.data; - size = &ucode_cpio.size; - builtin = &ucode_builtin; -#endif + if (IS_ENABLED(CONFIG_X86_32)) { + uci = (struct ucode_cpu_info *)__pa_nodebug(ucode_cpu_info); + path = (const char *)__pa_nodebug(ucode_path); + use_pa = true; + } else { + uci = ucode_cpu_info; + path = ucode_path; + use_pa = false; + } - *builtin = load_builtin_amd_microcode(&cp, family); - if (!*builtin) - cp = find_ucode_in_initrd(); + if (!get_builtin_microcode(&cp, family)) + cp = find_microcode_in_initrd(path, use_pa); if (!(cp.data && cp.size)) return; - *data = cp.data; - *size = cp.size; + /* Get BSP's CPUID.EAX(1), needed in load_microcode_amd() */ + uci->cpu_sig.sig = cpuid_eax(1); - apply_ucode_in_initrd(cp.data, cp.size, true); + apply_microcode_early_amd(cp.data, cp.size, true); } #ifdef CONFIG_X86_32 /* * On 32-bit, since AP's early load occurs before paging is turned on, we - * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during - * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During - * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, + * cannot traverse cpu_equiv_table and microcode_cache in kernel heap memory. + * So during cold boot, AP will apply_ucode_in_initrd() just like the BSP. + * In save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, * which is used upon resume from suspend. */ -void load_ucode_amd_ap(void) +void load_ucode_amd_ap(unsigned int family) { struct microcode_amd *mc; - size_t *usize; - void **ucode; + struct cpio_data cp; mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { @@ -329,60 +339,63 @@ void load_ucode_amd_ap(void) return; } - ucode = (void *)__pa_nodebug(&container); - usize = (size_t *)__pa_nodebug(&container_size); + if (!get_builtin_microcode(&cp, family)) + cp = find_microcode_in_initrd((const char *)__pa_nodebug(ucode_path), true); - if (!*ucode || !*usize) + if (!(cp.data && cp.size)) return; - apply_ucode_in_initrd(*ucode, *usize, false); -} - -static void __init collect_cpu_sig_on_bsp(void *arg) -{ - unsigned int cpu = smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - uci->cpu_sig.sig = cpuid_eax(0x00000001); -} - -static void __init get_bsp_sig(void) -{ - unsigned int bsp = boot_cpu_data.cpu_index; - struct ucode_cpu_info *uci = ucode_cpu_info + bsp; - - if (!uci->cpu_sig.sig) - smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); + /* + * This would set amd_ucode_patch above so that the following APs can + * use it directly instead of going down this path again. + */ + apply_microcode_early_amd(cp.data, cp.size, true); } #else -void load_ucode_amd_ap(void) +void load_ucode_amd_ap(unsigned int family) { - unsigned int cpu = smp_processor_id(); struct equiv_cpu_entry *eq; struct microcode_amd *mc; - u8 *cont = container; u32 rev, eax; u16 eq_id; - /* Exit if called on the BSP. */ - if (!cpu) + /* 64-bit runs with paging enabled, thus early==false. */ + if (check_current_patch_level(&rev, false)) return; - if (!container) - return; + /* First AP hasn't cached it yet, go through the blob. */ + if (!cont.data) { + struct cpio_data cp = { NULL, 0, "" }; - /* - * 64-bit runs with paging enabled, thus early==false. - */ - if (check_current_patch_level(&rev, false)) - return; + if (cont.size == -1) + return; - /* Add CONFIG_RANDOMIZE_MEMORY offset. */ - if (!ucode_builtin) - cont += PAGE_OFFSET - __PAGE_OFFSET_BASE; +reget: + if (!get_builtin_microcode(&cp, family)) { +#ifdef CONFIG_BLK_DEV_INITRD + cp = find_cpio_data(ucode_path, (void *)initrd_start, + initrd_end - initrd_start, NULL); +#endif + if (!(cp.data && cp.size)) { + /* + * Mark it so that other APs do not scan again + * for no real reason and slow down boot + * needlessly. + */ + cont.size = -1; + return; + } + } + + cont = apply_microcode_early_amd(cp.data, cp.size, false); + if (!(cont.data && cont.size)) { + cont.size = -1; + return; + } + } eax = cpuid_eax(0x00000001); - eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ); + eq = (struct equiv_cpu_entry *)(cont.data + CONTAINER_HDR_SZ); eq_id = find_equiv_id(eq, eax); if (!eq_id) @@ -397,61 +410,50 @@ void load_ucode_amd_ap(void) } } else { - if (!ucode_cpio.data) - return; /* * AP has a different equivalence ID than BSP, looks like * mixed-steppings silicon so go through the ucode blob anew. */ - apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false); + goto reget; } } -#endif +#endif /* CONFIG_X86_32 */ + +static enum ucode_state +load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); -int __init save_microcode_in_initrd_amd(void) +int __init save_microcode_in_initrd_amd(unsigned int fam) { - unsigned long cont; - int retval = 0; enum ucode_state ret; - u8 *cont_va; - u32 eax; + int retval = 0; + u16 eq_id; - if (!container) - return -EINVAL; + if (!cont.data) { + if (IS_ENABLED(CONFIG_X86_32) && (cont.size != -1)) { + struct cpio_data cp = { NULL, 0, "" }; -#ifdef CONFIG_X86_32 - get_bsp_sig(); - cont = (unsigned long)container; - cont_va = __va(container); -#else - /* - * We need the physical address of the container for both bitness since - * boot_params.hdr.ramdisk_image is a physical address. - */ - cont = __pa(container); - cont_va = container; +#ifdef CONFIG_BLK_DEV_INITRD + cp = find_cpio_data(ucode_path, (void *)initrd_start, + initrd_end - initrd_start, NULL); #endif - /* - * Take into account the fact that the ramdisk might get relocated and - * therefore we need to recompute the container's position in virtual - * memory space. - */ - if (relocated_ramdisk) - container = (u8 *)(__va(relocated_ramdisk) + - (cont - boot_params.hdr.ramdisk_image)); - else - container = cont_va; + if (!(cp.data && cp.size)) { + cont.size = -1; + return -EINVAL; + } - /* Add CONFIG_RANDOMIZE_MEMORY offset. */ - if (!ucode_builtin) - container += PAGE_OFFSET - __PAGE_OFFSET_BASE; + cont = find_proper_container(cp.data, cp.size, &eq_id); + if (!eq_id) { + cont.size = -1; + return -EINVAL; + } - eax = cpuid_eax(0x00000001); - eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + } else + return -EINVAL; + } - ret = load_microcode_amd(smp_processor_id(), eax, container, container_size); + ret = load_microcode_amd(smp_processor_id(), fam, cont.data, cont.size); if (ret != UCODE_OK) retval = -EINVAL; @@ -459,8 +461,8 @@ int __init save_microcode_in_initrd_amd(void) * This will be freed any msec now, stash patches for the current * family and switch to patch cache for cpu hotplug, etc later. */ - container = NULL; - container_size = 0; + cont.data = NULL; + cont.size = 0; return retval; } @@ -478,8 +480,10 @@ void reload_ucode_amd(void) return; mc = (struct microcode_amd *)amd_ucode_patch; + if (!mc) + return; - if (mc && rev < mc->hdr.patch_id) { + if (rev < mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { ucode_new_rev = mc->hdr.patch_id; pr_info("reload patch_level=0x%08x\n", ucode_new_rev); @@ -513,7 +517,7 @@ static struct ucode_patch *cache_find_patch(u16 equiv_cpu) { struct ucode_patch *p; - list_for_each_entry(p, &pcache, plist) + list_for_each_entry(p, µcode_cache, plist) if (p->equiv_cpu == equiv_cpu) return p; return NULL; @@ -523,7 +527,7 @@ static void update_cache(struct ucode_patch *new_patch) { struct ucode_patch *p; - list_for_each_entry(p, &pcache, plist) { + list_for_each_entry(p, µcode_cache, plist) { if (p->equiv_cpu == new_patch->equiv_cpu) { if (p->patch_id >= new_patch->patch_id) /* we already have the latest patch */ @@ -536,14 +540,14 @@ static void update_cache(struct ucode_patch *new_patch) } } /* no patch found, add it */ - list_add_tail(&new_patch->plist, &pcache); + list_add_tail(&new_patch->plist, µcode_cache); } static void free_cache(void) { struct ucode_patch *p, *tmp; - list_for_each_entry_safe(p, tmp, &pcache, plist) { + list_for_each_entry_safe(p, tmp, µcode_cache, plist) { __list_del(p->plist.prev, p->plist.next); kfree(p->data); kfree(p); @@ -663,21 +667,7 @@ bool check_current_patch_level(u32 *rev, bool early) return ret; } -int __apply_microcode_amd(struct microcode_amd *mc_amd) -{ - u32 rev, dummy; - - native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); - - /* verify patch application was successful */ - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev != mc_amd->hdr.patch_id) - return -1; - - return 0; -} - -int apply_microcode_amd(int cpu) +static int apply_microcode_amd(int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); struct microcode_amd *mc_amd; @@ -860,7 +850,8 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, return UCODE_OK; } -enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) +static enum ucode_state +load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) { enum ucode_state ret; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 5ce5155f0695..6996413c78c3 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -3,7 +3,7 @@ * * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> * 2006 Shaohua Li <shaohua.li@intel.com> - * 2013-2015 Borislav Petkov <bp@alien8.de> + * 2013-2016 Borislav Petkov <bp@alien8.de> * * X86 CPU microcode early update for Linux: * @@ -39,12 +39,15 @@ #include <asm/microcode.h> #include <asm/processor.h> #include <asm/cmdline.h> +#include <asm/setup.h> -#define MICROCODE_VERSION "2.01" +#define DRIVER_VERSION "2.2" static struct microcode_ops *microcode_ops; static bool dis_ucode_ldr; +LIST_HEAD(microcode_cache); + /* * Synchronization. * @@ -167,7 +170,7 @@ void load_ucode_ap(void) break; case X86_VENDOR_AMD: if (family >= 0x10) - load_ucode_amd_ap(); + load_ucode_amd_ap(family); break; default: break; @@ -185,7 +188,7 @@ static int __init save_microcode_in_initrd(void) break; case X86_VENDOR_AMD: if (c->x86 >= 0x10) - return save_microcode_in_initrd_amd(); + return save_microcode_in_initrd_amd(c->x86); break; default: break; @@ -194,6 +197,58 @@ static int __init save_microcode_in_initrd(void) return -EINVAL; } +struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) +{ +#ifdef CONFIG_BLK_DEV_INITRD + unsigned long start = 0; + size_t size; + +#ifdef CONFIG_X86_32 + struct boot_params *params; + + if (use_pa) + params = (struct boot_params *)__pa_nodebug(&boot_params); + else + params = &boot_params; + + size = params->hdr.ramdisk_size; + + /* + * Set start only if we have an initrd image. We cannot use initrd_start + * because it is not set that early yet. + */ + if (size) + start = params->hdr.ramdisk_image; + +# else /* CONFIG_X86_64 */ + size = (unsigned long)boot_params.ext_ramdisk_size << 32; + size |= boot_params.hdr.ramdisk_size; + + if (size) { + start = (unsigned long)boot_params.ext_ramdisk_image << 32; + start |= boot_params.hdr.ramdisk_image; + + start += PAGE_OFFSET; + } +# endif + + /* + * Did we relocate the ramdisk? + * + * So we possibly relocate the ramdisk *after* applying microcode on the + * BSP so we rely on use_pa (use physical addresses) - even if it is not + * absolutely correct - to determine whether we've done the ramdisk + * relocation already. + */ + if (!use_pa && relocated_ramdisk) + start = initrd_start; + + return find_cpio_data(path, (void *)start, size, NULL); +#else /* !CONFIG_BLK_DEV_INITRD */ + return (struct cpio_data){ NULL, 0, "" }; +#endif +} + void reload_early_microcode(void) { int vendor, family; @@ -453,16 +508,17 @@ static struct attribute_group mc_attr_group = { static void microcode_fini_cpu(int cpu) { - microcode_ops->microcode_fini_cpu(cpu); + if (microcode_ops->microcode_fini_cpu) + microcode_ops->microcode_fini_cpu(cpu); } static enum ucode_state microcode_resume_cpu(int cpu) { - pr_debug("CPU%d updated upon resume\n", cpu); - if (apply_microcode_on_target(cpu)) return UCODE_ERROR; + pr_debug("CPU%d updated upon resume\n", cpu); + return UCODE_OK; } @@ -496,6 +552,9 @@ static enum ucode_state microcode_update_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + /* Refresh CPU microcode revision after resume. */ + collect_cpu_info(cpu); + if (uci->valid) return microcode_resume_cpu(cpu); @@ -579,12 +638,7 @@ static int mc_cpu_down_prep(unsigned int cpu) /* Suspend is in progress, only remove the interface */ sysfs_remove_group(&dev->kobj, &mc_attr_group); pr_debug("CPU%d removed\n", cpu); - /* - * When a CPU goes offline, don't free up or invalidate the copy of - * the microcode in kernel memory, so that we can reuse it when the - * CPU comes back online without unnecessarily requesting the userspace - * for it again. - */ + return 0; } @@ -649,8 +703,7 @@ int __init microcode_init(void) cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", mc_cpu_online, mc_cpu_down_prep); - pr_info("Microcode Update Driver: v" MICROCODE_VERSION - " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n"); + pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION); return 0; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index cdc0deab00c9..54d50c3694d8 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -39,125 +39,83 @@ #include <asm/setup.h> #include <asm/msr.h> -/* - * Temporary microcode blobs pointers storage. We note here during early load - * the pointers to microcode blobs we've got from whatever storage (detached - * initrd, builtin). Later on, we put those into final storage - * mc_saved_data.mc_saved. - * - * Important: those are offsets from the beginning of initrd or absolute - * addresses within the kernel image when built-in. - */ -static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT]; - -static struct mc_saved_data { - unsigned int num_saved; - struct microcode_intel **mc_saved; -} mc_saved_data; +static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; -/* Microcode blobs within the initrd. 0 if builtin. */ -static struct ucode_blobs { - unsigned long start; - bool valid; -} blobs; +/* Current microcode patch used in early patching */ +struct microcode_intel *intel_ucode_patch; -/* Go through saved patches and find the one suitable for the current CPU. */ -static enum ucode_state -find_microcode_patch(struct microcode_intel **saved, - unsigned int num_saved, struct ucode_cpu_info *uci) +static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, + unsigned int s2, unsigned int p2) { - struct microcode_intel *ucode_ptr, *new_mc = NULL; - struct microcode_header_intel *mc_hdr; - int new_rev, ret, i; - - new_rev = uci->cpu_sig.rev; - - for (i = 0; i < num_saved; i++) { - ucode_ptr = saved[i]; - mc_hdr = (struct microcode_header_intel *)ucode_ptr; - - ret = has_newer_microcode(ucode_ptr, - uci->cpu_sig.sig, - uci->cpu_sig.pf, - new_rev); - if (!ret) - continue; - - new_rev = mc_hdr->rev; - new_mc = ucode_ptr; - } + if (s1 != s2) + return false; - if (!new_mc) - return UCODE_NFOUND; + /* Processor flags are either both 0 ... */ + if (!p1 && !p2) + return true; - uci->mc = (struct microcode_intel *)new_mc; - return UCODE_OK; + /* ... or they intersect. */ + return p1 & p2; } -static inline void -copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs, - unsigned long off, int num_saved) +/* + * Returns 1 if update has been found, 0 otherwise. + */ +static int find_matching_signature(void *mc, unsigned int csig, int cpf) { + struct microcode_header_intel *mc_hdr = mc; + struct extended_sigtable *ext_hdr; + struct extended_signature *ext_sig; int i; - for (i = 0; i < num_saved; i++) - mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off); -} - -#ifdef CONFIG_X86_32 -static void -microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs) -{ - int i; - struct microcode_intel ***mc_saved; + if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) + return 1; - mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved); + /* Look for ext. headers: */ + if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) + return 0; - for (i = 0; i < mcs->num_saved; i++) { - struct microcode_intel *p; + ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; + ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; - p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i); - mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); + for (i = 0; i < ext_hdr->count; i++) { + if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) + return 1; + ext_sig++; } + return 0; } -#endif -static enum ucode_state -load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - unsigned long offset, struct ucode_cpu_info *uci) +/* + * Returns 1 if update has been found, 0 otherwise. + */ +static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) { - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int count = mcs->num_saved; + struct microcode_header_intel *mc_hdr = mc; - if (!mcs->mc_saved) { - copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count); + if (mc_hdr->rev <= new_rev) + return 0; - return find_microcode_patch(mc_saved_tmp, count, uci); - } else { -#ifdef CONFIG_X86_32 - microcode_phys(mc_saved_tmp, mcs); - return find_microcode_patch(mc_saved_tmp, count, uci); -#else - return find_microcode_patch(mcs->mc_saved, count, uci); -#endif - } + return find_matching_signature(mc, csig, cpf); } /* * Given CPU signature and a microcode patch, this function finds if the * microcode patch has matching family and model with the CPU. + * + * %true - if there's a match + * %false - otherwise */ -static enum ucode_state -matching_model_microcode(struct microcode_header_intel *mc_header, - unsigned long sig) +static bool microcode_matches(struct microcode_header_intel *mc_header, + unsigned long sig) { - unsigned int fam, model; - unsigned int fam_ucode, model_ucode; - struct extended_sigtable *ext_header; unsigned long total_size = get_totalsize(mc_header); unsigned long data_size = get_datasize(mc_header); - int ext_sigcount, i; + struct extended_sigtable *ext_header; + unsigned int fam_ucode, model_ucode; struct extended_signature *ext_sig; + unsigned int fam, model; + int ext_sigcount, i; fam = x86_family(sig); model = x86_model(sig); @@ -166,11 +124,11 @@ matching_model_microcode(struct microcode_header_intel *mc_header, model_ucode = x86_model(mc_header->sig); if (fam == fam_ucode && model == model_ucode) - return UCODE_OK; + return true; /* Look for ext. headers: */ if (total_size <= data_size + MC_HEADER_SIZE) - return UCODE_NFOUND; + return false; ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE; ext_sig = (void *)ext_header + EXT_HEADER_SIZE; @@ -181,192 +139,242 @@ matching_model_microcode(struct microcode_header_intel *mc_header, model_ucode = x86_model(ext_sig->sig); if (fam == fam_ucode && model == model_ucode) - return UCODE_OK; + return true; ext_sig++; } - return UCODE_NFOUND; + return false; } -static int -save_microcode(struct mc_saved_data *mcs, - struct microcode_intel **mc_saved_src, - unsigned int num_saved) +static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size) { - int i, j; - struct microcode_intel **saved_ptr; - int ret; + struct ucode_patch *p; - if (!num_saved) - return -EINVAL; + p = kzalloc(size, GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); - /* - * Copy new microcode data. - */ - saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL); - if (!saved_ptr) - return -ENOMEM; - - for (i = 0; i < num_saved; i++) { - struct microcode_header_intel *mc_hdr; - struct microcode_intel *mc; - unsigned long size; - - if (!mc_saved_src[i]) { - ret = -EINVAL; - goto err; - } + p->data = kmemdup(data, size, GFP_KERNEL); + if (!p->data) { + kfree(p); + return ERR_PTR(-ENOMEM); + } - mc = mc_saved_src[i]; - mc_hdr = &mc->hdr; - size = get_totalsize(mc_hdr); + return p; +} - saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL); - if (!saved_ptr[i]) { - ret = -ENOMEM; - goto err; +static void save_microcode_patch(void *data, unsigned int size) +{ + struct microcode_header_intel *mc_hdr, *mc_saved_hdr; + struct ucode_patch *iter, *tmp, *p; + bool prev_found = false; + unsigned int sig, pf; + + mc_hdr = (struct microcode_header_intel *)data; + + list_for_each_entry_safe(iter, tmp, µcode_cache, plist) { + mc_saved_hdr = (struct microcode_header_intel *)iter->data; + sig = mc_saved_hdr->sig; + pf = mc_saved_hdr->pf; + + if (find_matching_signature(data, sig, pf)) { + prev_found = true; + + if (mc_hdr->rev <= mc_saved_hdr->rev) + continue; + + p = __alloc_microcode_buf(data, size); + if (IS_ERR(p)) + pr_err("Error allocating buffer %p\n", data); + else + list_replace(&iter->plist, &p->plist); } } /* - * Point to newly saved microcode. + * There weren't any previous patches found in the list cache; save the + * newly found. */ - mcs->mc_saved = saved_ptr; - mcs->num_saved = num_saved; - - return 0; - -err: - for (j = 0; j <= i; j++) - kfree(saved_ptr[j]); - kfree(saved_ptr); - - return ret; + if (!prev_found) { + p = __alloc_microcode_buf(data, size); + if (IS_ERR(p)) + pr_err("Error allocating buffer for %p\n", data); + else + list_add_tail(&p->plist, µcode_cache); + } } -/* - * A microcode patch in ucode_ptr is saved into mc_saved - * - if it has matching signature and newer revision compared to an existing - * patch mc_saved. - * - or if it is a newly discovered microcode patch. - * - * The microcode patch should have matching model with CPU. - * - * Returns: The updated number @num_saved of saved microcode patches. - */ -static unsigned int _save_mc(struct microcode_intel **mc_saved, - u8 *ucode_ptr, unsigned int num_saved) +static int microcode_sanity_check(void *mc, int print_err) { - struct microcode_header_intel *mc_hdr, *mc_saved_hdr; - unsigned int sig, pf; - int found = 0, i; + unsigned long total_size, data_size, ext_table_size; + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + u32 sum, orig_sum, ext_sigcount = 0, i; + struct extended_signature *ext_sig; - mc_hdr = (struct microcode_header_intel *)ucode_ptr; + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); - for (i = 0; i < num_saved; i++) { - mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i]; - sig = mc_saved_hdr->sig; - pf = mc_saved_hdr->pf; + if (data_size + MC_HEADER_SIZE > total_size) { + if (print_err) + pr_err("Error: bad microcode data file size.\n"); + return -EINVAL; + } - if (!find_matching_signature(ucode_ptr, sig, pf)) - continue; + if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { + if (print_err) + pr_err("Error: invalid/unknown microcode update format.\n"); + return -EINVAL; + } - found = 1; + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + u32 ext_table_sum = 0; + u32 *ext_tablep; - if (mc_hdr->rev <= mc_saved_hdr->rev) - continue; + if ((ext_table_size < EXT_HEADER_SIZE) + || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + if (print_err) + pr_err("Error: truncated extended signature table.\n"); + return -EINVAL; + } + + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + if (print_err) + pr_err("Error: extended signature table size mismatch.\n"); + return -EFAULT; + } + + ext_sigcount = ext_header->count; /* - * Found an older ucode saved earlier. Replace it with - * this newer one. + * Check extended table checksum: the sum of all dwords that + * comprise a valid table must be 0. */ - mc_saved[i] = (struct microcode_intel *)ucode_ptr; - break; + ext_tablep = (u32 *)ext_header; + + i = ext_table_size / sizeof(u32); + while (i--) + ext_table_sum += ext_tablep[i]; + + if (ext_table_sum) { + if (print_err) + pr_warn("Bad extended signature table checksum, aborting.\n"); + return -EINVAL; + } + } + + /* + * Calculate the checksum of update data and header. The checksum of + * valid update data and header including the extended signature table + * must be 0. + */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / sizeof(u32); + while (i--) + orig_sum += ((u32 *)mc)[i]; + + if (orig_sum) { + if (print_err) + pr_err("Bad microcode data checksum, aborting.\n"); + return -EINVAL; } - /* Newly detected microcode, save it to memory. */ - if (i >= num_saved && !found) - mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr; + if (!ext_table_size) + return 0; - return num_saved; + /* + * Check extended signature checksum: 0 => valid. + */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + + sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + if (print_err) + pr_err("Bad extended signature checksum, aborting.\n"); + return -EINVAL; + } + } + return 0; } /* * Get microcode matching with BSP's model. Only CPUs with the same model as * BSP can stay in the platform. */ -static enum ucode_state __init -get_matching_model_microcode(unsigned long start, void *data, size_t size, - struct mc_saved_data *mcs, unsigned long *mc_ptrs, - struct ucode_cpu_info *uci) +static struct microcode_intel * +scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save) { - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; struct microcode_header_intel *mc_header; - unsigned int num_saved = mcs->num_saved; - enum ucode_state state = UCODE_OK; - unsigned int leftover = size; - u8 *ucode_ptr = data; + struct microcode_intel *patch = NULL; unsigned int mc_size; - int i; - - while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) { - if (leftover < sizeof(mc_header)) + while (size) { + if (size < sizeof(struct microcode_header_intel)) break; - mc_header = (struct microcode_header_intel *)ucode_ptr; + mc_header = (struct microcode_header_intel *)data; mc_size = get_totalsize(mc_header); - if (!mc_size || mc_size > leftover || - microcode_sanity_check(ucode_ptr, 0) < 0) + if (!mc_size || + mc_size > size || + microcode_sanity_check(data, 0) < 0) break; - leftover -= mc_size; + size -= mc_size; - /* - * Since APs with same family and model as the BSP may boot in - * the platform, we need to find and save microcode patches - * with the same family and model as the BSP. - */ - if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) { - ucode_ptr += mc_size; + if (!microcode_matches(mc_header, uci->cpu_sig.sig)) { + data += mc_size; continue; } - num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved); + if (save) { + save_microcode_patch(data, mc_size); + goto next; + } - ucode_ptr += mc_size; - } - if (leftover) { - state = UCODE_ERROR; - return state; - } + if (!patch) { + if (!has_newer_microcode(data, + uci->cpu_sig.sig, + uci->cpu_sig.pf, + uci->cpu_sig.rev)) + goto next; - if (!num_saved) { - state = UCODE_NFOUND; - return state; - } + } else { + struct microcode_header_intel *phdr = &patch->hdr; - for (i = 0; i < num_saved; i++) - mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start; + if (!has_newer_microcode(data, + phdr->sig, + phdr->pf, + phdr->rev)) + goto next; + } + + /* We have a newer patch, save it. */ + patch = data; - mcs->num_saved = num_saved; +next: + data += mc_size; + } - return state; + if (size) + return NULL; + + return patch; } static int collect_cpu_info_early(struct ucode_cpu_info *uci) { unsigned int val[2]; unsigned int family, model; - struct cpu_signature csig; + struct cpu_signature csig = { 0 }; unsigned int eax, ebx, ecx, edx; - csig.sig = 0; - csig.pf = 0; - csig.rev = 0; - memset(uci, 0, sizeof(*uci)); eax = 0x00000001; @@ -374,8 +382,8 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) native_cpuid(&eax, &ebx, &ecx, &edx); csig.sig = eax; - family = x86_family(csig.sig); - model = x86_model(csig.sig); + family = x86_family(eax); + model = x86_model(eax); if ((model >= 5) || (family > 6)) { /* get processor flags from MSR 0x17 */ @@ -401,40 +409,41 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) static void show_saved_mc(void) { #ifdef DEBUG - int i, j; + int i = 0, j; unsigned int sig, pf, rev, total_size, data_size, date; struct ucode_cpu_info uci; + struct ucode_patch *p; - if (!mc_saved_data.num_saved) { + if (list_empty(µcode_cache)) { pr_debug("no microcode data saved.\n"); return; } - pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved); collect_cpu_info_early(&uci); - sig = uci.cpu_sig.sig; - pf = uci.cpu_sig.pf; - rev = uci.cpu_sig.rev; + sig = uci.cpu_sig.sig; + pf = uci.cpu_sig.pf; + rev = uci.cpu_sig.rev; pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); - for (i = 0; i < mc_saved_data.num_saved; i++) { + list_for_each_entry(p, µcode_cache, plist) { struct microcode_header_intel *mc_saved_header; struct extended_sigtable *ext_header; - int ext_sigcount; struct extended_signature *ext_sig; + int ext_sigcount; + + mc_saved_header = (struct microcode_header_intel *)p->data; - mc_saved_header = (struct microcode_header_intel *) - mc_saved_data.mc_saved[i]; - sig = mc_saved_header->sig; - pf = mc_saved_header->pf; - rev = mc_saved_header->rev; - total_size = get_totalsize(mc_saved_header); - data_size = get_datasize(mc_saved_header); - date = mc_saved_header->date; + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + rev = mc_saved_header->rev; + date = mc_saved_header->date; + + total_size = get_totalsize(mc_saved_header); + data_size = get_datasize(mc_saved_header); pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n", - i, sig, pf, rev, total_size, + i++, sig, pf, rev, total_size, date & 0xffff, date >> 24, (date >> 16) & 0xff); @@ -443,7 +452,7 @@ static void show_saved_mc(void) if (total_size <= data_size + MC_HEADER_SIZE) continue; - ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE; + ext_header = (void *)mc_saved_header + data_size + MC_HEADER_SIZE; ext_sigcount = ext_header->count; ext_sig = (void *)ext_header + EXT_HEADER_SIZE; @@ -456,85 +465,43 @@ static void show_saved_mc(void) ext_sig++; } - } #endif } /* - * Save this mc into mc_saved_data. So it will be loaded early when a CPU is - * hot added or resumes. - * - * Please make sure this mc should be a valid microcode patch before calling - * this function. + * Save this microcode patch. It will be loaded early when a CPU is + * hot-added or resumes. */ -static void save_mc_for_early(u8 *mc) +static void save_mc_for_early(u8 *mc, unsigned int size) { #ifdef CONFIG_HOTPLUG_CPU /* Synchronization during CPU hotplug. */ static DEFINE_MUTEX(x86_cpu_microcode_mutex); - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count_init; - unsigned int num_saved; - struct microcode_intel **mc_saved; - int ret, i; - mutex_lock(&x86_cpu_microcode_mutex); - mc_saved_count_init = mc_saved_data.num_saved; - num_saved = mc_saved_data.num_saved; - mc_saved = mc_saved_data.mc_saved; - - if (mc_saved && num_saved) - memcpy(mc_saved_tmp, mc_saved, - num_saved * sizeof(struct microcode_intel *)); - /* - * Save the microcode patch mc in mc_save_tmp structure if it's a newer - * version. - */ - num_saved = _save_mc(mc_saved_tmp, mc, num_saved); - - /* - * Save the mc_save_tmp in global mc_saved_data. - */ - ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved); - if (ret) { - pr_err("Cannot save microcode patch.\n"); - goto out; - } - + save_microcode_patch(mc, size); show_saved_mc(); - /* - * Free old saved microcode data. - */ - if (mc_saved) { - for (i = 0; i < mc_saved_count_init; i++) - kfree(mc_saved[i]); - kfree(mc_saved); - } - -out: mutex_unlock(&x86_cpu_microcode_mutex); #endif } -static bool __init load_builtin_intel_microcode(struct cpio_data *cp) +static bool load_builtin_intel_microcode(struct cpio_data *cp) { -#ifdef CONFIG_X86_64 - unsigned int eax = 0x00000001, ebx, ecx = 0, edx; + unsigned int eax = 1, ebx, ecx = 0, edx; char name[30]; + if (IS_ENABLED(CONFIG_X86_32)) + return false; + native_cpuid(&eax, &ebx, &ecx, &edx); sprintf(name, "intel-ucode/%02x-%02x-%02x", x86_family(eax), x86_model(eax), x86_stepping(eax)); return get_builtin_firmware(cp, name); -#else - return false; -#endif } /* @@ -570,8 +537,7 @@ void show_ucode_info_early(void) } /* - * At this point, we can not call printk() yet. Keep microcode patch number in - * mc_saved_data.mc_saved and delay printing microcode info in + * At this point, we can not call printk() yet. Delay printing microcode info in * show_ucode_info_early() until printk() works. */ static void print_ucode(struct ucode_cpu_info *uci) @@ -648,206 +614,140 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) return 0; } -/* - * This function converts microcode patch offsets previously stored in - * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data. - */ int __init save_microcode_in_initrd_intel(void) { - struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; - unsigned int count = mc_saved_data.num_saved; - unsigned long offset = 0; - int ret; - - if (!count) - return 0; + struct ucode_cpu_info uci; + struct cpio_data cp; /* - * We have found a valid initrd but it might've been relocated in the - * meantime so get its updated address. + * AP loading didn't find any microcode patch, no need to save anything. */ - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && blobs.valid) - offset = initrd_start; - - copy_ptrs(mc_saved, mc_tmp_ptrs, offset, count); + if (!intel_ucode_patch || IS_ERR(intel_ucode_patch)) + return 0; - ret = save_microcode(&mc_saved_data, mc_saved, count); - if (ret) - pr_err("Cannot save microcode patches from initrd.\n"); - else - show_saved_mc(); + if (!load_builtin_intel_microcode(&cp)) + cp = find_microcode_in_initrd(ucode_path, false); - return ret; -} + if (!(cp.data && cp.size)) + return 0; -static __init enum ucode_state -__scan_microcode_initrd(struct cpio_data *cd, struct ucode_blobs *blbp) -{ -#ifdef CONFIG_BLK_DEV_INITRD - static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; - char *p = IS_ENABLED(CONFIG_X86_32) ? (char *)__pa_nodebug(ucode_name) - : ucode_name; -# ifdef CONFIG_X86_32 - unsigned long start = 0, size; - struct boot_params *params; + collect_cpu_info_early(&uci); - params = (struct boot_params *)__pa_nodebug(&boot_params); - size = params->hdr.ramdisk_size; + scan_microcode(cp.data, cp.size, &uci, true); - /* - * Set start only if we have an initrd image. We cannot use initrd_start - * because it is not set that early yet. - */ - start = (size ? params->hdr.ramdisk_image : 0); + show_saved_mc(); -# else /* CONFIG_X86_64 */ - unsigned long start = 0, size; + return 0; +} - size = (u64)boot_params.ext_ramdisk_size << 32; - size |= boot_params.hdr.ramdisk_size; - if (size) { - start = (u64)boot_params.ext_ramdisk_image << 32; - start |= boot_params.hdr.ramdisk_image; +/* + * @res_patch, output: a pointer to the patch we found. + */ +static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci) +{ + static const char *path; + struct cpio_data cp; + bool use_pa; - start += PAGE_OFFSET; + if (IS_ENABLED(CONFIG_X86_32)) { + path = (const char *)__pa_nodebug(ucode_path); + use_pa = true; + } else { + path = ucode_path; + use_pa = false; } -# endif - - *cd = find_cpio_data(p, (void *)start, size, NULL); - if (cd->data) { - blbp->start = start; - blbp->valid = true; - return UCODE_OK; - } else -#endif /* CONFIG_BLK_DEV_INITRD */ - return UCODE_ERROR; -} + /* try built-in microcode first */ + if (!load_builtin_intel_microcode(&cp)) + cp = find_microcode_in_initrd(path, use_pa); -static __init enum ucode_state -scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - struct ucode_cpu_info *uci, struct ucode_blobs *blbp) -{ - struct cpio_data cd = { NULL, 0, "" }; - enum ucode_state ret; + if (!(cp.data && cp.size)) + return NULL; - /* try built-in microcode first */ - if (load_builtin_intel_microcode(&cd)) - /* - * Invalidate blobs as we might've gotten an initrd too, - * supplied by the boot loader, by mistake or simply forgotten - * there. That's fine, we ignore it since we've found builtin - * microcode already. - */ - blbp->valid = false; - else { - ret = __scan_microcode_initrd(&cd, blbp); - if (ret != UCODE_OK) - return ret; - } + collect_cpu_info_early(uci); - return get_matching_model_microcode(blbp->start, cd.data, cd.size, - mcs, mc_ptrs, uci); + return scan_microcode(cp.data, cp.size, uci, false); } -static void __init -_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - struct ucode_blobs *blbp) +void __init load_ucode_intel_bsp(void) { + struct microcode_intel *patch; struct ucode_cpu_info uci; - enum ucode_state ret; - - collect_cpu_info_early(&uci); - ret = scan_microcode(mcs, mc_ptrs, &uci, blbp); - if (ret != UCODE_OK) + patch = __load_ucode_intel(&uci); + if (!patch) return; - ret = load_microcode(mcs, mc_ptrs, blbp->start, &uci); - if (ret != UCODE_OK) - return; + uci.mc = patch; apply_microcode_early(&uci, true); } -void __init load_ucode_intel_bsp(void) +void load_ucode_intel_ap(void) { - struct ucode_blobs *blobs_p; - struct mc_saved_data *mcs; - unsigned long *ptrs; + struct microcode_intel *patch, **iup; + struct ucode_cpu_info uci; -#ifdef CONFIG_X86_32 - mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - ptrs = (unsigned long *)__pa_nodebug(&mc_tmp_ptrs); - blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs); -#else - mcs = &mc_saved_data; - ptrs = mc_tmp_ptrs; - blobs_p = &blobs; -#endif + if (IS_ENABLED(CONFIG_X86_32)) + iup = (struct microcode_intel **) __pa_nodebug(&intel_ucode_patch); + else + iup = &intel_ucode_patch; + +reget: + if (!*iup) { + patch = __load_ucode_intel(&uci); + if (!patch) + return; + + *iup = patch; + } - _load_ucode_intel_bsp(mcs, ptrs, blobs_p); + uci.mc = *iup; + + if (apply_microcode_early(&uci, true)) { + /* Mixed-silicon system? Try to refetch the proper patch: */ + *iup = NULL; + + goto reget; + } } -void load_ucode_intel_ap(void) +static struct microcode_intel *find_patch(struct ucode_cpu_info *uci) { - struct ucode_blobs *blobs_p; - unsigned long *ptrs, start = 0; - struct mc_saved_data *mcs; - struct ucode_cpu_info uci; - enum ucode_state ret; + struct microcode_header_intel *phdr; + struct ucode_patch *iter, *tmp; -#ifdef CONFIG_X86_32 - mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - ptrs = (unsigned long *)__pa_nodebug(mc_tmp_ptrs); - blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs); -#else - mcs = &mc_saved_data; - ptrs = mc_tmp_ptrs; - blobs_p = &blobs; -#endif - - /* - * If there is no valid ucode previously saved in memory, no need to - * update ucode on this AP. - */ - if (!mcs->num_saved) - return; + list_for_each_entry_safe(iter, tmp, µcode_cache, plist) { - if (blobs_p->valid) { - start = blobs_p->start; + phdr = (struct microcode_header_intel *)iter->data; - /* - * Pay attention to CONFIG_RANDOMIZE_MEMORY=y as it shuffles - * physmem mapping too and there we have the initrd. - */ - start += PAGE_OFFSET - __PAGE_OFFSET_BASE; - } + if (phdr->rev <= uci->cpu_sig.rev) + continue; - collect_cpu_info_early(&uci); - ret = load_microcode(mcs, ptrs, start, &uci); - if (ret != UCODE_OK) - return; + if (!find_matching_signature(phdr, + uci->cpu_sig.sig, + uci->cpu_sig.pf)) + continue; - apply_microcode_early(&uci, true); + return iter->data; + } + return NULL; } void reload_ucode_intel(void) { + struct microcode_intel *p; struct ucode_cpu_info uci; - enum ucode_state ret; - - if (!mc_saved_data.num_saved) - return; collect_cpu_info_early(&uci); - ret = find_microcode_patch(mc_saved_data.mc_saved, - mc_saved_data.num_saved, &uci); - if (ret != UCODE_OK) + p = find_patch(&uci); + if (!p) return; + uci.mc = p; + apply_microcode_early(&uci, false); } @@ -879,24 +779,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) return 0; } -/* - * return 0 - no update found - * return 1 - found update - */ -static int get_matching_mc(struct microcode_intel *mc, int cpu) -{ - struct cpu_signature cpu_sig; - unsigned int csig, cpf, crev; - - collect_cpu_info(cpu, &cpu_sig); - - csig = cpu_sig.sig; - cpf = cpu_sig.pf; - crev = cpu_sig.rev; - - return has_newer_microcode(mc, csig, cpf, crev); -} - static int apply_microcode_intel(int cpu) { struct microcode_intel *mc; @@ -911,16 +793,12 @@ static int apply_microcode_intel(int cpu) uci = ucode_cpu_info + cpu; mc = uci->mc; - if (!mc) - return 0; - - /* - * Microcode on this CPU could be updated earlier. Only apply the - * microcode patch in mc when it is newer than the one on this - * CPU. - */ - if (!get_matching_mc(mc, cpu)) - return 0; + if (!mc) { + /* Look for a newer patch in our cache: */ + mc = find_patch(uci); + if (!mc) + return 0; + } /* write microcode via MSR 0x79 */ wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); @@ -962,7 +840,6 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; int new_rev = uci->cpu_sig.rev; unsigned int leftover = size; - enum ucode_state state = UCODE_OK; unsigned int curr_mc_size = 0; unsigned int csig, cpf; @@ -1015,14 +892,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, if (leftover) { vfree(new_mc); - state = UCODE_ERROR; - goto out; + return UCODE_ERROR; } - if (!new_mc) { - state = UCODE_NFOUND; - goto out; - } + if (!new_mc) + return UCODE_NFOUND; vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; @@ -1032,12 +906,12 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, * permanent memory. So it will be loaded early when a CPU is hot added * or resumes. */ - save_mc_for_early(new_mc); + save_mc_for_early(new_mc, curr_mc_size); pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); -out: - return state; + + return UCODE_OK; } static int get_ucode_fw(void *to, const void *from, size_t n) @@ -1081,20 +955,11 @@ request_microcode_user(int cpu, const void __user *buf, size_t size) return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); } -static void microcode_fini_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - vfree(uci->mc); - uci->mc = NULL; -} - static struct microcode_ops microcode_intel_ops = { .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode_intel, - .microcode_fini_cpu = microcode_fini_cpu, }; struct microcode_ops * __init init_intel_microcode(void) @@ -1109,4 +974,3 @@ struct microcode_ops * __init init_intel_microcode(void) return µcode_intel_ops; } - diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c deleted file mode 100644 index 406cb6c0d9dd..000000000000 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Intel CPU Microcode Update Driver for Linux - * - * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> - * H Peter Anvin" <hpa@zytor.com> - * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: - * - * http://developer.intel.com/Assets/PDF/manual/253668.pdf - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ -#include <linux/firmware.h> -#include <linux/uaccess.h> -#include <linux/kernel.h> - -#include <asm/microcode_intel.h> -#include <asm/processor.h> -#include <asm/msr.h> - -static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, - unsigned int s2, unsigned int p2) -{ - if (s1 != s2) - return false; - - /* Processor flags are either both 0 ... */ - if (!p1 && !p2) - return true; - - /* ... or they intersect. */ - return p1 & p2; -} - -int microcode_sanity_check(void *mc, int print_err) -{ - unsigned long total_size, data_size, ext_table_size; - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header = NULL; - u32 sum, orig_sum, ext_sigcount = 0, i; - struct extended_signature *ext_sig; - - total_size = get_totalsize(mc_header); - data_size = get_datasize(mc_header); - - if (data_size + MC_HEADER_SIZE > total_size) { - if (print_err) - pr_err("Error: bad microcode data file size.\n"); - return -EINVAL; - } - - if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { - if (print_err) - pr_err("Error: invalid/unknown microcode update format.\n"); - return -EINVAL; - } - - ext_table_size = total_size - (MC_HEADER_SIZE + data_size); - if (ext_table_size) { - u32 ext_table_sum = 0; - u32 *ext_tablep; - - if ((ext_table_size < EXT_HEADER_SIZE) - || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - if (print_err) - pr_err("Error: truncated extended signature table.\n"); - return -EINVAL; - } - - ext_header = mc + MC_HEADER_SIZE + data_size; - if (ext_table_size != exttable_size(ext_header)) { - if (print_err) - pr_err("Error: extended signature table size mismatch.\n"); - return -EFAULT; - } - - ext_sigcount = ext_header->count; - - /* - * Check extended table checksum: the sum of all dwords that - * comprise a valid table must be 0. - */ - ext_tablep = (u32 *)ext_header; - - i = ext_table_size / sizeof(u32); - while (i--) - ext_table_sum += ext_tablep[i]; - - if (ext_table_sum) { - if (print_err) - pr_warn("Bad extended signature table checksum, aborting.\n"); - return -EINVAL; - } - } - - /* - * Calculate the checksum of update data and header. The checksum of - * valid update data and header including the extended signature table - * must be 0. - */ - orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / sizeof(u32); - while (i--) - orig_sum += ((u32 *)mc)[i]; - - if (orig_sum) { - if (print_err) - pr_err("Bad microcode data checksum, aborting.\n"); - return -EINVAL; - } - - if (!ext_table_size) - return 0; - - /* - * Check extended signature checksum: 0 => valid. - */ - for (i = 0; i < ext_sigcount; i++) { - ext_sig = (void *)ext_header + EXT_HEADER_SIZE + - EXT_SIGNATURE_SIZE * i; - - sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - - (ext_sig->sig + ext_sig->pf + ext_sig->cksum); - if (sum) { - if (print_err) - pr_err("Bad extended signature checksum, aborting.\n"); - return -EINVAL; - } - } - return 0; -} - -/* - * Returns 1 if update has been found, 0 otherwise. - */ -int find_matching_signature(void *mc, unsigned int csig, int cpf) -{ - struct microcode_header_intel *mc_hdr = mc; - struct extended_sigtable *ext_hdr; - struct extended_signature *ext_sig; - int i; - - if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) - return 1; - - /* Look for ext. headers: */ - if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) - return 0; - - ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; - ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; - - for (i = 0; i < ext_hdr->count; i++) { - if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) - return 1; - ext_sig++; - } - return 0; -} - -/* - * Returns 1 if update has been found, 0 otherwise. - */ -int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) -{ - struct microcode_header_intel *mc_hdr = mc; - - if (mc_hdr->rev <= new_rev) - return 0; - - return find_matching_signature(mc, csig, cpf); -} diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 8f44c5a50ab8..6c044543545e 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -25,7 +25,6 @@ #include <asm/hyperv.h> #include <asm/mshyperv.h> #include <asm/desc.h> -#include <asm/idle.h> #include <asm/irq_regs.h> #include <asm/i8259.h> #include <asm/apic.h> diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 8cb57df9398d..d1316f9c8329 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -17,11 +17,17 @@ struct cpuid_bit { u32 sub_leaf; }; -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - CR_EDX, - CR_EBX +/* Please keep the leaf sorted by cpuid_bit.level for faster search. */ +static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, + { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, + { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, + { 0, 0, 0, 0, 0 } }; void init_scattered_cpuid_features(struct cpuinfo_x86 *c) @@ -30,16 +36,6 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) u32 regs[4]; const struct cpuid_bit *cb; - static const struct cpuid_bit cpuid_bits[] = { - { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 }, - { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, - { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, - { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, - { X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 }, - { 0, 0, 0, 0, 0 } - }; - for (cb = cpuid_bits; cb->feature; cb++) { /* Verify that the level is valid */ @@ -48,10 +44,35 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) max_level > (cb->level | 0xffff)) continue; - cpuid_count(cb->level, cb->sub_leaf, ®s[CR_EAX], - ®s[CR_EBX], ®s[CR_ECX], ®s[CR_EDX]); + cpuid_count(cb->level, cb->sub_leaf, ®s[CPUID_EAX], + ®s[CPUID_EBX], ®s[CPUID_ECX], + ®s[CPUID_EDX]); if (regs[cb->reg] & (1 << cb->bit)) set_cpu_cap(c, cb->feature); } } + +u32 get_scattered_cpuid_leaf(unsigned int level, unsigned int sub_leaf, + enum cpuid_regs_idx reg) +{ + const struct cpuid_bit *cb; + u32 cpuid_val = 0; + + for (cb = cpuid_bits; cb->feature; cb++) { + + if (level > cb->level) + continue; + + if (level < cb->level) + break; + + if (reg == cb->reg && sub_leaf == cb->sub_leaf) { + if (cpu_has(&boot_cpu_data, cb->feature)) + cpuid_val |= BIT(cb->bit); + } + } + + return cpuid_val; +} +EXPORT_SYMBOL_GPL(get_scattered_cpuid_leaf); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 81160578b91a..891f4dad7b2c 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -24,10 +24,16 @@ #include <linux/dmi.h> #include <linux/init.h> #include <linux/export.h> +#include <linux/clocksource.h> #include <asm/div64.h> #include <asm/x86_init.h> #include <asm/hypervisor.h> +#include <asm/timer.h> #include <asm/apic.h> +#include <asm/timer.h> + +#undef pr_fmt +#define pr_fmt(fmt) "vmware: " fmt #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -47,6 +53,8 @@ "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \ "memory"); +static unsigned long vmware_tsc_khz __ro_after_init; + static inline int __vmware_platform(void) { uint32_t eax, ebx, ecx, edx; @@ -56,35 +64,80 @@ static inline int __vmware_platform(void) static unsigned long vmware_get_tsc_khz(void) { - uint64_t tsc_hz, lpj; - uint32_t eax, ebx, ecx, edx; + return vmware_tsc_khz; +} - VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); +#ifdef CONFIG_PARAVIRT +static struct cyc2ns_data vmware_cyc2ns __ro_after_init; +static int vmw_sched_clock __initdata = 1; - tsc_hz = eax | (((uint64_t)ebx) << 32); - do_div(tsc_hz, 1000); - BUG_ON(tsc_hz >> 32); - pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", - (unsigned long) tsc_hz / 1000, - (unsigned long) tsc_hz % 1000); - - if (!preset_lpj) { - lpj = ((u64)tsc_hz * 1000); - do_div(lpj, HZ); - preset_lpj = lpj; - } +static __init int setup_vmw_sched_clock(char *s) +{ + vmw_sched_clock = 0; + return 0; +} +early_param("no-vmw-sched-clock", setup_vmw_sched_clock); + +static unsigned long long vmware_sched_clock(void) +{ + unsigned long long ns; - return tsc_hz; + ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul, + vmware_cyc2ns.cyc2ns_shift); + ns -= vmware_cyc2ns.cyc2ns_offset; + return ns; } +static void __init vmware_sched_clock_setup(void) +{ + struct cyc2ns_data *d = &vmware_cyc2ns; + unsigned long long tsc_now = rdtsc(); + + clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc2ns_shift, + vmware_tsc_khz, NSEC_PER_MSEC, 0); + d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul, + d->cyc2ns_shift); + + pv_time_ops.sched_clock = vmware_sched_clock; + pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset); +} + +static void __init vmware_paravirt_ops_setup(void) +{ + pv_info.name = "VMware hypervisor"; + pv_cpu_ops.io_delay = paravirt_nop; + + if (vmware_tsc_khz && vmw_sched_clock) + vmware_sched_clock_setup(); +} +#else +#define vmware_paravirt_ops_setup() do {} while (0) +#endif + static void __init vmware_platform_setup(void) { uint32_t eax, ebx, ecx, edx; + uint64_t lpj, tsc_khz; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); if (ebx != UINT_MAX) { + lpj = tsc_khz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_khz, 1000); + WARN_ON(tsc_khz >> 32); + pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", + (unsigned long) tsc_khz / 1000, + (unsigned long) tsc_khz % 1000); + + if (!preset_lpj) { + do_div(lpj, HZ); + preset_lpj = lpj; + } + + vmware_tsc_khz = tsc_khz; x86_platform.calibrate_tsc = vmware_get_tsc_khz; + x86_platform.calibrate_cpu = vmware_get_tsc_khz; + #ifdef CONFIG_X86_LOCAL_APIC /* Skip lapic calibration since we know the bus frequency. */ lapic_timer_frequency = ecx / HZ; @@ -94,6 +147,12 @@ static void __init vmware_platform_setup(void) } else { pr_warn("Failed to get TSC freq from the hypervisor\n"); } + + vmware_paravirt_ops_setup(); + +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif } /* diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 2836de390f95..0931a105ffe1 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -45,10 +45,7 @@ #include <asm/msr.h> static struct class *cpuid_class; - -struct cpuid_regs { - u32 eax, ebx, ecx, edx; -}; +static enum cpuhp_state cpuhp_cpuid_state; static void cpuid_smp_cpuid(void *cmd_block) { @@ -115,7 +112,7 @@ static const struct file_operations cpuid_fops = { .open = cpuid_open, }; -static int cpuid_device_create(int cpu) +static int cpuid_device_create(unsigned int cpu) { struct device *dev; @@ -124,35 +121,12 @@ static int cpuid_device_create(int cpu) return PTR_ERR_OR_ZERO(dev); } -static void cpuid_device_destroy(int cpu) +static int cpuid_device_destroy(unsigned int cpu) { device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); + return 0; } -static int cpuid_class_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - int err = 0; - - switch (action) { - case CPU_UP_PREPARE: - err = cpuid_device_create(cpu); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - cpuid_device_destroy(cpu); - break; - } - return notifier_from_errno(err); -} - -static struct notifier_block cpuid_class_cpu_notifier = -{ - .notifier_call = cpuid_class_cpu_callback, -}; - static char *cpuid_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); @@ -160,15 +134,13 @@ static char *cpuid_devnode(struct device *dev, umode_t *mode) static int __init cpuid_init(void) { - int i, err = 0; - i = 0; + int err; if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid", &cpuid_fops)) { printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", CPUID_MAJOR); - err = -EBUSY; - goto out; + return -EBUSY; } cpuid_class = class_create(THIS_MODULE, "cpuid"); if (IS_ERR(cpuid_class)) { @@ -177,45 +149,28 @@ static int __init cpuid_init(void) } cpuid_class->devnode = cpuid_devnode; - cpu_notifier_register_begin(); - for_each_online_cpu(i) { - err = cpuid_device_create(i); - if (err != 0) - goto out_class; - } - __register_hotcpu_notifier(&cpuid_class_cpu_notifier); - cpu_notifier_register_done(); + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online", + cpuid_device_create, cpuid_device_destroy); + if (err < 0) + goto out_class; - err = 0; - goto out; + cpuhp_cpuid_state = err; + return 0; out_class: - i = 0; - for_each_online_cpu(i) { - cpuid_device_destroy(i); - } - cpu_notifier_register_done(); class_destroy(cpuid_class); out_chrdev: __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); -out: return err; } +module_init(cpuid_init); static void __exit cpuid_exit(void) { - int cpu = 0; - - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - cpuid_device_destroy(cpu); + cpuhp_remove_state(cpuhp_cpuid_state); class_destroy(cpuid_class); __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); - __unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); - cpu_notifier_register_done(); } - -module_init(cpuid_init); module_exit(cpuid_exit); MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>"); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 650830e39e3a..3741461c63a0 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -631,9 +631,9 @@ static int determine_backup_region(u64 start, u64 end, void *arg) int crash_load_segments(struct kimage *image) { - unsigned long src_start, src_sz, elf_sz; - void *elf_addr; int ret; + struct kexec_buf kbuf = { .image = image, .buf_min = 0, + .buf_max = ULONG_MAX, .top_down = false }; /* * Determine and load a segment for backup area. First 640K RAM @@ -647,43 +647,44 @@ int crash_load_segments(struct kimage *image) if (ret < 0) return ret; - src_start = image->arch.backup_src_start; - src_sz = image->arch.backup_src_sz; - /* Add backup segment. */ - if (src_sz) { + if (image->arch.backup_src_sz) { + kbuf.buffer = &crash_zero_bytes; + kbuf.bufsz = sizeof(crash_zero_bytes); + kbuf.memsz = image->arch.backup_src_sz; + kbuf.buf_align = PAGE_SIZE; /* * Ideally there is no source for backup segment. This is * copied in purgatory after crash. Just add a zero filled * segment for now to make sure checksum logic works fine. */ - ret = kexec_add_buffer(image, (char *)&crash_zero_bytes, - sizeof(crash_zero_bytes), src_sz, - PAGE_SIZE, 0, -1, 0, - &image->arch.backup_load_addr); + ret = kexec_add_buffer(&kbuf); if (ret) return ret; + image->arch.backup_load_addr = kbuf.mem; pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n", - image->arch.backup_load_addr, src_start, src_sz); + image->arch.backup_load_addr, + image->arch.backup_src_start, kbuf.memsz); } /* Prepare elf headers and add a segment */ - ret = prepare_elf_headers(image, &elf_addr, &elf_sz); + ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz); if (ret) return ret; - image->arch.elf_headers = elf_addr; - image->arch.elf_headers_sz = elf_sz; + image->arch.elf_headers = kbuf.buffer; + image->arch.elf_headers_sz = kbuf.bufsz; - ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz, - ELF_CORE_HEADER_ALIGN, 0, -1, 0, - &image->arch.elf_load_addr); + kbuf.memsz = kbuf.bufsz; + kbuf.buf_align = ELF_CORE_HEADER_ALIGN; + ret = kexec_add_buffer(&kbuf); if (ret) { vfree((void *)image->arch.elf_headers); return ret; } + image->arch.elf_load_addr = kbuf.mem; pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->arch.elf_load_addr, elf_sz, elf_sz); + image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz); return ret; } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 9b7cf5c28f5f..0cfd01d2754c 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -22,7 +22,6 @@ int panic_on_unrecovered_nmi; int panic_on_io_nmi; unsigned int code_bytes = 64; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; bool in_task_stack(unsigned long *stack, struct task_struct *task, @@ -46,14 +45,7 @@ static void printk_stack_address(unsigned long address, int reliable, char *log_lvl) { touch_nmi_watchdog(); - printk("%s [<%p>] %s%pB\n", - log_lvl, (void *)address, reliable ? "" : "? ", - (void *)address); -} - -void printk_address(unsigned long address) -{ - pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address); + printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); } void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, @@ -67,6 +59,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%sCall Trace:\n", log_lvl); unwind_start(&state, task, regs, stack); + stack = stack ? : get_stack_pointer(task, regs); /* * Iterate through the stacks, starting with the current stack pointer. @@ -82,8 +75,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - softirq stack * - hardirq stack */ - for (; stack; stack = stack_info.next_sp) { - const char *str_begin, *str_end; + for (regs = NULL; stack; stack = stack_info.next_sp) { + const char *stack_name; /* * If we overflowed the task stack into a guard page, jump back @@ -95,9 +88,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (get_stack_info(stack, task, &stack_info, &visit_mask)) break; - stack_type_str(stack_info.type, &str_begin, &str_end); - if (str_begin) - printk("%s <%s> ", log_lvl, str_begin); + stack_name = stack_type_name(stack_info.type); + if (stack_name) + printk("%s <%s>\n", log_lvl, stack_name); /* * Scan the stack, printing any text addresses we find. At the @@ -112,13 +105,22 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, for (; stack < stack_info.end; stack++) { unsigned long real_addr; int reliable = 0; - unsigned long addr = *stack; + unsigned long addr = READ_ONCE_NOCHECK(*stack); unsigned long *ret_addr_p = unwind_get_return_address_ptr(&state); if (!__kernel_text_address(addr)) continue; + /* + * Don't print regs->ip again if it was already printed + * by __show_regs() below. + */ + if (regs && stack == ®s->ip) { + unwind_next_frame(&state); + continue; + } + if (stack == ret_addr_p) reliable = 1; @@ -146,10 +148,15 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * of the addresses will just be printed as unreliable. */ unwind_next_frame(&state); + + /* if the frame has entry regs, print them */ + regs = unwind_get_entry_regs(&state); + if (regs) + __show_regs(regs, 0); } - if (str_end) - printk("%s <%s> ", log_lvl, str_end); + if (stack_name) + printk("%s </%s>\n", log_lvl, stack_name); } } @@ -164,12 +171,12 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (!sp && task == current) sp = get_stack_pointer(current, NULL); - show_stack_log_lvl(task, NULL, sp, ""); + show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT); } void show_stack_regs(struct pt_regs *regs) { - show_stack_log_lvl(current, regs, NULL, ""); + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); } static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; @@ -261,14 +268,11 @@ int __die(const char *str, struct pt_regs *regs, long err) sp = kernel_stack_pointer(regs); savesegment(ss, ss); } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); + printk(KERN_EMERG "EIP: %pS SS:ESP: %04x:%08lx\n", + (void *)regs->ip, ss, sp); #else /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->ip); - printk(" RSP <%016lx>\n", regs->sp); + printk(KERN_ALERT "RIP: %pS RSP: %016lx\n", (void *)regs->ip, regs->sp); #endif return 0; } @@ -291,22 +295,6 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -static int __init kstack_setup(char *s) -{ - ssize_t ret; - unsigned long val; - - if (!s) - return -EINVAL; - - ret = kstrtoul(s, 0, &val); - if (ret) - return ret; - kstack_depth_to_print = val; - return 0; -} -early_param("kstack", kstack_setup); - static int __init code_bytes_setup(char *s) { ssize_t ret; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 06eb322b5f9f..bb3b5b9a6899 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,18 +16,15 @@ #include <asm/stacktrace.h> -void stack_type_str(enum stack_type type, const char **begin, const char **end) +const char *stack_type_name(enum stack_type type) { - switch (type) { - case STACK_TYPE_IRQ: - case STACK_TYPE_SOFTIRQ: - *begin = "IRQ"; - *end = "EOI"; - break; - default: - *begin = NULL; - *end = NULL; - } + if (type == STACK_TYPE_IRQ) + return "IRQ"; + + if (type == STACK_TYPE_SOFTIRQ) + return "SOFTIRQ"; + + return NULL; } static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) @@ -109,8 +106,10 @@ recursion_check: * just break out and report an unknown stack type. */ if (visit_mask) { - if (*visit_mask & (1UL << info->type)) + if (*visit_mask & (1UL << info->type)) { + printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); goto unknown; + } *visit_mask |= 1UL << info->type; } @@ -121,36 +120,6 @@ unknown: return -EINVAL; } -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl) -{ - unsigned long *stack; - int i; - - if (!try_get_task_stack(task)) - return; - - sp = sp ? : get_stack_pointer(task, regs); - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (kstack_end(stack)) - break; - if ((i % STACKSLOTS_PER_LINE) == 0) { - if (i != 0) - pr_cont("\n"); - printk("%s %08lx", log_lvl, *stack++); - } else - pr_cont(" %08lx", *stack++); - touch_nmi_watchdog(); - } - pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, log_lvl); - - put_task_stack(task); -} - - void show_regs(struct pt_regs *regs) { int i; @@ -168,8 +137,7 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - pr_emerg("Stack:\n"); - show_stack_log_lvl(current, regs, NULL, KERN_EMERG); + show_trace_log_lvl(current, regs, NULL, KERN_EMERG); pr_emerg("Code:"); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 36cf1a498227..fac189efcc34 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -28,23 +28,17 @@ static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { [DEBUG_STACK - 1] = DEBUG_STKSZ }; -void stack_type_str(enum stack_type type, const char **begin, const char **end) +const char *stack_type_name(enum stack_type type) { BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); - switch (type) { - case STACK_TYPE_IRQ: - *begin = "IRQ"; - *end = "EOI"; - break; - case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST: - *begin = exception_stack_names[type - STACK_TYPE_EXCEPTION]; - *end = "EOE"; - break; - default: - *begin = NULL; - *end = NULL; - } + if (type == STACK_TYPE_IRQ) + return "IRQ"; + + if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) + return exception_stack_names[type - STACK_TYPE_EXCEPTION]; + + return NULL; } static bool in_exception_stack(unsigned long *stack, struct stack_info *info) @@ -128,8 +122,10 @@ recursion_check: * just break out and report an unknown stack type. */ if (visit_mask) { - if (*visit_mask & (1UL << info->type)) + if (*visit_mask & (1UL << info->type)) { + printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); goto unknown; + } *visit_mask |= 1UL << info->type; } @@ -140,56 +136,6 @@ unknown: return -EINVAL; } -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl) -{ - unsigned long *irq_stack_end; - unsigned long *irq_stack; - unsigned long *stack; - int i; - - if (!try_get_task_stack(task)) - return; - - irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr); - irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); - - sp = sp ? : get_stack_pointer(task, regs); - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - unsigned long word; - - if (stack >= irq_stack && stack <= irq_stack_end) { - if (stack == irq_stack_end) { - stack = (unsigned long *) (irq_stack_end[-1]); - pr_cont(" <EOI> "); - } - } else { - if (kstack_end(stack)) - break; - } - - if (probe_kernel_address(stack, word)) - break; - - if ((i % STACKSLOTS_PER_LINE) == 0) { - if (i != 0) - pr_cont("\n"); - printk("%s %016lx", log_lvl, word); - } else - pr_cont(" %016lx", word); - - stack++; - touch_nmi_watchdog(); - } - - pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, log_lvl); - - put_task_stack(task); -} - void show_regs(struct pt_regs *regs) { int i; @@ -207,8 +153,7 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - printk(KERN_DEFAULT "Stack:\n"); - show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT); + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); printk(KERN_DEFAULT "Code: "); diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index aad34aafc0e0..d913047f832c 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -23,17 +23,12 @@ static double __initdata y = 3145727.0; */ void __init fpu__init_check_bugs(void) { - u32 cr0_saved; s32 fdiv_bug; /* kernel_fpu_begin/end() relies on patched alternative instructions. */ if (!boot_cpu_has(X86_FEATURE_FPU)) return; - /* We might have CR0::TS set already, clear it: */ - cr0_saved = read_cr0(); - write_cr0(cr0_saved & ~X86_CR0_TS); - kernel_fpu_begin(); /* @@ -56,8 +51,6 @@ void __init fpu__init_check_bugs(void) kernel_fpu_end(); - write_cr0(cr0_saved); - if (fdiv_bug) { set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV); pr_warn("Hmm, FPU with FDIV bug\n"); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 47004010ad5d..e4e97a5355ce 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -58,27 +58,9 @@ static bool kernel_fpu_disabled(void) return this_cpu_read(in_kernel_fpu); } -/* - * Were we in an interrupt that interrupted kernel mode? - * - * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that - * pair does nothing at all: the thread must not have fpu (so - * that we don't try to save the FPU state), and TS must - * be set (so that the clts/stts pair does nothing that is - * visible in the interrupted kernel thread). - * - * Except for the eagerfpu case when we return true; in the likely case - * the thread has FPU but we are not going to set/clear TS. - */ static bool interrupted_kernel_fpu_idle(void) { - if (kernel_fpu_disabled()) - return false; - - if (use_eager_fpu()) - return true; - - return !current->thread.fpu.fpregs_active && (read_cr0() & X86_CR0_TS); + return !kernel_fpu_disabled(); } /* @@ -125,8 +107,7 @@ void __kernel_fpu_begin(void) */ copy_fpregs_to_fpstate(fpu); } else { - this_cpu_write(fpu_fpregs_owner_ctx, NULL); - __fpregs_activate_hw(); + __cpu_invalidate_fpregs_state(); } } EXPORT_SYMBOL(__kernel_fpu_begin); @@ -137,8 +118,6 @@ void __kernel_fpu_end(void) if (fpu->fpregs_active) copy_kernel_to_fpregs(&fpu->state); - else - __fpregs_deactivate_hw(); kernel_fpu_enable(); } @@ -159,35 +138,6 @@ void kernel_fpu_end(void) EXPORT_SYMBOL_GPL(kernel_fpu_end); /* - * CR0::TS save/restore functions: - */ -int irq_ts_save(void) -{ - /* - * If in process context and not atomic, we can take a spurious DNA fault. - * Otherwise, doing clts() in process context requires disabling preemption - * or some heavy lifting like kernel_fpu_begin() - */ - if (!in_atomic()) - return 0; - - if (read_cr0() & X86_CR0_TS) { - clts(); - return 1; - } - - return 0; -} -EXPORT_SYMBOL_GPL(irq_ts_save); - -void irq_ts_restore(int TS_state) -{ - if (TS_state) - stts(); -} -EXPORT_SYMBOL_GPL(irq_ts_restore); - -/* * Save the FPU state (mark it for reload if necessary): * * This only ever gets called for the current task. @@ -200,10 +150,7 @@ void fpu__save(struct fpu *fpu) trace_x86_fpu_before_save(fpu); if (fpu->fpregs_active) { if (!copy_fpregs_to_fpstate(fpu)) { - if (use_eager_fpu()) - copy_kernel_to_fpregs(&fpu->state); - else - fpregs_deactivate(fpu); + copy_kernel_to_fpregs(&fpu->state); } } trace_x86_fpu_after_save(fpu); @@ -247,7 +194,6 @@ EXPORT_SYMBOL_GPL(fpstate_init); int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->counter = 0; dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; @@ -260,8 +206,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * Don't let 'init optimized' areas of the XSAVE area * leak into the child task: */ - if (use_eager_fpu()) - memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); + memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); /* * Save current FPU registers directly into the child @@ -283,10 +228,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size); - if (use_eager_fpu()) - copy_kernel_to_fpregs(&src_fpu->state); - else - fpregs_deactivate(src_fpu); + copy_kernel_to_fpregs(&src_fpu->state); } preempt_enable(); @@ -366,7 +308,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu) if (fpu->fpstate_active) { /* Invalidate any lazy state: */ - fpu->last_cpu = -1; + __fpu_invalidate_fpregs_state(fpu); } else { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); @@ -409,7 +351,7 @@ void fpu__current_fpstate_write_begin(void) * ensures we will not be lazy and skip a XRSTOR in the * future. */ - fpu->last_cpu = -1; + __fpu_invalidate_fpregs_state(fpu); } /* @@ -459,7 +401,6 @@ void fpu__restore(struct fpu *fpu) trace_x86_fpu_before_restore(fpu); fpregs_activate(fpu); copy_kernel_to_fpregs(&fpu->state); - fpu->counter++; trace_x86_fpu_after_restore(fpu); kernel_fpu_enable(); } @@ -477,7 +418,6 @@ EXPORT_SYMBOL_GPL(fpu__restore); void fpu__drop(struct fpu *fpu) { preempt_disable(); - fpu->counter = 0; if (fpu->fpregs_active) { /* Ignore delayed exceptions from user space */ @@ -521,14 +461,14 @@ void fpu__clear(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */ - if (!use_eager_fpu() || !static_cpu_has(X86_FEATURE_FPU)) { - /* FPU state will be reallocated lazily at the first use. */ - fpu__drop(fpu); - } else { - if (!fpu->fpstate_active) { - fpu__activate_curr(fpu); - user_fpu_begin(); - } + fpu__drop(fpu); + + /* + * Make sure fpstate is cleared and initialized. + */ + if (static_cpu_has(X86_FEATURE_FPU)) { + fpu__activate_curr(fpu); + user_fpu_begin(); copy_init_fpstate_to_fpregs(); } } diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 2f2b8c7ccb85..60dece392b3a 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -10,18 +10,6 @@ #include <linux/init.h> /* - * Initialize the TS bit in CR0 according to the style of context-switches - * we are using: - */ -static void fpu__init_cpu_ctx_switch(void) -{ - if (!boot_cpu_has(X86_FEATURE_EAGER_FPU)) - stts(); - else - clts(); -} - -/* * Initialize the registers found in all CPUs, CR0 and CR4: */ static void fpu__init_cpu_generic(void) @@ -58,7 +46,6 @@ void fpu__init_cpu(void) { fpu__init_cpu_generic(); fpu__init_cpu_xstate(); - fpu__init_cpu_ctx_switch(); } /* @@ -233,82 +220,16 @@ static void __init fpu__init_system_xstate_size_legacy(void) } /* - * FPU context switching strategies: - * - * Against popular belief, we don't do lazy FPU saves, due to the - * task migration complications it brings on SMP - we only do - * lazy FPU restores. - * - * 'lazy' is the traditional strategy, which is based on setting - * CR0::TS to 1 during context-switch (instead of doing a full - * restore of the FPU state), which causes the first FPU instruction - * after the context switch (whenever it is executed) to fault - at - * which point we lazily restore the FPU state into FPU registers. - * - * Tasks are of course under no obligation to execute FPU instructions, - * so it can easily happen that another context-switch occurs without - * a single FPU instruction being executed. If we eventually switch - * back to the original task (that still owns the FPU) then we have - * not only saved the restores along the way, but we also have the - * FPU ready to be used for the original task. - * - * 'lazy' is deprecated because it's almost never a performance win - * and it's much more complicated than 'eager'. - * - * 'eager' switching is by default on all CPUs, there we switch the FPU - * state during every context switch, regardless of whether the task - * has used FPU instructions in that time slice or not. This is done - * because modern FPU context saving instructions are able to optimize - * state saving and restoration in hardware: they can detect both - * unused and untouched FPU state and optimize accordingly. - * - * [ Note that even in 'lazy' mode we might optimize context switches - * to use 'eager' restores, if we detect that a task is using the FPU - * frequently. See the fpu->counter logic in fpu/internal.h for that. ] - */ -static enum { ENABLE, DISABLE } eagerfpu = ENABLE; - -/* * Find supported xfeatures based on cpu features and command-line input. * This must be called after fpu__init_parse_early_param() is called and * xfeatures_mask is enumerated. */ u64 __init fpu__get_supported_xfeatures_mask(void) { - /* Support all xfeatures known to us */ - if (eagerfpu != DISABLE) - return XCNTXT_MASK; - - /* Warning of xfeatures being disabled for no eagerfpu mode */ - if (xfeatures_mask & XFEATURE_MASK_EAGER) { - pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n", - xfeatures_mask & XFEATURE_MASK_EAGER); - } - - /* Return a mask that masks out all features requiring eagerfpu mode */ - return ~XFEATURE_MASK_EAGER; + return XCNTXT_MASK; } -/* - * Disable features dependent on eagerfpu. - */ -static void __init fpu__clear_eager_fpu_features(void) -{ - setup_clear_cpu_cap(X86_FEATURE_MPX); -} - -/* - * Pick the FPU context switching strategy: - * - * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of - * the following is true: - * - * (1) the cpu has xsaveopt, as it has the optimization and doing eager - * FPU switching has a relatively low cost compared to a plain xsave; - * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU - * switching. Should the kernel boot with noxsaveopt, we support MPX - * with eager FPU switching at a higher cost. - */ +/* Legacy code to initialize eager fpu mode. */ static void __init fpu__init_system_ctx_switch(void) { static bool on_boot_cpu __initdata = 1; @@ -317,17 +238,6 @@ static void __init fpu__init_system_ctx_switch(void) on_boot_cpu = 0; WARN_ON_FPU(current->thread.fpu.fpstate_active); - - if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE) - eagerfpu = ENABLE; - - if (xfeatures_mask & XFEATURE_MASK_EAGER) - eagerfpu = ENABLE; - - if (eagerfpu == ENABLE) - setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); - - printk(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy"); } /* @@ -336,11 +246,6 @@ static void __init fpu__init_system_ctx_switch(void) */ static void __init fpu__init_parse_early_param(void) { - if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { - eagerfpu = DISABLE; - fpu__clear_eager_fpu_features(); - } - if (cmdline_find_option_bool(boot_command_line, "no387")) setup_clear_cpu_cap(X86_FEATURE_FPU); @@ -375,14 +280,6 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) */ fpu__init_cpu(); - /* - * But don't leave CR0::TS set yet, as some of the FPU setup - * methods depend on being able to execute FPU instructions - * that will fault on a set TS, such as the FXSAVE in - * fpu__init_system_mxcsr(). - */ - clts(); - fpu__init_system_generic(); fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a184c210efba..83c23c230b4c 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -340,11 +340,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } fpu->fpstate_active = 1; - if (use_eager_fpu()) { - preempt_disable(); - fpu__restore(fpu); - preempt_enable(); - } + preempt_disable(); + fpu__restore(fpu); + preempt_enable(); return err; } else { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 124aa5c593f8..1d7770447b3e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -65,6 +65,7 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX); setup_clear_cpu_cap(X86_FEATURE_AVX2); setup_clear_cpu_cap(X86_FEATURE_AVX512F); + setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA); setup_clear_cpu_cap(X86_FEATURE_AVX512PF); setup_clear_cpu_cap(X86_FEATURE_AVX512ER); setup_clear_cpu_cap(X86_FEATURE_AVX512CD); @@ -73,7 +74,10 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX512VL); setup_clear_cpu_cap(X86_FEATURE_MPX); setup_clear_cpu_cap(X86_FEATURE_XGETBV1); + setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI); setup_clear_cpu_cap(X86_FEATURE_PKU); + setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); + setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); } /* @@ -888,15 +892,6 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) return -EINVAL; - /* - * For most XSAVE components, this would be an arduous task: - * brining fpstate up to date with fpregs, updating fpstate, - * then re-populating fpregs. But, for components that are - * never lazily managed, we can just access the fpregs - * directly. PKRU is never managed lazily, so we can just - * manipulate it directly. Make sure it stays that way. - */ - WARN_ON_ONCE(!use_eager_fpu()); /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index b6b2f0264af3..4e8577d03372 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -63,6 +63,8 @@ #define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) #endif +#define SIZEOF_PTREGS 17*4 + /* * Number of possible pages in the lowmem region. * @@ -248,19 +250,19 @@ page_pde_offset = (__PAGE_OFFSET >> 20); #ifdef CONFIG_PARAVIRT /* This is can only trip for a broken bootloader... */ cmpw $0x207, pa(boot_params + BP_version) - jb default_entry + jb .Ldefault_entry /* Paravirt-compatible boot parameters. Look to see what architecture we're booting under. */ movl pa(boot_params + BP_hardware_subarch), %eax cmpl $num_subarch_entries, %eax - jae bad_subarch + jae .Lbad_subarch movl pa(subarch_entries)(,%eax,4), %eax subl $__PAGE_OFFSET, %eax jmp *%eax -bad_subarch: +.Lbad_subarch: WEAK(lguest_entry) WEAK(xen_entry) /* Unknown implementation; there's really @@ -270,14 +272,14 @@ WEAK(xen_entry) __INITDATA subarch_entries: - .long default_entry /* normal x86/PC */ + .long .Ldefault_entry /* normal x86/PC */ .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ - .long default_entry /* Moorestown MID */ + .long .Ldefault_entry /* Moorestown MID */ num_subarch_entries = (. - subarch_entries) / 4 .previous #else - jmp default_entry + jmp .Ldefault_entry #endif /* CONFIG_PARAVIRT */ #ifdef CONFIG_HOTPLUG_CPU @@ -289,7 +291,8 @@ num_subarch_entries = (. - subarch_entries) / 4 ENTRY(start_cpu0) movl initial_stack, %ecx movl %ecx, %esp - jmp *(initial_code) + call *(initial_code) +1: jmp 1b ENDPROC(start_cpu0) #endif @@ -317,7 +320,7 @@ ENTRY(startup_32_smp) call load_ucode_ap #endif -default_entry: +.Ldefault_entry: #define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ X86_CR0_PG) @@ -347,7 +350,7 @@ default_entry: pushfl popl %eax # get EFLAGS testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remained set? - jz enable_paging # hw disallowed setting of ID bit + jz .Lenable_paging # hw disallowed setting of ID bit # which means no CPUID and no CR4 xorl %eax,%eax @@ -357,13 +360,13 @@ default_entry: movl $1,%eax cpuid andl $~1,%edx # Ignore CPUID.FPU - jz enable_paging # No flags or only CPUID.FPU = no CR4 + jz .Lenable_paging # No flags or only CPUID.FPU = no CR4 movl pa(mmu_cr4_features),%eax movl %eax,%cr4 testb $X86_CR4_PAE, %al # check if PAE is enabled - jz enable_paging + jz .Lenable_paging /* Check if extended functions are implemented */ movl $0x80000000, %eax @@ -371,7 +374,7 @@ default_entry: /* Value must be in the range 0x80000001 to 0x8000ffff */ subl $0x80000001, %eax cmpl $(0x8000ffff-0x80000001), %eax - ja enable_paging + ja .Lenable_paging /* Clear bogus XD_DISABLE bits */ call verify_cpu @@ -380,7 +383,7 @@ default_entry: cpuid /* Execute Disable bit supported? */ btl $(X86_FEATURE_NX & 31), %edx - jnc enable_paging + jnc .Lenable_paging /* Setup EFER (Extended Feature Enable Register) */ movl $MSR_EFER, %ecx @@ -390,7 +393,7 @@ default_entry: /* Make changes effective */ wrmsr -enable_paging: +.Lenable_paging: /* * Enable paging @@ -419,7 +422,7 @@ enable_paging: */ movb $4,X86 # at least 486 cmpl $-1,X86_CPUID - je is486 + je .Lis486 /* get vendor info */ xorl %eax,%eax # call CPUID with 0 -> return vendor ID @@ -430,7 +433,7 @@ enable_paging: movl %ecx,X86_VENDOR_ID+8 # last 4 chars orl %eax,%eax # do we have processor info as well? - je is486 + je .Lis486 movl $1,%eax # Use the CPUID instruction to get CPU type cpuid @@ -444,7 +447,7 @@ enable_paging: movb %cl,X86_MASK movl %edx,X86_CAPABILITY -is486: +.Lis486: movl $0x50022,%ecx # set AM, WP, NE and MP movl %cr0,%eax andl $0x80000011,%eax # Save PG,PE,ET @@ -470,8 +473,9 @@ is486: xorl %eax,%eax # Clear LDT lldt %ax - pushl $0 # fake return address for unwinder - jmp *(initial_code) + call *(initial_code) +1: jmp 1b +ENDPROC(startup_32_smp) #include "verify_cpu.S" @@ -665,14 +669,17 @@ __PAGE_ALIGNED_BSS initial_pg_pmd: .fill 1024*KPMDS,4,0 #else -ENTRY(initial_page_table) +.globl initial_page_table +initial_page_table: .fill 1024,4,0 #endif initial_pg_fixmap: .fill 1024,4,0 -ENTRY(empty_zero_page) +.globl empty_zero_page +empty_zero_page: .fill 4096,1,0 -ENTRY(swapper_pg_dir) +.globl swapper_pg_dir +swapper_pg_dir: .fill 1024,4,0 EXPORT_SYMBOL(empty_zero_page) @@ -706,7 +713,12 @@ ENTRY(initial_page_table) .data .balign 4 ENTRY(initial_stack) - .long init_thread_union+THREAD_SIZE + /* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel + * unwinder reliably detect the end of the stack. + */ + .long init_thread_union + THREAD_SIZE - SIZEOF_PTREGS - \ + TOP_OF_KERNEL_STACK_PADDING; __INITRODATA int_msg: diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b4421cc191b0..b467b14b03eb 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -66,13 +66,8 @@ startup_64: * tables and then reload them. */ - /* - * Setup stack for verify_cpu(). "-8" because initial_stack is defined - * this way, see below. Our best guess is a NULL ptr for stack - * termination heuristics and we don't want to break anything which - * might depend on it (kgdb, ...). - */ - leaq (__end_init_task - 8)(%rip), %rsp + /* Set up the stack for verify_cpu(), similar to initial_stack below */ + leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp /* Sanitize CPU configuration */ call verify_cpu @@ -117,20 +112,20 @@ startup_64: movq %rdi, %rax shrq $PGDIR_SHIFT, %rax - leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx + leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx movq %rdx, 0(%rbx,%rax,8) movq %rdx, 8(%rbx,%rax,8) - addq $4096, %rdx + addq $PAGE_SIZE, %rdx movq %rdi, %rax shrq $PUD_SHIFT, %rax andl $(PTRS_PER_PUD-1), %eax - movq %rdx, 4096(%rbx,%rax,8) + movq %rdx, PAGE_SIZE(%rbx,%rax,8) incl %eax andl $(PTRS_PER_PUD-1), %eax - movq %rdx, 4096(%rbx,%rax,8) + movq %rdx, PAGE_SIZE(%rbx,%rax,8) - addq $8192, %rbx + addq $PAGE_SIZE * 2, %rbx movq %rdi, %rax shrq $PMD_SHIFT, %rdi addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax @@ -147,6 +142,9 @@ startup_64: decl %ecx jnz 1b + test %rbp, %rbp + jz .Lskip_fixup + /* * Fixup the kernel text+data virtual addresses. Note that * we might write invalid pmds, when the kernel is relocated @@ -154,9 +152,9 @@ startup_64: * beyond _end. */ leaq level2_kernel_pgt(%rip), %rdi - leaq 4096(%rdi), %r8 + leaq PAGE_SIZE(%rdi), %r8 /* See if it is a valid page table entry */ -1: testb $1, 0(%rdi) +1: testb $_PAGE_PRESENT, 0(%rdi) jz 2f addq %rbp, 0(%rdi) /* Go to the next page */ @@ -167,6 +165,7 @@ startup_64: /* Fixup phys_base */ addq %rbp, phys_base(%rip) +.Lskip_fixup: movq $(early_level4_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) @@ -265,13 +264,17 @@ ENTRY(secondary_startup_64) movl $MSR_GS_BASE,%ecx movl initial_gs(%rip),%eax movl initial_gs+4(%rip),%edx - wrmsr + wrmsr /* rsi is pointer to real mode structure with interesting info. pass it to C */ movq %rsi, %rdi - - /* Finally jump to run C code and to be on real kernel address + jmp start_cpu +ENDPROC(secondary_startup_64) + +ENTRY(start_cpu) + /* + * Jump to run C code and to be on a real kernel address. * Since we are running on identity-mapped space we have to jump * to the full 64bit address, this is only possible as indirect * jump. In addition we need to ensure %cs is set so we make this @@ -295,12 +298,14 @@ ENTRY(secondary_startup_64) * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, * address given in m16:64. */ - movq initial_code(%rip),%rax - pushq $0 # fake return address to stop unwinder + pushq $.Lafter_lret # put return address on stack for unwinder + xorq %rbp, %rbp # clear frame pointer + movq initial_code(%rip), %rax pushq $__KERNEL_CS # set correct cs pushq %rax # target address in negative space lretq -ENDPROC(secondary_startup_64) +.Lafter_lret: +ENDPROC(start_cpu) #include "verify_cpu.S" @@ -308,15 +313,11 @@ ENDPROC(secondary_startup_64) /* * Boot CPU0 entry point. It's called from play_dead(). Everything has been set * up already except stack. We just set up stack here. Then call - * start_secondary(). + * start_secondary() via start_cpu(). */ ENTRY(start_cpu0) - movq initial_stack(%rip),%rsp - movq initial_code(%rip),%rax - pushq $0 # fake return address to stop unwinder - pushq $__KERNEL_CS # set correct cs - pushq %rax # target address in negative space - lretq + movq initial_stack(%rip), %rsp + jmp start_cpu ENDPROC(start_cpu0) #endif @@ -328,7 +329,11 @@ ENDPROC(start_cpu0) GLOBAL(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) GLOBAL(initial_stack) - .quad init_thread_union+THREAD_SIZE-8 + /* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel + * unwinder reliably detect the end of the stack. + */ + .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS __FINITDATA bad_address: diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 9f669fdd2010..7c6e9ffe4424 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -14,7 +14,6 @@ #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/irq.h> -#include <asm/idle.h> #include <asm/mce.h> #include <asm/hw_irq.h> #include <asm/desc.h> diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 9ebd0b0e73d9..6b0678a541e2 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -16,7 +16,6 @@ #include <linux/uaccess.h> #include <linux/smp.h> #include <asm/io_apic.h> -#include <asm/idle.h> #include <asm/apic.h> int sysctl_panic_on_stackoverflow; diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c new file mode 100644 index 000000000000..cb9c1ed1d391 --- /dev/null +++ b/arch/x86/kernel/itmt.c @@ -0,0 +1,215 @@ +/* + * itmt.c: Support Intel Turbo Boost Max Technology 3.0 + * + * (C) Copyright 2016 Intel Corporation + * Author: Tim Chen <tim.c.chen@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * On platforms supporting Intel Turbo Boost Max Technology 3.0, (ITMT), + * the maximum turbo frequencies of some cores in a CPU package may be + * higher than for the other cores in the same package. In that case, + * better performance can be achieved by making the scheduler prefer + * to run tasks on the CPUs with higher max turbo frequencies. + * + * This file provides functions and data structures for enabling the + * scheduler to favor scheduling on cores can be boosted to a higher + * frequency under ITMT. + */ + +#include <linux/sched.h> +#include <linux/cpumask.h> +#include <linux/cpuset.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/sysctl.h> +#include <linux/nodemask.h> + +static DEFINE_MUTEX(itmt_update_mutex); +DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority); + +/* Boolean to track if system has ITMT capabilities */ +static bool __read_mostly sched_itmt_capable; + +/* + * Boolean to control whether we want to move processes to cpu capable + * of higher turbo frequency for cpus supporting Intel Turbo Boost Max + * Technology 3.0. + * + * It can be set via /proc/sys/kernel/sched_itmt_enabled + */ +unsigned int __read_mostly sysctl_sched_itmt_enabled; + +static int sched_itmt_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + unsigned int old_sysctl; + int ret; + + mutex_lock(&itmt_update_mutex); + + if (!sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return -EINVAL; + } + + old_sysctl = sysctl_sched_itmt_enabled; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) { + x86_topology_update = true; + rebuild_sched_domains(); + } + + mutex_unlock(&itmt_update_mutex); + + return ret; +} + +static unsigned int zero; +static unsigned int one = 1; +static struct ctl_table itmt_kern_table[] = { + { + .procname = "sched_itmt_enabled", + .data = &sysctl_sched_itmt_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_itmt_update_handler, + .extra1 = &zero, + .extra2 = &one, + }, + {} +}; + +static struct ctl_table itmt_root_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = itmt_kern_table, + }, + {} +}; + +static struct ctl_table_header *itmt_sysctl_header; + +/** + * sched_set_itmt_support() - Indicate platform supports ITMT + * + * This function is used by the OS to indicate to scheduler that the platform + * is capable of supporting the ITMT feature. + * + * The current scheme has the pstate driver detects if the system + * is ITMT capable and call sched_set_itmt_support. + * + * This must be done only after sched_set_itmt_core_prio + * has been called to set the cpus' priorities. + * It must not be called with cpu hot plug lock + * held as we need to acquire the lock to rebuild sched domains + * later. + * + * Return: 0 on success + */ +int sched_set_itmt_support(void) +{ + mutex_lock(&itmt_update_mutex); + + if (sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return 0; + } + + itmt_sysctl_header = register_sysctl_table(itmt_root_table); + if (!itmt_sysctl_header) { + mutex_unlock(&itmt_update_mutex); + return -ENOMEM; + } + + sched_itmt_capable = true; + + sysctl_sched_itmt_enabled = 1; + + if (sysctl_sched_itmt_enabled) { + x86_topology_update = true; + rebuild_sched_domains(); + } + + mutex_unlock(&itmt_update_mutex); + + return 0; +} + +/** + * sched_clear_itmt_support() - Revoke platform's support of ITMT + * + * This function is used by the OS to indicate that it has + * revoked the platform's support of ITMT feature. + * + * It must not be called with cpu hot plug lock + * held as we need to acquire the lock to rebuild sched domains + * later. + */ +void sched_clear_itmt_support(void) +{ + mutex_lock(&itmt_update_mutex); + + if (!sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return; + } + sched_itmt_capable = false; + + if (itmt_sysctl_header) { + unregister_sysctl_table(itmt_sysctl_header); + itmt_sysctl_header = NULL; + } + + if (sysctl_sched_itmt_enabled) { + /* disable sched_itmt if we are no longer ITMT capable */ + sysctl_sched_itmt_enabled = 0; + x86_topology_update = true; + rebuild_sched_domains(); + } + + mutex_unlock(&itmt_update_mutex); +} + +int arch_asym_cpu_priority(int cpu) +{ + return per_cpu(sched_core_priority, cpu); +} + +/** + * sched_set_itmt_core_prio() - Set CPU priority based on ITMT + * @prio: Priority of cpu core + * @core_cpu: The cpu number associated with the core + * + * The pstate driver will find out the max boost frequency + * and call this function to set a priority proportional + * to the max boost frequency. CPU with higher boost + * frequency will receive higher priority. + * + * No need to rebuild sched domain after updating + * the CPU priorities. The sched domains have no + * dependency on CPU priorities. + */ +void sched_set_itmt_core_prio(int prio, int core_cpu) +{ + int cpu, i = 1; + + for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) { + int smt_prio; + + /* + * Ensure that the siblings are moved to the end + * of the priority chain and only used when + * all other high priority cpus are out of capacity. + */ + smt_prio = prio * smp_num_siblings / i; + per_cpu(sched_core_priority, cpu) = smt_prio; + i++; + } +} diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 3407b148c240..d0a814a9d96a 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -331,17 +331,17 @@ static void *bzImage64_load(struct kimage *image, char *kernel, struct setup_header *header; int setup_sects, kern16_size, ret = 0; - unsigned long setup_header_size, params_cmdline_sz, params_misc_sz; + unsigned long setup_header_size, params_cmdline_sz; struct boot_params *params; unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr; unsigned long purgatory_load_addr; - unsigned long kernel_bufsz, kernel_memsz, kernel_align; - char *kernel_buf; struct bzimage64_data *ldata; struct kexec_entry64_regs regs64; void *stack; unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr); unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset; + struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX, + .top_down = true }; header = (struct setup_header *)(kernel + setup_hdr_offset); setup_sects = header->setup_sects; @@ -402,11 +402,11 @@ static void *bzImage64_load(struct kimage *image, char *kernel, params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + MAX_ELFCOREHDR_STR_LEN; params_cmdline_sz = ALIGN(params_cmdline_sz, 16); - params_misc_sz = params_cmdline_sz + efi_map_sz + + kbuf.bufsz = params_cmdline_sz + efi_map_sz + sizeof(struct setup_data) + sizeof(struct efi_setup_data); - params = kzalloc(params_misc_sz, GFP_KERNEL); + params = kzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) return ERR_PTR(-ENOMEM); efi_map_offset = params_cmdline_sz; @@ -418,37 +418,41 @@ static void *bzImage64_load(struct kimage *image, char *kernel, /* Is there a limit on setup header size? */ memcpy(¶ms->hdr, (kernel + setup_hdr_offset), setup_header_size); - ret = kexec_add_buffer(image, (char *)params, params_misc_sz, - params_misc_sz, 16, MIN_BOOTPARAM_ADDR, - ULONG_MAX, 1, &bootparam_load_addr); + kbuf.buffer = params; + kbuf.memsz = kbuf.bufsz; + kbuf.buf_align = 16; + kbuf.buf_min = MIN_BOOTPARAM_ADDR; + ret = kexec_add_buffer(&kbuf); if (ret) goto out_free_params; + bootparam_load_addr = kbuf.mem; pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - bootparam_load_addr, params_misc_sz, params_misc_sz); + bootparam_load_addr, kbuf.bufsz, kbuf.bufsz); /* Load kernel */ - kernel_buf = kernel + kern16_size; - kernel_bufsz = kernel_len - kern16_size; - kernel_memsz = PAGE_ALIGN(header->init_size); - kernel_align = header->kernel_alignment; - - ret = kexec_add_buffer(image, kernel_buf, - kernel_bufsz, kernel_memsz, kernel_align, - MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1, - &kernel_load_addr); + kbuf.buffer = kernel + kern16_size; + kbuf.bufsz = kernel_len - kern16_size; + kbuf.memsz = PAGE_ALIGN(header->init_size); + kbuf.buf_align = header->kernel_alignment; + kbuf.buf_min = MIN_KERNEL_LOAD_ADDR; + ret = kexec_add_buffer(&kbuf); if (ret) goto out_free_params; + kernel_load_addr = kbuf.mem; pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - kernel_load_addr, kernel_memsz, kernel_memsz); + kernel_load_addr, kbuf.bufsz, kbuf.memsz); /* Load initrd high */ if (initrd) { - ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len, - PAGE_SIZE, MIN_INITRD_LOAD_ADDR, - ULONG_MAX, 1, &initrd_load_addr); + kbuf.buffer = initrd; + kbuf.bufsz = kbuf.memsz = initrd_len; + kbuf.buf_align = PAGE_SIZE; + kbuf.buf_min = MIN_INITRD_LOAD_ADDR; + ret = kexec_add_buffer(&kbuf); if (ret) goto out_free_params; + initrd_load_addr = kbuf.mem; pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", initrd_load_addr, initrd_len, initrd_len); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index edbbfc854e39..36bc66416021 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -42,7 +42,6 @@ #include <asm/traps.h> #include <asm/desc.h> #include <asm/tlbflush.h> -#include <asm/idle.h> #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> @@ -267,13 +266,11 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ prev_state = exception_enter(); - exit_idle(); kvm_async_pf_task_wait((u32)read_cr2()); exception_exit(prev_state); break; case KVM_PV_REASON_PAGE_READY: rcu_irq_enter(); - exit_idle(); kvm_async_pf_task_wake((u32)read_cr2()); rcu_irq_exit(); break; @@ -308,7 +305,7 @@ static void kvm_register_steal_time(void) static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; -static void kvm_guest_apic_eoi_write(u32 reg, u32 val) +static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) { /** * This relies on __test_and_clear_bit to modify the memory @@ -319,7 +316,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val) */ if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi))) return; - apic_write(APIC_EOI, APIC_EOI_ACK); + apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK); } static void kvm_guest_cpu_init(void) @@ -592,6 +589,14 @@ out: local_irq_restore(flags); } +__visible bool __kvm_vcpu_is_preempted(int cpu) +{ + struct kvm_steal_time *src = &per_cpu(steal_time, cpu); + + return !!src->preempted; +} +PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); + /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ @@ -608,6 +613,11 @@ void __init kvm_spinlock_init(void) pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); pv_lock_ops.wait = kvm_wait; pv_lock_ops.kick = kvm_kick_cpu; + + if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + pv_lock_ops.vcpu_is_preempted = + PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); + } } static __init int kvm_spinlock_init_jump(void) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 6707039b9032..d4a15831ac58 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -34,10 +34,10 @@ static void flush_ldt(void *current_mm) } /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ -static struct ldt_struct *alloc_ldt_struct(int size) +static struct ldt_struct *alloc_ldt_struct(unsigned int size) { struct ldt_struct *new_ldt; - int alloc_size; + unsigned int alloc_size; if (size > LDT_ENTRIES) return NULL; @@ -93,7 +93,7 @@ static void free_ldt_struct(struct ldt_struct *ldt) paravirt_free_ldt(ldt->entries, ldt->size); if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(ldt->entries); + vfree_atomic(ldt->entries); else free_page((unsigned long)ldt->entries); kfree(ldt); @@ -207,11 +207,11 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount) static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct mm_struct *mm = current->mm; + struct ldt_struct *new_ldt, *old_ldt; + unsigned int oldsize, newsize; + struct user_desc ldt_info; struct desc_struct ldt; int error; - struct user_desc ldt_info; - int oldsize, newsize; - struct ldt_struct *new_ldt, *old_ldt; error = -EINVAL; if (bytecount != sizeof(ldt_info)) @@ -249,7 +249,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) old_ldt = mm->context.ldt; oldsize = old_ldt ? old_ldt->size : 0; - newsize = max((int)(ldt_info.entry_number + 1), oldsize); + newsize = max(ldt_info.entry_number + 1, oldsize); error = -ENOMEM; new_ldt = alloc_ldt_struct(newsize); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 8c1f218926d7..307b1f4543de 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -328,7 +328,7 @@ void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { - VMCOREINFO_SYMBOL(phys_base); + VMCOREINFO_NUMBER(phys_base); VMCOREINFO_SYMBOL(init_level4_pgt); #ifdef CONFIG_NUMA @@ -337,9 +337,7 @@ void arch_crash_save_vmcoreinfo(void) #endif vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); - VMCOREINFO_PAGE_OFFSET(PAGE_OFFSET); - VMCOREINFO_VMALLOC_START(VMALLOC_START); - VMCOREINFO_VMEMMAP_START(VMEMMAP_START); + VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index efe73aacf966..7b0d3da52fb4 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -18,8 +18,10 @@ #ifdef CC_USING_FENTRY # define function_hook __fentry__ +EXPORT_SYMBOL(__fentry__) #else # define function_hook mcount +EXPORT_SYMBOL(mcount) #endif /* All cases save the original rbp (8 bytes) */ @@ -295,7 +297,6 @@ trace: jmp fgraph_trace END(function_hook) #endif /* CONFIG_DYNAMIC_FTRACE */ -EXPORT_SYMBOL(function_hook) #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 7f3550acde1b..f5e3ff835cc8 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -44,6 +44,7 @@ #include <asm/msr.h> static struct class *msr_class; +static enum cpuhp_state cpuhp_msr_state; static ssize_t msr_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -180,7 +181,7 @@ static const struct file_operations msr_fops = { .compat_ioctl = msr_ioctl, }; -static int msr_device_create(int cpu) +static int msr_device_create(unsigned int cpu) { struct device *dev; @@ -189,34 +190,12 @@ static int msr_device_create(int cpu) return PTR_ERR_OR_ZERO(dev); } -static void msr_device_destroy(int cpu) +static int msr_device_destroy(unsigned int cpu) { device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); + return 0; } -static int msr_class_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - int err = 0; - - switch (action) { - case CPU_UP_PREPARE: - err = msr_device_create(cpu); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - msr_device_destroy(cpu); - break; - } - return notifier_from_errno(err); -} - -static struct notifier_block __refdata msr_class_cpu_notifier = { - .notifier_call = msr_class_cpu_callback, -}; - static char *msr_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); @@ -224,13 +203,11 @@ static char *msr_devnode(struct device *dev, umode_t *mode) static int __init msr_init(void) { - int i, err = 0; - i = 0; + int err; if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) { pr_err("unable to get major %d for msr\n", MSR_MAJOR); - err = -EBUSY; - goto out; + return -EBUSY; } msr_class = class_create(THIS_MODULE, "msr"); if (IS_ERR(msr_class)) { @@ -239,44 +216,28 @@ static int __init msr_init(void) } msr_class->devnode = msr_devnode; - cpu_notifier_register_begin(); - for_each_online_cpu(i) { - err = msr_device_create(i); - if (err != 0) - goto out_class; - } - __register_hotcpu_notifier(&msr_class_cpu_notifier); - cpu_notifier_register_done(); - - err = 0; - goto out; + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online", + msr_device_create, msr_device_destroy); + if (err < 0) + goto out_class; + cpuhp_msr_state = err; + return 0; out_class: - i = 0; - for_each_online_cpu(i) - msr_device_destroy(i); - cpu_notifier_register_done(); + cpuhp_remove_state(cpuhp_msr_state); class_destroy(msr_class); out_chrdev: __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); -out: return err; } +module_init(msr_init); static void __exit msr_exit(void) { - int cpu = 0; - - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - msr_device_destroy(cpu); + cpuhp_remove_state(cpuhp_msr_state); class_destroy(msr_class); __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); - __unregister_hotcpu_notifier(&msr_class_cpu_notifier); - cpu_notifier_register_done(); } - -module_init(msr_init); module_exit(msr_exit) MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>"); diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 2c55a003b793..6d4bf812af45 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -12,7 +12,6 @@ __visible void __native_queued_spin_unlock(struct qspinlock *lock) { native_queued_spin_unlock(lock); } - PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock); bool pv_is_native_spin_unlock(void) @@ -21,12 +20,25 @@ bool pv_is_native_spin_unlock(void) __raw_callee_save___native_queued_spin_unlock; } +__visible bool __native_vcpu_is_preempted(int cpu) +{ + return false; +} +PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted); + +bool pv_is_native_vcpu_is_preempted(void) +{ + return pv_lock_ops.vcpu_is_preempted.func == + __raw_callee_save___native_vcpu_is_preempted; +} + struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), .wait = paravirt_nop, .kick = paravirt_nop, + .vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted), #endif /* SMP */ }; EXPORT_SYMBOL(pv_lock_ops); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index bbf3d5933eaa..a1bfba0f7234 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -328,7 +328,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .cpuid = native_cpuid, .get_debugreg = native_get_debugreg, .set_debugreg = native_set_debugreg, - .clts = native_clts, .read_cr0 = native_read_cr0, .write_cr0 = native_write_cr0, .read_cr4 = native_read_cr4, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 920c6ae08592..d33ef165b1f8 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -8,10 +8,10 @@ DEF_NATIVE(pv_cpu_ops, iret, "iret"); DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); -DEF_NATIVE(pv_cpu_ops, clts, "clts"); #if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); +DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax"); #endif unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) @@ -27,6 +27,7 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) } extern bool pv_is_native_spin_unlock(void); +extern bool pv_is_native_vcpu_is_preempted(void); unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) @@ -48,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); - PATCH_SITE(pv_cpu_ops, clts); #if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): if (pv_is_native_spin_unlock()) { @@ -56,9 +56,19 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, end = end_pv_lock_ops_queued_spin_unlock; goto patch_site; } + goto patch_default; + + case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted): + if (pv_is_native_vcpu_is_preempted()) { + start = start_pv_lock_ops_vcpu_is_preempted; + end = end_pv_lock_ops_vcpu_is_preempted; + goto patch_site; + } + goto patch_default; #endif default: +patch_default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index bb3840cedb4f..f4fcf26c9fce 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); -DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); @@ -21,6 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax"); #if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)"); +DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax"); #endif unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) @@ -36,6 +36,7 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) } extern bool pv_is_native_spin_unlock(void); +extern bool pv_is_native_vcpu_is_preempted(void); unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) @@ -58,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); - PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); #if defined(CONFIG_PARAVIRT_SPINLOCKS) @@ -68,9 +68,19 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, end = end_pv_lock_ops_queued_spin_unlock; goto patch_site; } + goto patch_default; + + case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted): + if (pv_is_native_vcpu_is_preempted()) { + start = start_pv_lock_ops_vcpu_is_preempted; + end = end_pv_lock_ops_vcpu_is_preempted; + goto patch_site; + } + goto patch_default; #endif default: +patch_default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0888a879120f..37363e46b1f0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -23,7 +23,6 @@ #include <asm/cpu.h> #include <asm/apic.h> #include <asm/syscalls.h> -#include <asm/idle.h> #include <asm/uaccess.h> #include <asm/mwait.h> #include <asm/fpu/internal.h> @@ -65,23 +64,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { }; EXPORT_PER_CPU_SYMBOL(cpu_tss); -#ifdef CONFIG_X86_64 -static DEFINE_PER_CPU(unsigned char, is_idle); -static ATOMIC_NOTIFIER_HEAD(idle_notifier); - -void idle_notifier_register(struct notifier_block *n) -{ - atomic_notifier_chain_register(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_register); - -void idle_notifier_unregister(struct notifier_block *n) -{ - atomic_notifier_chain_unregister(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_unregister); -#endif - /* * this gets called so that we can store lazy state into memory and copy the * current task into the new thread. @@ -251,39 +233,10 @@ static inline void play_dead(void) } #endif -#ifdef CONFIG_X86_64 -void enter_idle(void) -{ - this_cpu_write(is_idle, 1); - atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); -} - -static void __exit_idle(void) -{ - if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) - return; - atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); -} - -/* Called from interrupts to signify idle end */ -void exit_idle(void) -{ - /* idle loop has pid 0 */ - if (current->pid) - return; - __exit_idle(); -} -#endif - void arch_cpu_idle_enter(void) { + tsc_verify_tsc_adjust(false); local_touch_nmi(); - enter_idle(); -} - -void arch_cpu_idle_exit(void) -{ - __exit_idle(); } void arch_cpu_idle_dead(void) @@ -336,59 +289,33 @@ void stop_this_cpu(void *dummy) halt(); } -bool amd_e400_c1e_detected; -EXPORT_SYMBOL(amd_e400_c1e_detected); - -static cpumask_var_t amd_e400_c1e_mask; - -void amd_e400_remove_cpu(int cpu) -{ - if (amd_e400_c1e_mask != NULL) - cpumask_clear_cpu(cpu, amd_e400_c1e_mask); -} - /* - * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt - * pending message MSR. If we detect C1E, then we handle it the same - * way as C3 power states (local apic timer and TSC stop) + * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power + * states (local apic timer and TSC stop). */ static void amd_e400_idle(void) { - if (!amd_e400_c1e_detected) { - u32 lo, hi; - - rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); - - if (lo & K8_INTP_C1E_ACTIVE_MASK) { - amd_e400_c1e_detected = true; - if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) - mark_tsc_unstable("TSC halt in AMD C1E"); - pr_info("System has AMD C1E enabled\n"); - } + /* + * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E + * gets set after static_cpu_has() places have been converted via + * alternatives. + */ + if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { + default_idle(); + return; } - if (amd_e400_c1e_detected) { - int cpu = smp_processor_id(); + tick_broadcast_enter(); - if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) { - cpumask_set_cpu(cpu, amd_e400_c1e_mask); - /* Force broadcast so ACPI can not interfere. */ - tick_broadcast_force(); - pr_info("Switch to broadcast mode on CPU%d\n", cpu); - } - tick_broadcast_enter(); - - default_idle(); + default_idle(); - /* - * The switch back from broadcast mode needs to be - * called with interrupts disabled. - */ - local_irq_disable(); - tick_broadcast_exit(); - local_irq_enable(); - } else - default_idle(); + /* + * The switch back from broadcast mode needs to be called with + * interrupts disabled. + */ + local_irq_disable(); + tick_broadcast_exit(); + local_irq_enable(); } /* @@ -448,8 +375,7 @@ void select_idle_routine(const struct cpuinfo_x86 *c) if (x86_idle || boot_option_idle_override == IDLE_POLL) return; - if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) { - /* E400: APIC timer interrupt does not wake up CPU from C1e */ + if (boot_cpu_has_bug(X86_BUG_AMD_E400)) { pr_info("using AMD E400 aware idle routine\n"); x86_idle = amd_e400_idle; } else if (prefer_mwait_c1_over_halt(c)) { @@ -459,11 +385,37 @@ void select_idle_routine(const struct cpuinfo_x86 *c) x86_idle = default_idle; } -void __init init_amd_e400_c1e_mask(void) +void amd_e400_c1e_apic_setup(void) +{ + if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { + pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id()); + local_irq_disable(); + tick_broadcast_force(); + local_irq_enable(); + } +} + +void __init arch_post_acpi_subsys_init(void) { - /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */ - if (x86_idle == amd_e400_idle) - zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL); + u32 lo, hi; + + if (!boot_cpu_has_bug(X86_BUG_AMD_E400)) + return; + + /* + * AMD E400 detection needs to happen after ACPI has been enabled. If + * the machine is affected K8_INTP_C1E_ACTIVE_MASK bits are set in + * MSR_K8_INT_PENDING_MSG. + */ + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (!(lo & K8_INTP_C1E_ACTIVE_MASK)) + return; + + boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E); + + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + mark_tsc_unstable("TSC halt in AMD C1E"); + pr_info("System has AMD C1E enabled\n"); } static int __init idle_setup(char *str) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index bd7be8efdc4c..d0d744108594 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -49,7 +49,6 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> -#include <asm/idle.h> #include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> @@ -72,10 +71,9 @@ void __show_regs(struct pt_regs *regs, int all) savesegment(gs, gs); } - printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", - (u16)regs->cs, regs->ip, regs->flags, - smp_processor_id()); - print_symbol("EIP is at %s\n", regs->ip); + printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip); + printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags, + smp_processor_id()); printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); @@ -232,11 +230,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - fpu_switch_t fpu_switch; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); + switch_fpu_prepare(prev_fpu, cpu); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -295,7 +292,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) lazy_load_gs(next->gs); - switch_fpu_finish(next_fpu, fpu_switch); + switch_fpu_finish(next_fpu, cpu); this_cpu_write(current_task, next_p); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b3760b3c1ca0..a76b65e3e615 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -44,7 +44,6 @@ #include <asm/desc.h> #include <asm/proto.h> #include <asm/ia32.h> -#include <asm/idle.h> #include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> @@ -61,10 +60,15 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); - printk_address(regs->ip); - printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, - regs->sp, regs->flags); + printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff, + (void *)regs->ip); + printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, + regs->sp, regs->flags); + if (regs->orig_ax != -1) + pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); + else + pr_cont("\n"); + printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->ax, regs->bx, regs->cx); printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", @@ -265,9 +269,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); unsigned prev_fsindex, prev_gsindex; - fpu_switch_t fpu_switch; - fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); + switch_fpu_prepare(prev_fpu, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). @@ -417,7 +420,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) prev->gsbase = 0; prev->gsindex = prev_gsindex; - switch_fpu_finish(next_fpu, fpu_switch); + switch_fpu_finish(next_fpu, cpu); /* * Switch the PDA and FPU contexts. diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 51402a7e4ca6..0bee04d41bed 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -625,8 +625,6 @@ static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev) DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3, amd_disable_seq_and_redirect_scrub); -#endif - #if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) #include <linux/jump_label.h> #include <asm/string_64.h> @@ -657,3 +655,4 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); #endif +#endif diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 79c6311cd912..5b21cb7d84d6 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -64,6 +64,15 @@ void mach_get_cmos_time(struct timespec *now) unsigned int status, year, mon, day, hour, min, sec, century = 0; unsigned long flags; + /* + * If pm_trace abused the RTC as storage, set the timespec to 0, + * which tells the caller that this RTC value is unusable. + */ + if (!pm_trace_rtc_valid()) { + now->tv_sec = now->tv_nsec = 0; + return; + } + spin_lock_irqsave(&rtc_lock, flags); /* diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bbfbca5fea0c..4cfba947d774 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -985,6 +985,30 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Memory used by the kernel cannot be hot-removed because Linux + * cannot migrate the kernel pages. When memory hotplug is + * enabled, we should prevent memblock from allocating memory + * for the kernel. + * + * ACPI SRAT records all hotpluggable memory ranges. But before + * SRAT is parsed, we don't know about it. + * + * The kernel image is loaded into memory at very early time. We + * cannot prevent this anyway. So on NUMA system, we set any + * node the kernel resides in as un-hotpluggable. + * + * Since on modern servers, one node could have double-digit + * gigabytes memory, we can assume the memory around the kernel + * image is also un-hotpluggable. So before SRAT is parsed, just + * allocate memory near the kernel image to try the best to keep + * the kernel away from hotpluggable memory. + */ + if (movable_node_is_enabled()) + memblock_set_bottom_up(true); +#endif + x86_report_nx(); /* after early param, so could get panic from serial */ @@ -1221,11 +1245,16 @@ void __init setup_arch(char **cmdline_p) */ get_smp_config(); + /* + * Systems w/o ACPI and mptables might not have it mapped the local + * APIC yet, but prefill_possible_map() might need to access it. + */ + init_apic_mappings(); + prefill_possible_map(); init_cpu_to_node(); - init_apic_mappings(); io_apic_init_mappings(); kvm_guest_init(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2bbd27f89802..9820d6d977c6 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -1,7 +1,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/init.h> #include <linux/bootmem.h> #include <linux/percpu.h> @@ -12,6 +12,7 @@ #include <linux/pfn.h> #include <asm/sections.h> #include <asm/processor.h> +#include <asm/desc.h> #include <asm/setup.h> #include <asm/mpspec.h> #include <asm/apicdef.h> diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 40df33753bae..ec1f756f9dc9 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -105,9 +105,6 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) /* Don't let flags to be set from userspace */ act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); - if (user_64bit_mode(current_pt_regs())) - return; - if (in_ia32_syscall()) act->sa.sa_flags |= SA_IA32_ABI; if (in_x32_syscall()) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index c00cb64bc0a1..68f8cc222f25 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -261,10 +261,8 @@ static inline void __smp_reschedule_interrupt(void) __visible void smp_reschedule_interrupt(struct pt_regs *regs) { - irq_enter(); ack_APIC_irq(); __smp_reschedule_interrupt(); - irq_exit(); /* * KVM uses this interrupt to force a cpu out of guest mode */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 951f093a96fe..46732dc3b73c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -58,7 +58,6 @@ #include <asm/desc.h> #include <asm/nmi.h> #include <asm/irq.h> -#include <asm/idle.h> #include <asm/realmode.h> #include <asm/cpu.h> #include <asm/numa.h> @@ -104,11 +103,21 @@ static unsigned int max_physical_pkg_id __read_mostly; unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); static unsigned int logical_packages __read_mostly; -static bool logical_packages_frozen __read_mostly; /* Maximum number of SMT threads on any online core */ int __max_smt_threads __read_mostly; +/* Flag to indicate if a complete sched domain rebuild is required */ +bool x86_topology_update; + +int arch_update_cpu_topology(void) +{ + int retval = x86_topology_update; + + x86_topology_update = false; + return retval; +} + static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { unsigned long flags; @@ -263,9 +272,14 @@ static void notrace start_secondary(void *unused) cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } -int topology_update_package_map(unsigned int apicid, unsigned int cpu) +/** + * topology_update_package_map - Update the physical to logical package map + * @pkg: The physical package id as retrieved via CPUID + * @cpu: The cpu for which this is updated + */ +int topology_update_package_map(unsigned int pkg, unsigned int cpu) { - unsigned int new, pkg = apicid >> boot_cpu_data.x86_coreid_bits; + unsigned int new; /* Called from early boot ? */ if (!physical_package_map) @@ -278,16 +292,17 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu) if (test_and_set_bit(pkg, physical_package_map)) goto found; - if (logical_packages_frozen) { - physical_to_logical_pkg[pkg] = -1; - pr_warn("APIC(%x) Package %u exceeds logical package max\n", - apicid, pkg); + if (logical_packages >= __max_logical_packages) { + pr_warn("Package %u of CPU %u exceeds BIOS package data %u.\n", + logical_packages, cpu, __max_logical_packages); return -ENOSPC; } new = logical_packages++; - pr_info("APIC(%x) Converting physical %u to logical package %u\n", - apicid, pkg, new); + if (new != pkg) { + pr_info("CPU %u Converting physical %u to logical package %u\n", + cpu, pkg, new); + } physical_to_logical_pkg[pkg] = new; found: @@ -308,9 +323,9 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg) } EXPORT_SYMBOL(topology_phys_to_logical_pkg); -static void __init smp_init_package_map(void) +static void __init smp_init_package_map(struct cpuinfo_x86 *c, unsigned int cpu) { - unsigned int ncpus, cpu; + unsigned int ncpus; size_t size; /* @@ -355,27 +370,9 @@ static void __init smp_init_package_map(void) size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long); physical_package_map = kzalloc(size, GFP_KERNEL); - for_each_present_cpu(cpu) { - unsigned int apicid = apic->cpu_present_to_apicid(cpu); - - if (apicid == BAD_APICID || !apic->apic_id_valid(apicid)) - continue; - if (!topology_update_package_map(apicid, cpu)) - continue; - pr_warn("CPU %u APICId %x disabled\n", cpu, apicid); - per_cpu(x86_bios_cpu_apicid, cpu) = BAD_APICID; - set_cpu_possible(cpu, false); - set_cpu_present(cpu, false); - } - - if (logical_packages > __max_logical_packages) { - pr_warn("Detected more packages (%u), then computed by BIOS data (%u).\n", - logical_packages, __max_logical_packages); - logical_packages_frozen = true; - __max_logical_packages = logical_packages; - } - pr_info("Max logical packages: %u\n", __max_logical_packages); + + topology_update_package_map(c->phys_proc_id, cpu); } void __init smp_store_boot_cpu_info(void) @@ -385,7 +382,7 @@ void __init smp_store_boot_cpu_info(void) *c = boot_cpu_data; c->cpu_index = id; - smp_init_package_map(); + smp_init_package_map(c, id); } /* @@ -471,22 +468,42 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } +#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) +static inline int x86_sched_itmt_flags(void) +{ + return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0; +} + +#ifdef CONFIG_SCHED_MC +static int x86_core_flags(void) +{ + return cpu_core_flags() | x86_sched_itmt_flags(); +} +#endif +#ifdef CONFIG_SCHED_SMT +static int x86_smt_flags(void) +{ + return cpu_smt_flags() | x86_sched_itmt_flags(); +} +#endif +#endif + static struct sched_domain_topology_level x86_numa_in_package_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif { NULL, }, }; static struct sched_domain_topology_level x86_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif { cpu_cpu_mask, SD_INIT_NAME(DIE) }, { NULL, }, @@ -821,14 +838,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } -void smp_announce(void) -{ - int num_nodes = num_online_nodes(); - - printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n", - num_nodes, (num_nodes > 1 ? "s" : ""), num_online_cpus()); -} - /* reduce the number of lines printed when booting a large cpu count system */ static void announce_cpu(int cpu, int apicid) { @@ -964,9 +973,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) int cpu0_nmi_registered = 0; unsigned long timeout; - idle->thread.sp = (unsigned long) (((struct pt_regs *) - (THREAD_SIZE + task_stack_page(idle))) - 1); - + idle->thread.sp = (unsigned long)task_pt_regs(idle); early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; initial_stack = idle->thread.sp; @@ -1111,7 +1118,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) return err; /* the FPU context is blank, nobody can own it */ - __cpu_disable_lazy_restore(cpu); + per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; common_cpu_up(cpu, tidle); @@ -1331,7 +1338,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) default_setup_apic_routing(); cpu0_logical_apicid = apic_bsp_setup(false); - pr_info("CPU%d: ", 0); + pr_info("CPU0: "); print_cpu_info(&cpu_data(0)); if (is_uv_system()) @@ -1409,15 +1416,17 @@ __init void prefill_possible_map(void) /* No boot processor was found in mptable or ACPI MADT */ if (!num_processors) { - int apicid = boot_cpu_physical_apicid; - int cpu = hard_smp_processor_id(); + if (boot_cpu_has(X86_FEATURE_APIC)) { + int apicid = boot_cpu_physical_apicid; + int cpu = hard_smp_processor_id(); - pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu); + pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu); - /* Make sure boot cpu is enumerated */ - if (apic->cpu_present_to_apicid(0) == BAD_APICID && - apic->apic_id_valid(apicid)) - generic_processor_info(apicid, boot_cpu_apic_version); + /* Make sure boot cpu is enumerated */ + if (apic->cpu_present_to_apicid(0) == BAD_APICID && + apic->apic_id_valid(apicid)) + generic_processor_info(apicid, boot_cpu_apic_version); + } if (!num_processors) num_processors = 1; @@ -1454,15 +1463,15 @@ __init void prefill_possible_map(void) possible = i; } + nr_cpu_ids = possible; + pr_info("Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); + reset_cpu_possible_mask(); + for (i = 0; i < possible; i++) set_cpu_possible(i, true); - for (; i < NR_CPUS; i++) - set_cpu_possible(i, false); - - nr_cpu_ids = possible; } #ifdef CONFIG_HOTPLUG_CPU @@ -1573,7 +1582,6 @@ void play_dead_common(void) { idle_task_exit(); reset_lazy_tlbstate(); - amd_e400_remove_cpu(raw_smp_processor_id()); /* Ack it */ (void)cpu_report_death(); diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c index 764a29f84de7..85195d447a92 100644 --- a/arch/x86/kernel/sysfb_simplefb.c +++ b/arch/x86/kernel/sysfb_simplefb.c @@ -66,13 +66,36 @@ __init int create_simplefb(const struct screen_info *si, { struct platform_device *pd; struct resource res; - unsigned long len; + u64 base, size; + u32 length; - /* don't use lfb_size as it may contain the whole VMEM instead of only - * the part that is occupied by the framebuffer */ - len = mode->height * mode->stride; - len = PAGE_ALIGN(len); - if (len > (u64)si->lfb_size << 16) { + /* + * If the 64BIT_BASE capability is set, ext_lfb_base will contain the + * upper half of the base address. Assemble the address, then make sure + * it is valid and we can actually access it. + */ + base = si->lfb_base; + if (si->capabilities & VIDEO_CAPABILITY_64BIT_BASE) + base |= (u64)si->ext_lfb_base << 32; + if (!base || (u64)(resource_size_t)base != base) { + printk(KERN_DEBUG "sysfb: inaccessible VRAM base\n"); + return -EINVAL; + } + + /* + * Don't use lfb_size as IORESOURCE size, since it may contain the + * entire VMEM, and thus require huge mappings. Use just the part we + * need, that is, the part where the framebuffer is located. But verify + * that it does not exceed the advertised VMEM. + * Note that in case of VBE, the lfb_size is shifted by 16 bits for + * historical reasons. + */ + size = si->lfb_size; + if (si->orig_video_isVGA == VIDEO_TYPE_VLFB) + size <<= 16; + length = mode->height * mode->stride; + length = PAGE_ALIGN(length); + if (length > size) { printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n"); return -EINVAL; } @@ -81,8 +104,8 @@ __init int create_simplefb(const struct screen_info *si, memset(&res, 0, sizeof(res)); res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; res.name = simplefb_resname; - res.start = si->lfb_base; - res.end = si->lfb_base + len - 1; + res.start = base; + res.end = res.start + length - 1; if (res.end <= res.start) return -EINVAL; diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 1c113db9ed57..15515132bf0d 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -34,7 +34,7 @@ static void switch_idt(void *arg) local_irq_restore(flags); } -void trace_irq_vector_regfunc(void) +int trace_irq_vector_regfunc(void) { mutex_lock(&irq_vector_mutex); if (!trace_irq_vector_refcount) { @@ -44,6 +44,7 @@ void trace_irq_vector_regfunc(void) } trace_irq_vector_refcount++; mutex_unlock(&irq_vector_mutex); + return 0; } void trace_irq_vector_unregfunc(void) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bd4e3d4d3625..bf0c6d049080 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -853,6 +853,8 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { + unsigned long cr0; + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_MATH_EMULATION @@ -866,10 +868,20 @@ do_device_not_available(struct pt_regs *regs, long error_code) return; } #endif - fpu__restore(¤t->thread.fpu); /* interrupts still off */ -#ifdef CONFIG_X86_32 - cond_local_irq_enable(regs); -#endif + + /* This should not happen. */ + cr0 = read_cr0(); + if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) { + /* Try to fix it up and carry on. */ + write_cr0(cr0 & ~X86_CR0_TS); + } else { + /* + * Something terrible happened, and we're better off trying + * to kill the task than getting stuck in a never-ending + * loop of #NM faults. + */ + die("unexpected #NM exception", regs, error_code); + } } NOKPROBE_SYMBOL(do_device_not_available); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 46b2f41f8b05..0aed75a1e31b 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -702,6 +702,20 @@ unsigned long native_calibrate_tsc(void) } } + /* + * TSC frequency determined by CPUID is a "hardware reported" + * frequency and is the most accurate one so far we have. This + * is considered a known frequency. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + + /* + * For Atom SoCs TSC is the only reliable clocksource. + * Mark TSC reliable so no watchdog on it. + */ + if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT) + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + return crystal_khz * ebx_numerator / eax_denominator; } @@ -1043,18 +1057,20 @@ static void detect_art(void) if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) return; - cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, - &art_to_tsc_numerator, unused, unused+1); - - /* Don't enable ART in a VM, non-stop TSC required */ + /* Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || - art_to_tsc_denominator < ART_MIN_DENOMINATOR) + !boot_cpu_has(X86_FEATURE_TSC_ADJUST)) return; - if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset)) + cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, + &art_to_tsc_numerator, unused, unused+1); + + if (art_to_tsc_denominator < ART_MIN_DENOMINATOR) return; + rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset); + /* Make this sticky over multiple CPU init calls */ setup_force_cpu_cap(X86_FEATURE_ART); } @@ -1064,6 +1080,11 @@ static void detect_art(void) static struct clocksource clocksource_tsc; +static void tsc_resume(struct clocksource *cs) +{ + tsc_verify_tsc_adjust(true); +} + /* * We used to compare the TSC to the cycle_last value in the clocksource * structure to avoid a nasty time-warp. This can be observed in a @@ -1096,6 +1117,7 @@ static struct clocksource clocksource_tsc = { .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, .archdata = { .vclock_mode = VCLOCK_TSC }, + .resume = tsc_resume, }; void mark_tsc_unstable(char *reason) @@ -1283,10 +1305,10 @@ static int __init init_tsc_clocksource(void) clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; /* - * Trust the results of the earlier calibration on systems - * exporting a reliable TSC. + * When TSC frequency is known (retrieved via MSR or CPUID), we skip + * the refined calibration and directly register it as a clocksource. */ - if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { clocksource_register_khz(&clocksource_tsc, tsc_khz); return 0; } @@ -1363,6 +1385,8 @@ void __init tsc_init(void) if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); + else + tsc_store_and_check_tsc_adjust(true); check_system_tsc_reliable(); diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 0fe720d64fef..19afdbd7d0a7 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -100,5 +100,24 @@ unsigned long cpu_khz_from_msr(void) #ifdef CONFIG_X86_LOCAL_APIC lapic_timer_frequency = (freq * 1000) / HZ; #endif + + /* + * TSC frequency determined by MSR is always considered "known" + * because it is reported by HW. + * Another fact is that on MSR capable platforms, PIT/HPET is + * generally not available so calibration won't work at all. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + + /* + * Unfortunately there is no way for hardware to tell whether the + * TSC is reliable. We were told by silicon design team that TSC + * on Atom SoCs are always "reliable". TSC is also the only + * reliable clocksource on these SoCs (HPET is either not present + * or not functional) so mark TSC reliable which removes the + * requirement for a watchdog clocksource. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + return res; } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 78083bf23ed1..d0db011051a5 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -14,18 +14,166 @@ * ( The serial nature of the boot logic and the CPU hotplug lock * protects against more than 2 CPUs entering this code. ) */ +#include <linux/topology.h> #include <linux/spinlock.h> #include <linux/kernel.h> #include <linux/smp.h> #include <linux/nmi.h> #include <asm/tsc.h> +struct tsc_adjust { + s64 bootval; + s64 adjusted; + unsigned long nextcheck; + bool warned; +}; + +static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust); + +void tsc_verify_tsc_adjust(bool resume) +{ + struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust); + s64 curval; + + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return; + + /* Rate limit the MSR check */ + if (!resume && time_before(jiffies, adj->nextcheck)) + return; + + adj->nextcheck = jiffies + HZ; + + rdmsrl(MSR_IA32_TSC_ADJUST, curval); + if (adj->adjusted == curval) + return; + + /* Restore the original value */ + wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted); + + if (!adj->warned || resume) { + pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n", + smp_processor_id(), adj->adjusted, curval); + adj->warned = true; + } +} + +static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, + unsigned int cpu, bool bootcpu) +{ + /* + * First online CPU in a package stores the boot value in the + * adjustment value. This value might change later via the sync + * mechanism. If that fails we still can yell about boot values not + * being consistent. + * + * On the boot cpu we just force set the ADJUST value to 0 if it's + * non zero. We don't do that on non boot cpus because physical + * hotplug should have set the ADJUST register to a value > 0 so + * the TSC is in sync with the already running cpus. + * + * But we always force positive ADJUST values. Otherwise the TSC + * deadline timer creates an interrupt storm. We also have to + * prevent values > 0x7FFFFFFF as those wreckage the timer as well. + */ + if ((bootcpu && bootval != 0) || (!bootcpu && bootval < 0) || + (bootval > 0x7FFFFFFF)) { + pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu, + bootval); + wrmsrl(MSR_IA32_TSC_ADJUST, 0); + bootval = 0; + } + cur->adjusted = bootval; +} + +#ifndef CONFIG_SMP +bool __init tsc_store_and_check_tsc_adjust(bool bootcpu) +{ + struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust); + s64 bootval; + + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return false; + + rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + cur->bootval = bootval; + cur->nextcheck = jiffies + HZ; + tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu); + return false; +} + +#else /* !CONFIG_SMP */ + +/* + * Store and check the TSC ADJUST MSR if available + */ +bool tsc_store_and_check_tsc_adjust(bool bootcpu) +{ + struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust); + unsigned int refcpu, cpu = smp_processor_id(); + struct cpumask *mask; + s64 bootval; + + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return false; + + rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + cur->bootval = bootval; + cur->nextcheck = jiffies + HZ; + cur->warned = false; + + /* + * Check whether this CPU is the first in a package to come up. In + * this case do not check the boot value against another package + * because the new package might have been physically hotplugged, + * where TSC_ADJUST is expected to be different. When called on the + * boot CPU topology_core_cpumask() might not be available yet. + */ + mask = topology_core_cpumask(cpu); + refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids; + + if (refcpu >= nr_cpu_ids) { + tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), + bootcpu); + return false; + } + + ref = per_cpu_ptr(&tsc_adjust, refcpu); + /* + * Compare the boot value and complain if it differs in the + * package. + */ + if (bootval != ref->bootval) { + pr_warn(FW_BUG "TSC ADJUST differs: Reference CPU%u: %lld CPU%u: %lld\n", + refcpu, ref->bootval, cpu, bootval); + } + /* + * The TSC_ADJUST values in a package must be the same. If the boot + * value on this newly upcoming CPU differs from the adjustment + * value of the already online CPU in this package, set it to that + * adjusted value. + */ + if (bootval != ref->adjusted) { + pr_warn("TSC ADJUST synchronize: Reference CPU%u: %lld CPU%u: %lld\n", + refcpu, ref->adjusted, cpu, bootval); + cur->adjusted = ref->adjusted; + wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted); + } + /* + * We have the TSCs forced to be in sync on this package. Skip sync + * test: + */ + return true; +} + /* * Entry/exit counters that make sure that both CPUs * run the measurement code at once: */ static atomic_t start_count; static atomic_t stop_count; +static atomic_t skip_test; +static atomic_t test_runs; /* * We use a raw spinlock in this exceptional case, because @@ -37,15 +185,16 @@ static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED; static cycles_t last_tsc; static cycles_t max_warp; static int nr_warps; +static int random_warps; /* * TSC-warp measurement loop running on both CPUs. This is not called * if there is no TSC. */ -static void check_tsc_warp(unsigned int timeout) +static cycles_t check_tsc_warp(unsigned int timeout) { - cycles_t start, now, prev, end; - int i; + cycles_t start, now, prev, end, cur_max_warp = 0; + int i, cur_warps = 0; start = rdtsc_ordered(); /* @@ -85,13 +234,22 @@ static void check_tsc_warp(unsigned int timeout) if (unlikely(prev > now)) { arch_spin_lock(&sync_lock); max_warp = max(max_warp, prev - now); + cur_max_warp = max_warp; + /* + * Check whether this bounces back and forth. Only + * one CPU should observe time going backwards. + */ + if (cur_warps != nr_warps) + random_warps++; nr_warps++; + cur_warps = nr_warps; arch_spin_unlock(&sync_lock); } } WARN(!(now-start), "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", now-start, end-start); + return cur_max_warp; } /* @@ -136,15 +294,26 @@ void check_tsc_sync_source(int cpu) } /* - * Reset it - in case this is a second bootup: + * Set the maximum number of test runs to + * 1 if the CPU does not provide the TSC_ADJUST MSR + * 3 if the MSR is available, so the target can try to adjust */ - atomic_set(&stop_count, 0); - + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + atomic_set(&test_runs, 1); + else + atomic_set(&test_runs, 3); +retry: /* - * Wait for the target to arrive: + * Wait for the target to start or to skip the test: */ - while (atomic_read(&start_count) != cpus-1) + while (atomic_read(&start_count) != cpus - 1) { + if (atomic_read(&skip_test) > 0) { + atomic_set(&skip_test, 0); + return; + } cpu_relax(); + } + /* * Trigger the target to continue into the measurement too: */ @@ -155,21 +324,35 @@ void check_tsc_sync_source(int cpu) while (atomic_read(&stop_count) != cpus-1) cpu_relax(); - if (nr_warps) { + /* + * If the test was successful set the number of runs to zero and + * stop. If not, decrement the number of runs an check if we can + * retry. In case of random warps no retry is attempted. + */ + if (!nr_warps) { + atomic_set(&test_runs, 0); + + pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", + smp_processor_id(), cpu); + + } else if (atomic_dec_and_test(&test_runs) || random_warps) { + /* Force it to 0 if random warps brought us here */ + atomic_set(&test_runs, 0); + pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n", smp_processor_id(), cpu); pr_warning("Measured %Ld cycles TSC warp between CPUs, " "turning off TSC clock.\n", max_warp); + if (random_warps) + pr_warning("TSC warped randomly between CPUs\n"); mark_tsc_unstable("check_tsc_sync_source failed"); - } else { - pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", - smp_processor_id(), cpu); } /* * Reset it - just in case we boot another CPU later: */ atomic_set(&start_count, 0); + random_warps = 0; nr_warps = 0; max_warp = 0; last_tsc = 0; @@ -178,6 +361,12 @@ void check_tsc_sync_source(int cpu) * Let the target continue with the bootup: */ atomic_inc(&stop_count); + + /* + * Retry, if there is a chance to do so. + */ + if (atomic_read(&test_runs) > 0) + goto retry; } /* @@ -185,6 +374,9 @@ void check_tsc_sync_source(int cpu) */ void check_tsc_sync_target(void) { + struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust); + unsigned int cpu = smp_processor_id(); + cycles_t cur_max_warp, gbl_max_warp; int cpus = 2; /* Also aborts if there is no TSC. */ @@ -192,6 +384,16 @@ void check_tsc_sync_target(void) return; /* + * Store, verify and sanitize the TSC adjust register. If + * successful skip the test. + */ + if (tsc_store_and_check_tsc_adjust(false)) { + atomic_inc(&skip_test); + return; + } + +retry: + /* * Register this CPU's participation and wait for the * source CPU to start the measurement: */ @@ -199,7 +401,12 @@ void check_tsc_sync_target(void) while (atomic_read(&start_count) != cpus) cpu_relax(); - check_tsc_warp(loop_timeout(smp_processor_id())); + cur_max_warp = check_tsc_warp(loop_timeout(cpu)); + + /* + * Store the maximum observed warp value for a potential retry: + */ + gbl_max_warp = max_warp; /* * Ok, we are done: @@ -211,4 +418,61 @@ void check_tsc_sync_target(void) */ while (atomic_read(&stop_count) != cpus) cpu_relax(); + + /* + * Reset it for the next sync test: + */ + atomic_set(&stop_count, 0); + + /* + * Check the number of remaining test runs. If not zero, the test + * failed and a retry with adjusted TSC is possible. If zero the + * test was either successful or failed terminally. + */ + if (!atomic_read(&test_runs)) + return; + + /* + * If the warp value of this CPU is 0, then the other CPU + * observed time going backwards so this TSC was ahead and + * needs to move backwards. + */ + if (!cur_max_warp) + cur_max_warp = -gbl_max_warp; + + /* + * Add the result to the previous adjustment value. + * + * The adjustement value is slightly off by the overhead of the + * sync mechanism (observed values are ~200 TSC cycles), but this + * really depends on CPU, node distance and frequency. So + * compensating for this is hard to get right. Experiments show + * that the warp is not longer detectable when the observed warp + * value is used. In the worst case the adjustment needs to go + * through a 3rd run for fine tuning. + */ + cur->adjusted += cur_max_warp; + + /* + * TSC deadline timer stops working or creates an interrupt storm + * with adjust values < 0 and > x07ffffff. + * + * To allow adjust values > 0x7FFFFFFF we need to disable the + * deadline timer and use the local APIC timer, but that requires + * more intrusive changes and we do not have any useful information + * from Intel about the underlying HW wreckage yet. + */ + if (cur->adjusted < 0) + cur->adjusted = 0; + if (cur->adjusted > 0x7FFFFFFF) + cur->adjusted = 0x7FFFFFFF; + + pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n", + cpu, cur_max_warp, cur->adjusted); + + wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted); + goto retry; + } + +#endif /* CONFIG_SMP */ diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index a2456d4d286a..ea7b7f9a3b9e 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -14,13 +14,55 @@ unsigned long unwind_get_return_address(struct unwind_state *state) if (unwind_done(state)) return 0; + if (state->regs && user_mode(state->regs)) + return 0; + addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p, addr_p); - return __kernel_text_address(addr) ? addr : 0; + if (!__kernel_text_address(addr)) { + printk_deferred_once(KERN_WARNING + "WARNING: unrecognized kernel stack return address %p at %p in %s:%d\n", + (void *)addr, addr_p, state->task->comm, + state->task->pid); + return 0; + } + + return addr; } EXPORT_SYMBOL_GPL(unwind_get_return_address); +static size_t regs_size(struct pt_regs *regs) +{ + /* x86_32 regs from kernel mode are two words shorter: */ + if (IS_ENABLED(CONFIG_X86_32) && !user_mode(regs)) + return sizeof(*regs) - 2*sizeof(long); + + return sizeof(*regs); +} + +static bool is_last_task_frame(struct unwind_state *state) +{ + unsigned long bp = (unsigned long)state->bp; + unsigned long regs = (unsigned long)task_pt_regs(state->task); + + return bp == regs - FRAME_HEADER_SIZE; +} + +/* + * This determines if the frame pointer actually contains an encoded pointer to + * pt_regs on the stack. See ENCODE_FRAME_POINTER. + */ +static struct pt_regs *decode_frame_pointer(unsigned long *bp) +{ + unsigned long regs = (unsigned long)bp; + + if (!(regs & 0x1)) + return NULL; + + return (struct pt_regs *)(regs & ~0x1); +} + static bool update_stack_state(struct unwind_state *state, void *addr, size_t len) { @@ -43,26 +85,117 @@ static bool update_stack_state(struct unwind_state *state, void *addr, bool unwind_next_frame(struct unwind_state *state) { - unsigned long *next_bp; + struct pt_regs *regs; + unsigned long *next_bp, *next_frame; + size_t next_len; + enum stack_type prev_type = state->stack_info.type; if (unwind_done(state)) return false; - next_bp = (unsigned long *)*state->bp; + /* have we reached the end? */ + if (state->regs && user_mode(state->regs)) + goto the_end; + + if (is_last_task_frame(state)) { + regs = task_pt_regs(state->task); + + /* + * kthreads (other than the boot CPU's idle thread) have some + * partial regs at the end of their stack which were placed + * there by copy_thread_tls(). But the regs don't have any + * useful information, so we can skip them. + * + * This user_mode() check is slightly broader than a PF_KTHREAD + * check because it also catches the awkward situation where a + * newly forked kthread transitions into a user task by calling + * do_execve(), which eventually clears PF_KTHREAD. + */ + if (!user_mode(regs)) + goto the_end; + + /* + * We're almost at the end, but not quite: there's still the + * syscall regs frame. Entry code doesn't encode the regs + * pointer for syscalls, so we have to set it manually. + */ + state->regs = regs; + state->bp = NULL; + return true; + } + + /* get the next frame pointer */ + if (state->regs) + next_bp = (unsigned long *)state->regs->bp; + else + next_bp = (unsigned long *)*state->bp; + + /* is the next frame pointer an encoded pointer to pt_regs? */ + regs = decode_frame_pointer(next_bp); + if (regs) { + next_frame = (unsigned long *)regs; + next_len = sizeof(*regs); + } else { + next_frame = next_bp; + next_len = FRAME_HEADER_SIZE; + } /* make sure the next frame's data is accessible */ - if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE)) - return false; + if (!update_stack_state(state, next_frame, next_len)) { + /* + * Don't warn on bad regs->bp. An interrupt in entry code + * might cause a false positive warning. + */ + if (state->regs) + goto the_end; + + goto bad_address; + } + + /* Make sure it only unwinds up and doesn't overlap the last frame: */ + if (state->stack_info.type == prev_type) { + if (state->regs && (void *)next_frame < (void *)state->regs + regs_size(state->regs)) + goto bad_address; + + if (state->bp && (void *)next_frame < (void *)state->bp + FRAME_HEADER_SIZE) + goto bad_address; + } /* move to the next frame */ - state->bp = next_bp; + if (regs) { + state->regs = regs; + state->bp = NULL; + } else { + state->bp = next_bp; + state->regs = NULL; + } + return true; + +bad_address: + if (state->regs) { + printk_deferred_once(KERN_WARNING + "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", + state->regs, state->task->comm, + state->task->pid, next_frame); + } else { + printk_deferred_once(KERN_WARNING + "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n", + state->bp, state->task->comm, + state->task->pid, next_frame); + } +the_end: + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; } EXPORT_SYMBOL_GPL(unwind_next_frame); void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { + unsigned long *bp, *frame; + size_t len; + memset(state, 0, sizeof(*state)); state->task = task; @@ -73,12 +206,22 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, } /* set up the starting stack frame */ - state->bp = get_frame_pointer(task, regs); + bp = get_frame_pointer(task, regs); + regs = decode_frame_pointer(bp); + if (regs) { + state->regs = regs; + frame = (unsigned long *)regs; + len = sizeof(*regs); + } else { + state->bp = bp; + frame = bp; + len = FRAME_HEADER_SIZE; + } /* initialize stack info and make sure the frame data is accessible */ - get_stack_info(state->bp, state->task, &state->stack_info, + get_stack_info(frame, state->task, &state->stack_info, &state->stack_mask); - update_stack_state(state, state->bp, FRAME_HEADER_SIZE); + update_stack_state(state, frame, len); /* * The caller can provide the address of the first frame directly diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index 9298993dc8b7..22881ddcbb9f 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -7,11 +7,15 @@ unsigned long unwind_get_return_address(struct unwind_state *state) { + unsigned long addr; + if (unwind_done(state)) return 0; + addr = READ_ONCE_NOCHECK(*state->sp); + return ftrace_graph_ret_addr(state->task, &state->graph_idx, - *state->sp, state->sp); + addr, state->sp); } EXPORT_SYMBOL_GPL(unwind_get_return_address); @@ -23,9 +27,12 @@ bool unwind_next_frame(struct unwind_state *state) return false; do { - for (state->sp++; state->sp < info->end; state->sp++) - if (__kernel_text_address(*state->sp)) + for (state->sp++; state->sp < info->end; state->sp++) { + unsigned long addr = READ_ONCE_NOCHECK(*state->sp); + + if (__kernel_text_address(addr)) return true; + } state->sp = info->next_sp; @@ -47,7 +54,14 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, get_stack_info(first_frame, state->task, &state->stack_info, &state->stack_mask); - if (!__kernel_text_address(*first_frame)) + /* + * The caller can provide the address of the first frame directly + * (first_frame) or indirectly (regs->sp) to indicate which stack frame + * to start unwinding at. Skip ahead until we reach it. + */ + if (!unwind_done(state) && + (!on_stack(&state->stack_info, first_frame, sizeof(long)) || + !__kernel_text_address(*first_frame))) unwind_next_frame(state); } EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index dbf67f64d5ec..e79f15f108a8 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -91,10 +91,10 @@ SECTIONS /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; + _stext = .; /* bootstrapping code */ HEAD_TEXT . = ALIGN(8); - _stext = .; TEXT_TEXT SCHED_TEXT CPUIDLE_TEXT diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index afa7bbb596cd..b2d3cf1ef54a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -16,7 +16,7 @@ #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> -#include <asm/fpu/internal.h> /* For use_eager_fpu. Ugh! */ +#include <asm/processor.h> #include <asm/user.h> #include <asm/fpu/xstate.h> #include "cpuid.h" @@ -65,6 +65,11 @@ u64 kvm_supported_xcr0(void) #define F(x) bit(X86_FEATURE_##x) +/* These are scattered features in cpufeatures.h. */ +#define KVM_CPUID_BIT_AVX512_4VNNIW 2 +#define KVM_CPUID_BIT_AVX512_4FMAPS 3 +#define KF(x) bit(KVM_CPUID_BIT_##x) + int kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -81,6 +86,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) best->ecx |= F(OSXSAVE); } + best->edx &= ~F(APIC); + if (vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE) + best->edx |= F(APIC); + if (apic) { if (best->ecx & F(TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; @@ -114,8 +123,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - if (use_eager_fpu()) - kvm_x86_ops->fpu_activate(vcpu); + kvm_x86_ops->fpu_activate(vcpu); /* * The existing code assumes virtual address is 48-bit in the canonical @@ -376,6 +384,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/; + /* cpuid 7.0.edx*/ + const u32 kvm_cpuid_7_0_edx_x86_features = + KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS); + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -458,12 +470,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* PKU is not yet implemented for shadow paging. */ if (!tdp_enabled) entry->ecx &= ~F(PKU); + entry->edx &= kvm_cpuid_7_0_edx_x86_features; + entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX); } else { entry->ebx = 0; entry->ecx = 0; + entry->edx = 0; } entry->eax = 0; - entry->edx = 0; break; } case 9: @@ -863,17 +877,17 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) } EXPORT_SYMBOL_GPL(kvm_cpuid); -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { - u32 function, eax, ebx, ecx, edx; + u32 eax, ebx, ecx, edx; - function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + eax = kvm_register_read(vcpu, VCPU_REGS_RAX); ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); kvm_register_write(vcpu, VCPU_REGS_RAX, eax); kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); kvm_register_write(vcpu, VCPU_REGS_RDX, edx); - kvm_x86_ops->skip_emulated_instruction(vcpu); + return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4e95d3eb2955..56628a44668b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -158,9 +158,11 @@ #define Src2GS (OpGS << Src2Shift) #define Src2Mask (OpMask << Src2Shift) #define Mmx ((u64)1 << 40) /* MMX Vector instruction */ +#define AlignMask ((u64)7 << 41) #define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ -#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */ -#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ +#define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */ +#define Avx ((u64)3 << 41) /* Advanced Vector Extensions */ +#define Aligned16 ((u64)4 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */ #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ @@ -446,6 +448,26 @@ FOP_END; FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET FOP_END; +/* + * XXX: inoutclob user must know where the argument is being expanded. + * Relying on CC_HAVE_ASM_GOTO would allow us to remove _fault. + */ +#define asm_safe(insn, inoutclob...) \ +({ \ + int _fault = 0; \ + \ + asm volatile("1:" insn "\n" \ + "2:\n" \ + ".pushsection .fixup, \"ax\"\n" \ + "3: movl $1, %[_fault]\n" \ + " jmp 2b\n" \ + ".popsection\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [_fault] "+qm"(_fault) inoutclob ); \ + \ + _fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \ +}) + static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, enum x86_intercept intercept, enum x86_intercept_stage stage) @@ -632,21 +654,26 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, * depending on whether they're AVX encoded or not. * * Also included is CMPXCHG16B which is not a vector instruction, yet it is - * subject to the same check. + * subject to the same check. FXSAVE and FXRSTOR are checked here too as their + * 512 bytes of data must be aligned to a 16 byte boundary. */ -static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size) +static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size) { - if (likely(size < 16)) - return false; + u64 alignment = ctxt->d & AlignMask; - if (ctxt->d & Aligned) - return true; - else if (ctxt->d & Unaligned) - return false; - else if (ctxt->d & Avx) - return false; - else - return true; + if (likely(size < 16)) + return 1; + + switch (alignment) { + case Unaligned: + case Avx: + return 1; + case Aligned16: + return 16; + case Aligned: + default: + return size; + } } static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, @@ -704,7 +731,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, } break; } - if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0)) + if (la & (insn_alignment(ctxt, size) - 1)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; bad: @@ -2105,16 +2132,10 @@ static int em_iret(struct x86_emulate_ctxt *ctxt) static int em_jmp_far(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned short sel, old_sel; - struct desc_struct old_desc, new_desc; - const struct x86_emulate_ops *ops = ctxt->ops; + unsigned short sel; + struct desc_struct new_desc; u8 cpl = ctxt->ops->cpl(ctxt); - /* Assignment of RIP may only fail in 64-bit mode */ - if (ctxt->mode == X86EMUL_MODE_PROT64) - ops->get_segment(ctxt, &old_sel, &old_desc, NULL, - VCPU_SREG_CS); - memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, @@ -2124,12 +2145,10 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) return rc; rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); - if (rc != X86EMUL_CONTINUE) { - WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64); - /* assigning eip failed; restore the old cs */ - ops->set_segment(ctxt, old_sel, &old_desc, 0, VCPU_SREG_CS); - return rc; - } + /* Error handling is not implemented. */ + if (rc != X86EMUL_CONTINUE) + return X86EMUL_UNHANDLEABLE; + return rc; } @@ -2189,14 +2208,8 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) { int rc; unsigned long eip, cs; - u16 old_cs; int cpl = ctxt->ops->cpl(ctxt); - struct desc_struct old_desc, new_desc; - const struct x86_emulate_ops *ops = ctxt->ops; - - if (ctxt->mode == X86EMUL_MODE_PROT64) - ops->get_segment(ctxt, &old_cs, &old_desc, NULL, - VCPU_SREG_CS); + struct desc_struct new_desc; rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -2213,10 +2226,10 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; rc = assign_eip_far(ctxt, eip, &new_desc); - if (rc != X86EMUL_CONTINUE) { - WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64); - ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); - } + /* Error handling is not implemented. */ + if (rc != X86EMUL_CONTINUE) + return X86EMUL_UNHANDLEABLE; + return rc; } @@ -3856,6 +3869,131 @@ static int em_movsxd(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int check_fxsr(struct x86_emulate_ctxt *ctxt) +{ + u32 eax = 1, ebx, ecx = 0, edx; + + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + if (!(edx & FFL(FXSR))) + return emulate_ud(ctxt); + + if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) + return emulate_nm(ctxt); + + /* + * Don't emulate a case that should never be hit, instead of working + * around a lack of fxsave64/fxrstor64 on old compilers. + */ + if (ctxt->mode >= X86EMUL_MODE_PROT64) + return X86EMUL_UNHANDLEABLE; + + return X86EMUL_CONTINUE; +} + +/* + * FXSAVE and FXRSTOR have 4 different formats depending on execution mode, + * 1) 16 bit mode + * 2) 32 bit mode + * - like (1), but FIP and FDP (foo) are only 16 bit. At least Intel CPUs + * preserve whole 32 bit values, though, so (1) and (2) are the same wrt. + * save and restore + * 3) 64-bit mode with REX.W prefix + * - like (2), but XMM 8-15 are being saved and restored + * 4) 64-bit mode without REX.W prefix + * - like (3), but FIP and FDP are 64 bit + * + * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the + * desired result. (4) is not emulated. + * + * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS + * and FPU DS) should match. + */ +static int em_fxsave(struct x86_emulate_ctxt *ctxt) +{ + struct fxregs_state fx_state; + size_t size; + int rc; + + rc = check_fxsr(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + + ctxt->ops->get_fpu(ctxt); + + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); + + ctxt->ops->put_fpu(ctxt); + + if (rc != X86EMUL_CONTINUE) + return rc; + + if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR) + size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]); + else + size = offsetof(struct fxregs_state, xmm_space[0]); + + return segmented_write(ctxt, ctxt->memop.addr.mem, &fx_state, size); +} + +static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, + struct fxregs_state *new) +{ + int rc = X86EMUL_CONTINUE; + struct fxregs_state old; + + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old)); + if (rc != X86EMUL_CONTINUE) + return rc; + + /* + * 64 bit host will restore XMM 8-15, which is not correct on non-64 + * bit guests. Load the current values in order to preserve 64 bit + * XMMs after fxrstor. + */ +#ifdef CONFIG_X86_64 + /* XXX: accessing XMM 8-15 very awkwardly */ + memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16); +#endif + + /* + * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but + * does save and restore MXCSR. + */ + if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) + memcpy(new->xmm_space, old.xmm_space, 8 * 16); + + return rc; +} + +static int em_fxrstor(struct x86_emulate_ctxt *ctxt) +{ + struct fxregs_state fx_state; + int rc; + + rc = check_fxsr(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = segmented_read(ctxt, ctxt->memop.addr.mem, &fx_state, 512); + if (rc != X86EMUL_CONTINUE) + return rc; + + if (fx_state.mxcsr >> 16) + return emulate_gp(ctxt, 0); + + ctxt->ops->get_fpu(ctxt); + + if (ctxt->mode < X86EMUL_MODE_PROT64) + rc = fxrstor_fixup(ctxt, &fx_state); + + if (rc == X86EMUL_CONTINUE) + rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); + + ctxt->ops->put_fpu(ctxt); + + return rc; +} + static bool valid_cr(int nr) { switch (nr) { @@ -4208,7 +4346,9 @@ static const struct gprefix pfx_0f_ae_7 = { }; static const struct group_dual group15 = { { - N, N, N, N, N, N, N, GP(0, &pfx_0f_ae_7), + I(ModRM | Aligned16, em_fxsave), + I(ModRM | Aligned16, em_fxrstor), + N, N, N, N, N, GP(0, &pfx_0f_ae_7), }, { N, N, N, N, N, N, N, N, } }; @@ -5045,7 +5185,7 @@ done_prefixes: /* Decode and fetch the destination operand: register or memory. */ rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); - if (ctxt->rip_relative) + if (ctxt->rip_relative && likely(ctxt->memopp)) ctxt->memopp->addr.mem.ea = address_mask(ctxt, ctxt->memopp->addr.mem.ea + ctxt->_eip); @@ -5080,21 +5220,13 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) { - bool fault = false; + int rc; ctxt->ops->get_fpu(ctxt); - asm volatile("1: fwait \n\t" - "2: \n\t" - ".pushsection .fixup,\"ax\" \n\t" - "3: \n\t" - "movb $1, %[fault] \n\t" - "jmp 2b \n\t" - ".popsection \n\t" - _ASM_EXTABLE(1b, 3b) - : [fault]"+qm"(fault)); + rc = asm_safe("fwait"); ctxt->ops->put_fpu(ctxt); - if (unlikely(fault)) + if (unlikely(rc != X86EMUL_CONTINUE)) return emulate_exception(ctxt, MF_VECTOR, 0, false); return X86EMUL_CONTINUE; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 42b1c83741c8..99cde5220e07 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -291,7 +291,7 @@ static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata) return ret; } -int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) +static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) { struct kvm_vcpu *vcpu = synic_to_vcpu(synic); struct kvm_lapic_irq irq; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 16a7134eedac..a78b445ce411 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -212,7 +212,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) */ smp_mb(); if (atomic_dec_if_positive(&ps->pending) > 0) - kthread_queue_work(&pit->worker, &pit->expired); + kthread_queue_work(pit->worker, &pit->expired); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -272,7 +272,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) if (atomic_read(&ps->reinject)) atomic_inc(&ps->pending); - kthread_queue_work(&pt->worker, &pt->expired); + kthread_queue_work(pt->worker, &pt->expired); if (ps->is_periodic) { hrtimer_add_expires_ns(&ps->timer, ps->period); @@ -667,10 +667,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pid_nr = pid_vnr(pid); put_pid(pid); - kthread_init_worker(&pit->worker); - pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker, - "kvm-pit/%d", pid_nr); - if (IS_ERR(pit->worker_task)) + pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr); + if (IS_ERR(pit->worker)) goto fail_kthread; kthread_init_work(&pit->expired, pit_do_work); @@ -713,7 +711,7 @@ fail_register_speaker: fail_register_pit: mutex_unlock(&kvm->slots_lock); kvm_pit_set_reinject(pit, false); - kthread_stop(pit->worker_task); + kthread_destroy_worker(pit->worker); fail_kthread: kvm_free_irq_source_id(kvm, pit->irq_source_id); fail_request: @@ -730,8 +728,7 @@ void kvm_free_pit(struct kvm *kvm) kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); kvm_pit_set_reinject(pit, false); hrtimer_cancel(&pit->pit_state.timer); - kthread_flush_work(&pit->expired); - kthread_stop(pit->worker_task); + kthread_destroy_worker(pit->worker); kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 2f5af0798326..600bee9dcbbd 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -44,8 +44,7 @@ struct kvm_pit { struct kvm_kpit_state pit_state; int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; - struct kthread_worker worker; - struct task_struct *worker_task; + struct kthread_worker *worker; struct kthread_work expired; }; diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index c7220ba94aa7..6e219e5c07d2 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -94,7 +94,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) { ioapic->rtc_status.pending_eoi = 0; - bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPUS); + bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID); } static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); @@ -594,7 +594,7 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) ioapic->irr = 0; ioapic->irr_delivered = 0; ioapic->id = 0; - memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); + memset(ioapic->irq_eoi, 0x00, sizeof(ioapic->irq_eoi)); rtc_irq_eoi_tracking_reset(ioapic); } diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 7d2692a49657..1cc6e54436db 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -42,13 +42,13 @@ struct kvm_vcpu; struct dest_map { /* vcpu bitmap where IRQ has been sent */ - DECLARE_BITMAP(map, KVM_MAX_VCPUS); + DECLARE_BITMAP(map, KVM_MAX_VCPU_ID); /* * Vector sent to a given vcpu, only valid when * the vcpu's bit in map is set */ - u8 vectors[KVM_MAX_VCPUS]; + u8 vectors[KVM_MAX_VCPU_ID]; }; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 25810b144b58..6c0191615f23 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -41,6 +41,15 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, bool line_status) { struct kvm_pic *pic = pic_irqchip(kvm); + + /* + * XXX: rejecting pic routes when pic isn't in use would be better, + * but the default routing table is installed while kvm->arch.vpic is + * NULL and KVM_CREATE_IRQCHIP can race with KVM_IRQ_LINE. + */ + if (!pic) + return -1; + return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); } @@ -49,6 +58,10 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, bool line_status) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; + + if (!ioapic) + return -1; + return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level, line_status); } @@ -156,6 +169,16 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, } +static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + if (!level) + return -1; + + return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint); +} + int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -163,18 +186,26 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq irq; int r; - if (unlikely(e->type != KVM_IRQ_ROUTING_MSI)) - return -EWOULDBLOCK; + switch (e->type) { + case KVM_IRQ_ROUTING_HV_SINT: + return kvm_hv_set_sint(e, kvm, irq_source_id, level, + line_status); - if (kvm_msi_route_invalid(kvm, e)) - return -EINVAL; + case KVM_IRQ_ROUTING_MSI: + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; - kvm_set_msi_irq(kvm, e, &irq); + kvm_set_msi_irq(kvm, e, &irq); - if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) - return r; - else - return -EWOULDBLOCK; + if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) + return r; + break; + + default: + break; + } + + return -EWOULDBLOCK; } int kvm_request_irq_source_id(struct kvm *kvm) @@ -254,16 +285,6 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, srcu_read_unlock(&kvm->irq_srcu, idx); } -static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level, - bool line_status) -{ - if (!level) - return -1; - - return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint); -} - int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) @@ -423,18 +444,6 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, srcu_read_unlock(&kvm->irq_srcu, idx); } -int kvm_arch_set_irq(struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm, - int irq_source_id, int level, bool line_status) -{ - switch (irq->type) { - case KVM_IRQ_ROUTING_HV_SINT: - return kvm_hv_set_sint(irq, kvm, irq_source_id, level, - line_status); - default: - return -EWOULDBLOCK; - } -} - void kvm_arch_irq_routing_update(struct kvm *kvm) { kvm_hv_irq_routing_update(kvm); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 23b99f305382..34a66b2d47e6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -138,7 +138,7 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, *mask = dest_id & 0xff; return true; case KVM_APIC_MODE_XAPIC_CLUSTER: - *cluster = map->xapic_cluster_map[dest_id >> 4]; + *cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf]; *mask = dest_id & 0xf; return true; default: @@ -342,9 +342,11 @@ void __kvm_apic_update_irr(u32 *pir, void *regs) u32 i, pir_val; for (i = 0; i <= 7; i++) { - pir_val = xchg(&pir[i], 0); - if (pir_val) + pir_val = READ_ONCE(pir[i]); + if (pir_val) { + pir_val = xchg(&pir[i], 0); *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val; + } } } EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); @@ -1090,7 +1092,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) static u32 apic_get_tmcct(struct kvm_lapic *apic) { - ktime_t remaining; + ktime_t remaining, now; s64 ns; u32 tmcct; @@ -1101,7 +1103,8 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) apic->lapic_timer.period == 0) return 0; - remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); + now = ktime_get(); + remaining = ktime_sub(apic->lapic_timer.target_expiration, now); if (ktime_to_ns(remaining) < 0) remaining = ktime_set(0, 0); @@ -1332,7 +1335,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) local_irq_save(flags); - now = apic->lapic_timer.timer.base->get_time(); + now = ktime_get(); guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; @@ -1347,6 +1350,79 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) local_irq_restore(flags); } +static void start_sw_period(struct kvm_lapic *apic) +{ + if (!apic->lapic_timer.period) + return; + + if (apic_lvtt_oneshot(apic) && + ktime_after(ktime_get(), + apic->lapic_timer.target_expiration)) { + apic_timer_expired(apic); + return; + } + + hrtimer_start(&apic->lapic_timer.timer, + apic->lapic_timer.target_expiration, + HRTIMER_MODE_ABS_PINNED); +} + +static bool set_target_expiration(struct kvm_lapic *apic) +{ + ktime_t now; + u64 tscl = rdtsc(); + + now = ktime_get(); + apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) + * APIC_BUS_CYCLE_NS * apic->divide_count; + + if (!apic->lapic_timer.period) + return false; + + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (apic_lvtt_period(apic)) { + s64 min_period = min_timer_period_us * 1000LL; + + if (apic->lapic_timer.period < min_period) { + pr_info_ratelimited( + "kvm: vcpu %i: requested %lld ns " + "lapic timer period limited to %lld ns\n", + apic->vcpu->vcpu_id, + apic->lapic_timer.period, min_period); + apic->lapic_timer.period = min_period; + } + } + + apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" + PRIx64 ", " + "timer initial count 0x%x, period %lldns, " + "expire @ 0x%016" PRIx64 ".\n", __func__, + APIC_BUS_CYCLE_NS, ktime_to_ns(now), + kvm_lapic_get_reg(apic, APIC_TMICT), + apic->lapic_timer.period, + ktime_to_ns(ktime_add_ns(now, + apic->lapic_timer.period))); + + apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + + nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); + apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period); + + return true; +} + +static void advance_periodic_target_expiration(struct kvm_lapic *apic) +{ + apic->lapic_timer.tscdeadline += + nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); + apic->lapic_timer.target_expiration = + ktime_add_ns(apic->lapic_timer.target_expiration, + apic->lapic_timer.period); +} + bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) { if (!lapic_in_kernel(vcpu)) @@ -1356,52 +1432,59 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); -static void cancel_hv_tscdeadline(struct kvm_lapic *apic) +static void cancel_hv_timer(struct kvm_lapic *apic) { kvm_x86_ops->cancel_hv_timer(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; } -void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - WARN_ON(!apic->lapic_timer.hv_timer_in_use); - WARN_ON(swait_active(&vcpu->wq)); - cancel_hv_tscdeadline(apic); - apic_timer_expired(apic); -} -EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); - -static bool start_hv_tscdeadline(struct kvm_lapic *apic) +static bool start_hv_timer(struct kvm_lapic *apic) { u64 tscdeadline = apic->lapic_timer.tscdeadline; - if (atomic_read(&apic->lapic_timer.pending) || + if ((atomic_read(&apic->lapic_timer.pending) && + !apic_lvtt_period(apic)) || kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_tscdeadline(apic); + cancel_hv_timer(apic); } else { apic->lapic_timer.hv_timer_in_use = true; hrtimer_cancel(&apic->lapic_timer.timer); /* In case the sw timer triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending)) - cancel_hv_tscdeadline(apic); + if (atomic_read(&apic->lapic_timer.pending) && + !apic_lvtt_period(apic)) + cancel_hv_timer(apic); } trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, apic->lapic_timer.hv_timer_in_use); return apic->lapic_timer.hv_timer_in_use; } +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + WARN_ON(!apic->lapic_timer.hv_timer_in_use); + WARN_ON(swait_active(&vcpu->wq)); + cancel_hv_timer(apic); + apic_timer_expired(apic); + + if (apic_lvtt_period(apic) && apic->lapic_timer.period) { + advance_periodic_target_expiration(apic); + if (!start_hv_timer(apic)) + start_sw_period(apic); + } +} +EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); + void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; WARN_ON(apic->lapic_timer.hv_timer_in_use); - if (apic_lvtt_tscdeadline(apic)) - start_hv_tscdeadline(apic); + start_hv_timer(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); @@ -1413,62 +1496,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) if (!apic->lapic_timer.hv_timer_in_use) return; - cancel_hv_tscdeadline(apic); + cancel_hv_timer(apic); if (atomic_read(&apic->lapic_timer.pending)) return; - start_sw_tscdeadline(apic); + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + start_sw_period(apic); + else if (apic_lvtt_tscdeadline(apic)) + start_sw_tscdeadline(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); static void start_apic_timer(struct kvm_lapic *apic) { - ktime_t now; - atomic_set(&apic->lapic_timer.pending, 0); if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { - /* lapic timer in oneshot or periodic mode */ - now = apic->lapic_timer.timer.base->get_time(); - apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) - * APIC_BUS_CYCLE_NS * apic->divide_count; - - if (!apic->lapic_timer.period) - return; - /* - * Do not allow the guest to program periodic timers with small - * interval, since the hrtimers are not throttled by the host - * scheduler. - */ - if (apic_lvtt_period(apic)) { - s64 min_period = min_timer_period_us * 1000LL; - - if (apic->lapic_timer.period < min_period) { - pr_info_ratelimited( - "kvm: vcpu %i: requested %lld ns " - "lapic timer period limited to %lld ns\n", - apic->vcpu->vcpu_id, - apic->lapic_timer.period, min_period); - apic->lapic_timer.period = min_period; - } - } - - hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, apic->lapic_timer.period), - HRTIMER_MODE_ABS_PINNED); - - apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" - PRIx64 ", " - "timer initial count 0x%x, period %lldns, " - "expire @ 0x%016" PRIx64 ".\n", __func__, - APIC_BUS_CYCLE_NS, ktime_to_ns(now), - kvm_lapic_get_reg(apic, APIC_TMICT), - apic->lapic_timer.period, - ktime_to_ns(ktime_add_ns(now, - apic->lapic_timer.period))); + if (set_target_expiration(apic) && + !(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) + start_sw_period(apic); } else if (apic_lvtt_tscdeadline(apic)) { - if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic))) + if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) start_sw_tscdeadline(apic); } } @@ -1701,13 +1750,22 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) * LAPIC interface *---------------------------------------------------------------------- */ +u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!lapic_in_kernel(vcpu)) + return 0; + + return apic->lapic_timer.tscdeadline; +} u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || - apic_lvtt_period(apic)) + if (!lapic_in_kernel(vcpu) || + !apic_lvtt_tscdeadline(apic)) return 0; return apic->lapic_timer.tscdeadline; @@ -1748,14 +1806,17 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) { + if (!apic) value |= MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; - return; - } vcpu->arch.apic_base = value; + if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) + kvm_update_cpuid(vcpu); + + if (!apic) + return; + /* update jump label if enable bit changes */ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { if (value & MSR_IA32_APICBASE_ENABLE) { @@ -1909,6 +1970,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) apic_timer_expired(apic); if (lapic_is_periodic(apic)) { + advance_periodic_target_expiration(apic); hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); return HRTIMER_RESTART; } else @@ -1993,6 +2055,10 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) kvm_apic_local_deliver(apic, APIC_LVTT); if (apic_lvtt_tscdeadline(apic)) apic->lapic_timer.tscdeadline = 0; + if (apic_lvtt_oneshot(apic)) { + apic->lapic_timer.tscdeadline = 0; + apic->lapic_timer.target_expiration = ktime_set(0, 0); + } atomic_set(&apic->lapic_timer.pending, 0); } } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index f60d01c29d51..e0c80233b3e1 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -15,6 +15,7 @@ struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ + ktime_t target_expiration; u32 timer_mode; u32 timer_mode_mask; u64 tscdeadline; @@ -85,6 +86,7 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); +u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d9c7e986b4e4..7012de4a1fed 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1660,17 +1660,9 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) * This has some overhead, but not as much as the cost of swapping * out actively used pages or breaking up actively used hugepages. */ - if (!shadow_accessed_mask) { - /* - * We are holding the kvm->mmu_lock, and we are blowing up - * shadow PTEs. MMU notifier consumers need to be kept at bay. - * This is correct as long as we don't decouple the mmu_lock - * protected regions (like invalidate_range_start|end does). - */ - kvm->mmu_notifier_seq++; + if (!shadow_accessed_mask) return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); - } return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); } @@ -4405,7 +4397,8 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) } static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) + const u8 *new, int bytes, + struct kvm_page_track_notifier_node *node) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; @@ -4508,7 +4501,7 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, void *insn, int insn_len) { int r, emulation_type = EMULTYPE_RETRY; @@ -4527,12 +4520,28 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, return r; } - r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); + r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code), + false); if (r < 0) return r; if (!r) return 1; + /* + * Before emulating the instruction, check if the error code + * was due to a RO violation while translating the guest page. + * This can occur when using nested virtualization with nested + * paging in both guests. If true, we simply unprotect the page + * and resume the guest. + * + * Note: AMD only (since it supports the PFERR_GUEST_PAGE_MASK used + * in PFERR_NEXT_GUEST_PAGE) + */ + if (error_code == PFERR_NESTED_GUEST_PAGE) { + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2)); + return 1; + } + if (mmio_info_in_cache(vcpu, cr2, direct)) emulation_type = 0; emulate: @@ -4617,11 +4626,19 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) init_kvm_mmu(vcpu); } +static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, + struct kvm_page_track_notifier_node *node) +{ + kvm_mmu_invalidate_zap_all_pages(kvm); +} + void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; node->track_write = kvm_mmu_pte_write; + node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); } @@ -4958,7 +4975,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) * zap all shadow pages. */ if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) { - printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n"); + kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } } diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index b431539c3714..4a1c13eaa518 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -106,6 +106,7 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn)) kvm_flush_remote_tlbs(kvm); } +EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page); /* * remove the guest page from the tracking pool which stops the interception @@ -135,6 +136,7 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, */ kvm_mmu_gfn_allow_lpage(slot, gfn); } +EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page); /* * check if the corresponding access on the specified guest page is tracked. @@ -181,6 +183,7 @@ kvm_page_track_register_notifier(struct kvm *kvm, hlist_add_head_rcu(&n->node, &head->track_notifier_list); spin_unlock(&kvm->mmu_lock); } +EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier); /* * stop receiving the event interception. It is the opposed operation of @@ -199,6 +202,7 @@ kvm_page_track_unregister_notifier(struct kvm *kvm, spin_unlock(&kvm->mmu_lock); synchronize_srcu(&head->track_srcu); } +EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier); /* * Notify the node that write access is intercepted and write emulation is @@ -222,6 +226,31 @@ void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, idx = srcu_read_lock(&head->track_srcu); hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) if (n->track_write) - n->track_write(vcpu, gpa, new, bytes); + n->track_write(vcpu, gpa, new, bytes, n); + srcu_read_unlock(&head->track_srcu, idx); +} + +/* + * Notify the node that memory slot is being removed or moved so that it can + * drop write-protection for the pages in the memory slot. + * + * The node should figure out it has any write-protected pages in this slot + * by itself. + */ +void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + struct kvm_page_track_notifier_head *head; + struct kvm_page_track_notifier_node *n; + int idx; + + head = &kvm->arch.track_notifier_head; + + if (hlist_empty(&head->track_notifier_list)) + return; + + idx = srcu_read_lock(&head->track_srcu); + hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) + if (n->track_flush_slot) + n->track_flush_slot(kvm, slot, n); srcu_read_unlock(&head->track_srcu, idx); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f8157a36ab09..08a4d3ab3455 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1138,21 +1138,6 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.tsc_offset += adjustment; - if (is_guest_mode(vcpu)) - svm->nested.hsave->control.tsc_offset += adjustment; - else - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - svm->vmcb->control.tsc_offset - adjustment, - svm->vmcb->control.tsc_offset); - - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); -} - static void avic_init_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb; @@ -2089,7 +2074,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) static int pf_interception(struct vcpu_svm *svm) { u64 fault_address = svm->vmcb->control.exit_info_2; - u32 error_code; + u64 error_code; int r = 1; switch (svm->apf_reason) { @@ -2285,7 +2270,7 @@ static int io_interception(struct vcpu_svm *svm) ++svm->vcpu.stat.io_exits; string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; - if (string || in) + if (string) return emulate_instruction(vcpu, 0) == EMULATE_DONE; port = io_info >> 16; @@ -2293,7 +2278,8 @@ static int io_interception(struct vcpu_svm *svm) svm->next_rip = svm->vmcb->control.exit_info_2; skip_emulated_instruction(&svm->vcpu); - return kvm_fast_pio_out(vcpu, size, port); + return in ? kvm_fast_pio_in(vcpu, size, port) + : kvm_fast_pio_out(vcpu, size, port); } static int nmi_interception(struct vcpu_svm *svm) @@ -3165,8 +3151,7 @@ static int skinit_interception(struct vcpu_svm *svm) static int wbinvd_interception(struct vcpu_svm *svm) { - kvm_emulate_wbinvd(&svm->vcpu); - return 1; + return kvm_emulate_wbinvd(&svm->vcpu); } static int xsetbv_interception(struct vcpu_svm *svm) @@ -3253,8 +3238,7 @@ static int task_switch_interception(struct vcpu_svm *svm) static int cpuid_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - kvm_emulate_cpuid(&svm->vcpu); - return 1; + return kvm_emulate_cpuid(&svm->vcpu); } static int iret_interception(struct vcpu_svm *svm) @@ -3290,9 +3274,7 @@ static int rdpmc_interception(struct vcpu_svm *svm) return emulate_on_interception(svm); err = kvm_rdpmc(&svm->vcpu); - kvm_complete_insn_gp(&svm->vcpu, err); - - return 1; + return kvm_complete_insn_gp(&svm->vcpu, err); } static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, @@ -3389,9 +3371,7 @@ static int cr_interception(struct vcpu_svm *svm) } kvm_register_write(&svm->vcpu, reg, val); } - kvm_complete_insn_gp(&svm->vcpu, err); - - return 1; + return kvm_complete_insn_gp(&svm->vcpu, err); } static int dr_interception(struct vcpu_svm *svm) @@ -3449,12 +3429,6 @@ static int cr8_write_interception(struct vcpu_svm *svm) return 0; } -static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) -{ - struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); - return vmcb->control.tsc_offset + host_tsc; -} - static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_svm *svm = to_svm(vcpu); @@ -5422,8 +5396,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .has_wbinvd_exit = svm_has_wbinvd_exit, .write_tsc_offset = svm_write_tsc_offset, - .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest, - .read_l1_tsc = svm_read_l1_tsc, .set_tdp_cr3 = set_tdp_cr3, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cf1b16dbc98a..aae43c6f2472 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -132,6 +132,22 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 +#define VMX_VPID_EXTENT_SUPPORTED_MASK \ + (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ + VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) + +/* + * Hyper-V requires all of these, so mark them as supported even though + * they are just treated the same as all-context. + */ +#define VMX_VPID_EXTENT_SUPPORTED_MASK \ + (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ + VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -187,6 +203,7 @@ struct vmcs { */ struct loaded_vmcs { struct vmcs *vmcs; + struct vmcs *shadow_vmcs; int cpu; int launched; struct list_head loaded_vmcss_on_cpu_link; @@ -411,7 +428,6 @@ struct nested_vmx { * memory during VMXOFF, VMCLEAR, VMPTRLD. */ struct vmcs12 *cached_vmcs12; - struct vmcs *current_shadow_vmcs; /* * Indicates if the shadow vmcs must be updated with the * data hold by vmcs12 @@ -421,7 +437,6 @@ struct nested_vmx { /* vmcs02_list cache of VMCSs recently used to run L2 guests */ struct list_head vmcs02_pool; int vmcs02_num; - u64 vmcs01_tsc_offset; bool change_vmcs01_virtual_x2apic_mode; /* L2 must run next, and mustn't decide to exit to L1. */ bool nested_run_pending; @@ -447,23 +462,31 @@ struct nested_vmx { u16 vpid02; u16 last_vpid; + /* + * We only store the "true" versions of the VMX capability MSRs. We + * generate the "non-true" versions by setting the must-be-1 bits + * according to the SDM. + */ u32 nested_vmx_procbased_ctls_low; u32 nested_vmx_procbased_ctls_high; - u32 nested_vmx_true_procbased_ctls_low; u32 nested_vmx_secondary_ctls_low; u32 nested_vmx_secondary_ctls_high; u32 nested_vmx_pinbased_ctls_low; u32 nested_vmx_pinbased_ctls_high; u32 nested_vmx_exit_ctls_low; u32 nested_vmx_exit_ctls_high; - u32 nested_vmx_true_exit_ctls_low; u32 nested_vmx_entry_ctls_low; u32 nested_vmx_entry_ctls_high; - u32 nested_vmx_true_entry_ctls_low; u32 nested_vmx_misc_low; u32 nested_vmx_misc_high; u32 nested_vmx_ept_caps; u32 nested_vmx_vpid_caps; + u64 nested_vmx_basic; + u64 nested_vmx_cr0_fixed0; + u64 nested_vmx_cr0_fixed1; + u64 nested_vmx_cr4_fixed0; + u64 nested_vmx_cr4_fixed1; + u64 nested_vmx_vmcs_enum; }; #define POSTED_INTR_ON 0 @@ -521,6 +544,12 @@ static inline void pi_set_sn(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +static inline void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + static inline int pi_test_on(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_ON, @@ -921,16 +950,32 @@ static DEFINE_PER_CPU(struct desc_ptr, host_gdt); static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); -static unsigned long *vmx_io_bitmap_a; -static unsigned long *vmx_io_bitmap_b; -static unsigned long *vmx_msr_bitmap_legacy; -static unsigned long *vmx_msr_bitmap_longmode; -static unsigned long *vmx_msr_bitmap_legacy_x2apic; -static unsigned long *vmx_msr_bitmap_longmode_x2apic; -static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive; -static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive; -static unsigned long *vmx_vmread_bitmap; -static unsigned long *vmx_vmwrite_bitmap; +enum { + VMX_IO_BITMAP_A, + VMX_IO_BITMAP_B, + VMX_MSR_BITMAP_LEGACY, + VMX_MSR_BITMAP_LONGMODE, + VMX_MSR_BITMAP_LEGACY_X2APIC_APICV, + VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV, + VMX_MSR_BITMAP_LEGACY_X2APIC, + VMX_MSR_BITMAP_LONGMODE_X2APIC, + VMX_VMREAD_BITMAP, + VMX_VMWRITE_BITMAP, + VMX_BITMAP_NR +}; + +static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; + +#define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A]) +#define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B]) +#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY]) +#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE]) +#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV]) +#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV]) +#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC]) +#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC]) +#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) +#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) static bool cpu_has_load_ia32_efer; static bool cpu_has_load_perf_global_ctrl; @@ -1419,6 +1464,8 @@ static void vmcs_clear(struct vmcs *vmcs) static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) { vmcs_clear(loaded_vmcs->vmcs); + if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) + vmcs_clear(loaded_vmcs->shadow_vmcs); loaded_vmcs->cpu = -1; loaded_vmcs->launched = 0; } @@ -2144,12 +2191,6 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #endif if (vmx->host_state.msr_host_bndcfgs) wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); - /* - * If the FPU is not active (through the host task or - * the guest vcpu), then restore the cr0.TS bit. - */ - if (!fpregs_active() && !vmx->vcpu.guest_fpu_loaded) - stts(); load_gdt(this_cpu_ptr(&host_gdt)); } @@ -2528,14 +2569,14 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) { if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode_x2apic; + msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv; else - msr_bitmap = vmx_msr_bitmap_legacy_x2apic; + msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv; } else { if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive; + msr_bitmap = vmx_msr_bitmap_longmode_x2apic; else - msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive; + msr_bitmap = vmx_msr_bitmap_legacy_x2apic; } } else { if (is_long_mode(vcpu)) @@ -2605,20 +2646,6 @@ static u64 guest_read_tsc(struct kvm_vcpu *vcpu) } /* - * Like guest_read_tsc, but always returns L1's notion of the timestamp - * counter, even if a nested guest (L2) is currently running. - */ -static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) -{ - u64 tsc_offset; - - tsc_offset = is_guest_mode(vcpu) ? - to_vmx(vcpu)->nested.vmcs01_tsc_offset : - vmcs_read64(TSC_OFFSET); - return host_tsc + tsc_offset; -} - -/* * writes 'offset' into guest's timestamp counter offset register */ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) @@ -2631,7 +2658,6 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) * to the newly set TSC to get L2's TSC. */ struct vmcs12 *vmcs12; - to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset; /* recalculate vmcs02.TSC_OFFSET: */ vmcs12 = get_vmcs12(vcpu); vmcs_write64(TSC_OFFSET, offset + @@ -2644,19 +2670,6 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) } } -static void vmx_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) -{ - u64 offset = vmcs_read64(TSC_OFFSET); - - vmcs_write64(TSC_OFFSET, offset + adjustment); - if (is_guest_mode(vcpu)) { - /* Even when running L2, the adjustment needs to apply to L1 */ - to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; - } else - trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset, - offset + adjustment); -} - static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); @@ -2739,9 +2752,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* We support free control of debug control saving. */ - vmx->nested.nested_vmx_true_exit_ctls_low = - vmx->nested.nested_vmx_exit_ctls_low & - ~VM_EXIT_SAVE_DEBUG_CONTROLS; + vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2760,9 +2771,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. */ - vmx->nested.nested_vmx_true_entry_ctls_low = - vmx->nested.nested_vmx_entry_ctls_low & - ~VM_ENTRY_LOAD_DEBUG_CONTROLS; + vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, @@ -2795,8 +2804,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) CPU_BASED_USE_MSR_BITMAPS; /* We support free control of CR3 access interception. */ - vmx->nested.nested_vmx_true_procbased_ctls_low = - vmx->nested.nested_vmx_procbased_ctls_low & + vmx->nested.nested_vmx_procbased_ctls_low &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); /* secondary cpu-based controls */ @@ -2807,6 +2815,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_DESC | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_APIC_REGISTER_VIRT | @@ -2838,8 +2847,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) */ if (enable_vpid) vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | - VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | - VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; + VMX_VPID_EXTENT_SUPPORTED_MASK; else vmx->nested.nested_vmx_vpid_caps = 0; @@ -2856,14 +2864,52 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT; vmx->nested.nested_vmx_misc_high = 0; + + /* + * This MSR reports some information about VMX support. We + * should return information about the VMX we emulate for the + * guest, and the VMCS structure we give it - not about the + * VMX support of the underlying hardware. + */ + vmx->nested.nested_vmx_basic = + VMCS12_REVISION | + VMX_BASIC_TRUE_CTLS | + ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | + (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + + if (cpu_has_vmx_basic_inout()) + vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT; + + /* + * These MSRs specify bits which the guest must keep fixed on + * while L1 is in VMXON mode (in L1's root mode, or running an L2). + * We picked the standard core2 setting. + */ +#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) +#define VMXON_CR4_ALWAYSON X86_CR4_VMXE + vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON; + vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON; + + /* These MSRs specify bits which the guest must keep fixed off. */ + rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1); + rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1); + + /* highest index: VMX_PREEMPTION_TIMER_VALUE */ + vmx->nested.nested_vmx_vmcs_enum = 0x2e; +} + +/* + * if fixed0[i] == 1: val[i] must be 1 + * if fixed1[i] == 0: val[i] must be 0 + */ +static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1) +{ + return ((val & fixed1) | fixed0) == val; } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) { - /* - * Bits 0 in high must be 0, and bits 1 in low must be 1. - */ - return ((control & high) | low) == control; + return fixed_bits_valid(control, low, high); } static inline u64 vmx_control_msr(u32 low, u32 high) @@ -2871,87 +2917,285 @@ static inline u64 vmx_control_msr(u32 low, u32 high) return low | ((u64)high << 32); } -/* Returns 0 on success, non-0 otherwise. */ -static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) +{ + superset &= mask; + subset &= mask; + + return (superset | subset) == superset; +} + +static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved = + /* feature (except bit 48; see below) */ + BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | + /* reserved */ + BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); + u64 vmx_basic = vmx->nested.nested_vmx_basic; + + if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) + return -EINVAL; + + /* + * KVM does not emulate a version of VMX that constrains physical + * addresses of VMX structures (e.g. VMCS) to 32-bits. + */ + if (data & BIT_ULL(48)) + return -EINVAL; + + if (vmx_basic_vmcs_revision_id(vmx_basic) != + vmx_basic_vmcs_revision_id(data)) + return -EINVAL; + + if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) + return -EINVAL; + + vmx->nested.nested_vmx_basic = data; + return 0; +} + +static int +vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u64 supported; + u32 *lowp, *highp; + + switch (msr_index) { + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + lowp = &vmx->nested.nested_vmx_pinbased_ctls_low; + highp = &vmx->nested.nested_vmx_pinbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + lowp = &vmx->nested.nested_vmx_procbased_ctls_low; + highp = &vmx->nested.nested_vmx_procbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + lowp = &vmx->nested.nested_vmx_exit_ctls_low; + highp = &vmx->nested.nested_vmx_exit_ctls_high; + break; + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + lowp = &vmx->nested.nested_vmx_entry_ctls_low; + highp = &vmx->nested.nested_vmx_entry_ctls_high; + break; + case MSR_IA32_VMX_PROCBASED_CTLS2: + lowp = &vmx->nested.nested_vmx_secondary_ctls_low; + highp = &vmx->nested.nested_vmx_secondary_ctls_high; + break; + default: + BUG(); + } + + supported = vmx_control_msr(*lowp, *highp); + + /* Check must-be-1 bits are still 1. */ + if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) + return -EINVAL; + + /* Check must-be-0 bits are still 0. */ + if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) + return -EINVAL; + + *lowp = data; + *highp = data >> 32; + return 0; +} + +static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved_bits = + /* feature */ + BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | + BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | + /* reserved */ + GENMASK_ULL(13, 9) | BIT_ULL(31); + u64 vmx_misc; + + vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low, + vmx->nested.nested_vmx_misc_high); + + if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) + return -EINVAL; + + if ((vmx->nested.nested_vmx_pinbased_ctls_high & + PIN_BASED_VMX_PREEMPTION_TIMER) && + vmx_misc_preemption_timer_rate(data) != + vmx_misc_preemption_timer_rate(vmx_misc)) + return -EINVAL; + + if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) + return -EINVAL; + + if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) + return -EINVAL; + + if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) + return -EINVAL; + + vmx->nested.nested_vmx_misc_low = data; + vmx->nested.nested_vmx_misc_high = data >> 32; + return 0; +} + +static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) +{ + u64 vmx_ept_vpid_cap; + + vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps, + vmx->nested.nested_vmx_vpid_caps); + + /* Every bit is either reserved or a feature bit. */ + if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) + return -EINVAL; + + vmx->nested.nested_vmx_ept_caps = data; + vmx->nested.nested_vmx_vpid_caps = data >> 32; + return 0; +} + +static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u64 *msr; + + switch (msr_index) { + case MSR_IA32_VMX_CR0_FIXED0: + msr = &vmx->nested.nested_vmx_cr0_fixed0; + break; + case MSR_IA32_VMX_CR4_FIXED0: + msr = &vmx->nested.nested_vmx_cr4_fixed0; + break; + default: + BUG(); + } + + /* + * 1 bits (which indicates bits which "must-be-1" during VMX operation) + * must be 1 in the restored value. + */ + if (!is_bitwise_subset(data, *msr, -1ULL)) + return -EINVAL; + + *msr = data; + return 0; +} + +/* + * Called when userspace is restoring VMX MSRs. + * + * Returns 0 on success, non-0 otherwise. + */ +static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); switch (msr_index) { case MSR_IA32_VMX_BASIC: + return vmx_restore_vmx_basic(vmx, data); + case MSR_IA32_VMX_PINBASED_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS: + case MSR_IA32_VMX_EXIT_CTLS: + case MSR_IA32_VMX_ENTRY_CTLS: + /* + * The "non-true" VMX capability MSRs are generated from the + * "true" MSRs, so we do not support restoring them directly. + * + * If userspace wants to emulate VMX_BASIC[55]=0, userspace + * should restore the "true" MSRs with the must-be-1 bits + * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND + * DEFAULT SETTINGS". + */ + return -EINVAL; + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS2: + return vmx_restore_control_msr(vmx, msr_index, data); + case MSR_IA32_VMX_MISC: + return vmx_restore_vmx_misc(vmx, data); + case MSR_IA32_VMX_CR0_FIXED0: + case MSR_IA32_VMX_CR4_FIXED0: + return vmx_restore_fixed0_msr(vmx, msr_index, data); + case MSR_IA32_VMX_CR0_FIXED1: + case MSR_IA32_VMX_CR4_FIXED1: + /* + * These MSRs are generated based on the vCPU's CPUID, so we + * do not support restoring them directly. + */ + return -EINVAL; + case MSR_IA32_VMX_EPT_VPID_CAP: + return vmx_restore_vmx_ept_vpid_cap(vmx, data); + case MSR_IA32_VMX_VMCS_ENUM: + vmx->nested.nested_vmx_vmcs_enum = data; + return 0; + default: /* - * This MSR reports some information about VMX support. We - * should return information about the VMX we emulate for the - * guest, and the VMCS structure we give it - not about the - * VMX support of the underlying hardware. + * The rest of the VMX capability MSRs do not support restore. */ - *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | - ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | - (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); - if (cpu_has_vmx_basic_inout()) - *pdata |= VMX_BASIC_INOUT; + return -EINVAL; + } +} + +/* Returns 0 on success, non-0 otherwise. */ +static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + switch (msr_index) { + case MSR_IA32_VMX_BASIC: + *pdata = vmx->nested.nested_vmx_basic; break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_pinbased_ctls_low, vmx->nested.nested_vmx_pinbased_ctls_high); + if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) + *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - *pdata = vmx_control_msr( - vmx->nested.nested_vmx_true_procbased_ctls_low, - vmx->nested.nested_vmx_procbased_ctls_high); - break; case MSR_IA32_VMX_PROCBASED_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_procbased_ctls_low, vmx->nested.nested_vmx_procbased_ctls_high); + if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) + *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: - *pdata = vmx_control_msr( - vmx->nested.nested_vmx_true_exit_ctls_low, - vmx->nested.nested_vmx_exit_ctls_high); - break; case MSR_IA32_VMX_EXIT_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_exit_ctls_low, vmx->nested.nested_vmx_exit_ctls_high); + if (msr_index == MSR_IA32_VMX_EXIT_CTLS) + *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - *pdata = vmx_control_msr( - vmx->nested.nested_vmx_true_entry_ctls_low, - vmx->nested.nested_vmx_entry_ctls_high); - break; case MSR_IA32_VMX_ENTRY_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_entry_ctls_low, vmx->nested.nested_vmx_entry_ctls_high); + if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) + *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_MISC: *pdata = vmx_control_msr( vmx->nested.nested_vmx_misc_low, vmx->nested.nested_vmx_misc_high); break; - /* - * These MSRs specify bits which the guest must keep fixed (on or off) - * while L1 is in VMXON mode (in L1's root mode, or running an L2). - * We picked the standard core2 setting. - */ -#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) -#define VMXON_CR4_ALWAYSON X86_CR4_VMXE case MSR_IA32_VMX_CR0_FIXED0: - *pdata = VMXON_CR0_ALWAYSON; + *pdata = vmx->nested.nested_vmx_cr0_fixed0; break; case MSR_IA32_VMX_CR0_FIXED1: - *pdata = -1ULL; + *pdata = vmx->nested.nested_vmx_cr0_fixed1; break; case MSR_IA32_VMX_CR4_FIXED0: - *pdata = VMXON_CR4_ALWAYSON; + *pdata = vmx->nested.nested_vmx_cr4_fixed0; break; case MSR_IA32_VMX_CR4_FIXED1: - *pdata = -1ULL; + *pdata = vmx->nested.nested_vmx_cr4_fixed1; break; case MSR_IA32_VMX_VMCS_ENUM: - *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ + *pdata = vmx->nested.nested_vmx_vmcs_enum; break; case MSR_IA32_VMX_PROCBASED_CTLS2: *pdata = vmx_control_msr( @@ -3134,7 +3378,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx_leave_nested(vcpu); break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: - return 1; /* they are read-only */ + if (!msr_info->host_initiated) + return 1; /* they are read-only */ + if (!nested_vmx_allowed(vcpu)) + return 1; + return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_XSS: if (!vmx_xsaves_supported()) return 1; @@ -3562,6 +3810,7 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) loaded_vmcs_clear(loaded_vmcs); free_vmcs(loaded_vmcs->vmcs); loaded_vmcs->vmcs = NULL; + WARN_ON(loaded_vmcs->shadow_vmcs != NULL); } static void free_kvm_area(void) @@ -3895,6 +4144,40 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_dirty); } +static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_UNRESTRICTED_GUEST && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + fixed0 &= ~(X86_CR0_PE | X86_CR0_PG); + + return fixed_bits_valid(val, fixed0, fixed1); +} + +static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1; + + return fixed_bits_valid(val, fixed0, fixed1); +} + +static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed1; + + return fixed_bits_valid(val, fixed0, fixed1); +} + +/* No difference in the restrictions on guest and host CR4 in VMX operation. */ +#define nested_guest_cr4_valid nested_cr4_valid +#define nested_host_cr4_valid nested_cr4_valid + static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, @@ -4023,8 +4306,8 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!nested_vmx_allowed(vcpu)) return 1; } - if (to_vmx(vcpu)->nested.vmxon && - ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) + + if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) return 1; vcpu->arch.cr4 = cr4; @@ -4601,41 +4884,6 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, } } -static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type) -{ - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) - return; - - /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R) - /* read-low */ - __set_bit(msr, msr_bitmap + 0x000 / f); - - if (type & MSR_TYPE_W) - /* write-low */ - __set_bit(msr, msr_bitmap + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R) - /* read-high */ - __set_bit(msr, msr_bitmap + 0x400 / f); - - if (type & MSR_TYPE_W) - /* write-high */ - __set_bit(msr, msr_bitmap + 0xc00 / f); - - } -} - /* * If a msr is allowed by L0, we should check whether it is allowed by L1. * The corresponding bit will be cleared unless both of L0 and L1 allow it. @@ -4691,48 +4939,18 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) msr, MSR_TYPE_R | MSR_TYPE_W); } -static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) +static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active) { if (apicv_active) { - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); - } else { - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - } -} - -static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) -{ - if (apicv_active) { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv, + msr, type); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv, + msr, type); } else { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - } -} - -static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active) -{ - if (apicv_active) { __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_W); + msr, type); __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_W); - } else { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_W); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_W); + msr, type); } } @@ -4854,9 +5072,15 @@ static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!pi_test_and_clear_on(&vmx->pi_desc)) + if (!pi_test_on(&vmx->pi_desc)) return; + pi_clear_on(&vmx->pi_desc); + /* + * IOMMU can write to PIR.ON, so the barrier matters even on UP. + * But on x86 this is just a compiler barrier anyway. + */ + smp_mb__after_atomic(); kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); } @@ -4871,9 +5095,11 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) u32 low32, high32; unsigned long tmpl; struct desc_ptr dt; - unsigned long cr4; + unsigned long cr0, cr4; - vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ + cr0 = read_cr0(); + WARN_ON(cr0 & X86_CR0_TS); + vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ /* Save the most likely value for this task's CR4 in the VMCS. */ @@ -5613,7 +5839,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu) static int handle_io(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; - int size, in, string; + int size, in, string, ret; unsigned port; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -5627,9 +5853,14 @@ static int handle_io(struct kvm_vcpu *vcpu) port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; - skip_emulated_instruction(vcpu); - return kvm_fast_pio_out(vcpu, size, port); + ret = kvm_skip_emulated_instruction(vcpu); + + /* + * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + return kvm_fast_pio_out(vcpu, size, port) && ret; } static void @@ -5643,18 +5874,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } -static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - unsigned long always_on = VMXON_CR0_ALWAYSON; - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & - SECONDARY_EXEC_UNRESTRICTED_GUEST && - nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) - always_on &= ~(X86_CR0_PE | X86_CR0_PG); - return (val & always_on) == always_on; -} - /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) { @@ -5673,7 +5892,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - if (!nested_cr0_valid(vcpu, val)) + if (!nested_guest_cr0_valid(vcpu, val)) return 1; if (kvm_set_cr0(vcpu, val)) @@ -5682,8 +5901,9 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) return 0; } else { if (to_vmx(vcpu)->nested.vmxon && - ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) + !nested_host_cr0_valid(vcpu, val)) return 1; + return kvm_set_cr0(vcpu, val); } } @@ -5727,6 +5947,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) int cr; int reg; int err; + int ret; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); cr = exit_qualification & 15; @@ -5738,25 +5959,27 @@ static int handle_cr(struct kvm_vcpu *vcpu) switch (cr) { case 0: err = handle_set_cr0(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; + return kvm_complete_insn_gp(vcpu, err); case 3: err = kvm_set_cr3(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; + return kvm_complete_insn_gp(vcpu, err); case 4: err = handle_set_cr4(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; + return kvm_complete_insn_gp(vcpu, err); case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); - kvm_complete_insn_gp(vcpu, err); + ret = kvm_complete_insn_gp(vcpu, err); if (lapic_in_kernel(vcpu)) - return 1; + return ret; if (cr8_prev <= cr8) - return 1; + return ret; + /* + * TODO: we might be squashing a + * KVM_GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ vcpu->run->exit_reason = KVM_EXIT_SET_TPR; return 0; } @@ -5765,23 +5988,20 @@ static int handle_cr(struct kvm_vcpu *vcpu) case 2: /* clts */ handle_clts(vcpu); trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - skip_emulated_instruction(vcpu); vmx_fpu_activate(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); case 1: /*mov from cr*/ switch (cr) { case 3: val = kvm_read_cr3(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); case 8: val = kvm_get_cr8(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } break; case 3: /* lmsw */ @@ -5789,8 +6009,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); kvm_lmsw(vcpu, val); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); default: break; } @@ -5861,8 +6080,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) return 1; - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) @@ -5894,8 +6112,7 @@ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) static int handle_cpuid(struct kvm_vcpu *vcpu) { - kvm_emulate_cpuid(vcpu); - return 1; + return kvm_emulate_cpuid(vcpu); } static int handle_rdmsr(struct kvm_vcpu *vcpu) @@ -5916,8 +6133,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) /* FIXME: handling of bits 32:63 of rax, rdx */ vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_wrmsr(struct kvm_vcpu *vcpu) @@ -5937,8 +6153,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) } trace_kvm_msr_write(ecx, data); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) @@ -5982,8 +6197,7 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); kvm_mmu_invlpg(vcpu, exit_qualification); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_rdpmc(struct kvm_vcpu *vcpu) @@ -5991,15 +6205,12 @@ static int handle_rdpmc(struct kvm_vcpu *vcpu) int err; err = kvm_rdpmc(vcpu); - kvm_complete_insn_gp(vcpu, err); - - return 1; + return kvm_complete_insn_gp(vcpu, err); } static int handle_wbinvd(struct kvm_vcpu *vcpu) { - kvm_emulate_wbinvd(vcpu); - return 1; + return kvm_emulate_wbinvd(vcpu); } static int handle_xsetbv(struct kvm_vcpu *vcpu) @@ -6008,20 +6219,20 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); if (kvm_set_xcr(vcpu, index, new_bv) == 0) - skip_emulated_instruction(vcpu); + return kvm_skip_emulated_instruction(vcpu); return 1; } static int handle_xsaves(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); + kvm_skip_emulated_instruction(vcpu); WARN(1, "this should never happen\n"); return 1; } static int handle_xrstors(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); + kvm_skip_emulated_instruction(vcpu); WARN(1, "this should never happen\n"); return 1; } @@ -6042,8 +6253,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && (offset == APIC_EOI)) { kvm_lapic_set_eoi(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } } return emulate_instruction(vcpu, 0) == EMULATE_DONE; @@ -6191,9 +6401,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { - skip_emulated_instruction(vcpu); trace_kvm_fast_mmio(gpa); - return 1; + return kvm_skip_emulated_instruction(vcpu); } ret = handle_mmio_page_fault(vcpu, gpa, true); @@ -6378,50 +6587,13 @@ static __init int hardware_setup(void) for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) kvm_define_shared_msr(i, vmx_msr_index[i]); - vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_a) - return r; + for (i = 0; i < VMX_BITMAP_NR; i++) { + vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_bitmap[i]) + goto out; + } vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_b) - goto out; - - vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy) - goto out1; - - vmx_msr_bitmap_legacy_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy_x2apic) - goto out2; - - vmx_msr_bitmap_legacy_x2apic_apicv_inactive = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive) - goto out3; - - vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode) - goto out4; - - vmx_msr_bitmap_longmode_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode_x2apic) - goto out5; - - vmx_msr_bitmap_longmode_x2apic_apicv_inactive = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive) - goto out6; - - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmread_bitmap) - goto out7; - - vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmwrite_bitmap) - goto out8; - memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); @@ -6439,7 +6611,7 @@ static __init int hardware_setup(void) if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; - goto out9; + goto out; } if (boot_cpu_has(X86_FEATURE_NX)) @@ -6502,39 +6674,34 @@ static __init int hardware_setup(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); - memcpy(vmx_msr_bitmap_legacy_x2apic, + memcpy(vmx_msr_bitmap_legacy_x2apic_apicv, vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic, + memcpy(vmx_msr_bitmap_longmode_x2apic_apicv, vmx_msr_bitmap_longmode, PAGE_SIZE); - memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, + memcpy(vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, + memcpy(vmx_msr_bitmap_longmode_x2apic, vmx_msr_bitmap_longmode, PAGE_SIZE); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + for (msr = 0x800; msr <= 0x8ff; msr++) { + if (msr == 0x839 /* TMCCT */) + continue; + vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true); + } + /* - * enable_apicv && kvm_vcpu_apicv_active() + * TPR reads and writes can be virtualized even if virtual interrupt + * delivery is not in use. */ - for (msr = 0x800; msr <= 0x8ff; msr++) - vmx_disable_intercept_msr_read_x2apic(msr, true); + vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true); + vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false); - /* TMCCT */ - vmx_enable_intercept_msr_read_x2apic(0x839, true); - /* TPR */ - vmx_disable_intercept_msr_write_x2apic(0x808, true); /* EOI */ - vmx_disable_intercept_msr_write_x2apic(0x80b, true); + vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true); /* SELF-IPI */ - vmx_disable_intercept_msr_write_x2apic(0x83f, true); - - /* - * (enable_apicv && !kvm_vcpu_apicv_active()) || - * !enable_apicv - */ - /* TPR */ - vmx_disable_intercept_msr_read_x2apic(0x808, false); - vmx_disable_intercept_msr_write_x2apic(0x808, false); + vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true); if (enable_ept) { kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, @@ -6581,42 +6748,19 @@ static __init int hardware_setup(void) return alloc_kvm_area(); -out9: - free_page((unsigned long)vmx_vmwrite_bitmap); -out8: - free_page((unsigned long)vmx_vmread_bitmap); -out7: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); -out6: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); -out5: - free_page((unsigned long)vmx_msr_bitmap_longmode); -out4: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); -out3: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); -out2: - free_page((unsigned long)vmx_msr_bitmap_legacy); -out1: - free_page((unsigned long)vmx_io_bitmap_b); out: - free_page((unsigned long)vmx_io_bitmap_a); + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); return r; } static __exit void hardware_unsetup(void) { - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); - free_page((unsigned long)vmx_msr_bitmap_legacy); - free_page((unsigned long)vmx_msr_bitmap_longmode); - free_page((unsigned long)vmx_io_bitmap_b); - free_page((unsigned long)vmx_io_bitmap_a); - free_page((unsigned long)vmx_vmwrite_bitmap); - free_page((unsigned long)vmx_vmread_bitmap); + int i; + + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); free_kvm_area(); } @@ -6630,16 +6774,13 @@ static int handle_pause(struct kvm_vcpu *vcpu) if (ple_gap) grow_ple_window(vcpu); - skip_emulated_instruction(vcpu); kvm_vcpu_on_spin(vcpu); - - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_nop(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_mwait(struct kvm_vcpu *vcpu) @@ -6696,6 +6837,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) if (!item) return NULL; item->vmcs02.vmcs = alloc_vmcs(); + item->vmcs02.shadow_vmcs = NULL; if (!item->vmcs02.vmcs) { kfree(item); return NULL; @@ -6945,8 +7087,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, */ if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failInvalid(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } page = nested_get_page(vcpu, vmptr); @@ -6954,8 +7095,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, *(u32 *)kmap(page) != VMCS12_REVISION) { nested_vmx_failInvalid(vcpu); kunmap(page); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } kunmap(page); vmx->nested.vmxon_ptr = vmptr; @@ -6964,30 +7104,26 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if (vmptr == vmx->nested.vmxon_ptr) { nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } break; case EXIT_REASON_VMPTRLD: if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if (vmptr == vmx->nested.vmxon_ptr) { nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } break; default: @@ -7043,8 +7179,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) @@ -7072,7 +7207,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) shadow_vmcs->revision_id |= (1u << 31); /* init shadow vmcs */ vmcs_clear(shadow_vmcs); - vmx->nested.current_shadow_vmcs = shadow_vmcs; + vmx->vmcs01.shadow_vmcs = shadow_vmcs; } INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); @@ -7084,9 +7219,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu) vmx->nested.vmxon = true; - skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); out_shadow_vmcs: kfree(vmx->nested.cached_vmcs12); @@ -7174,8 +7308,11 @@ static void free_nested(struct vcpu_vmx *vmx) free_page((unsigned long)vmx->nested.msr_bitmap); vmx->nested.msr_bitmap = NULL; } - if (enable_shadow_vmcs) - free_vmcs(vmx->nested.current_shadow_vmcs); + if (enable_shadow_vmcs) { + vmcs_clear(vmx->vmcs01.shadow_vmcs); + free_vmcs(vmx->vmcs01.shadow_vmcs); + vmx->vmcs01.shadow_vmcs = NULL; + } kfree(vmx->nested.cached_vmcs12); /* Unpin physical memory we referred to in current vmcs02 */ if (vmx->nested.apic_access_page) { @@ -7202,9 +7339,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; free_nested(to_vmx(vcpu)); - skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the VMCLEAR instruction */ @@ -7243,9 +7379,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) nested_free_vmcs02(vmx, vmptr); - skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); @@ -7352,7 +7487,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) int i; unsigned long field; u64 field_value; - struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; + struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; const unsigned long *fields = shadow_read_write_fields; const int num_fields = max_shadow_read_write_fields; @@ -7401,7 +7536,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) int i, q; unsigned long field; u64 field_value = 0; - struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; + struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; vmcs_load(shadow_vmcs); @@ -7443,7 +7578,6 @@ static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (vmx->nested.current_vmptr == -1ull) { nested_vmx_failInvalid(vcpu); - skip_emulated_instruction(vcpu); return 0; } return 1; @@ -7457,17 +7591,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu) u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gva_t gva = 0; - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) + if (!nested_vmx_check_permission(vcpu)) return 1; + if (!nested_vmx_check_vmcs12(vcpu)) + return kvm_skip_emulated_instruction(vcpu); + /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ if (vmcs12_read_any(vcpu, field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* * Now copy part of this value to register or memory, as requested. @@ -7487,8 +7622,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } @@ -7507,10 +7641,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) u64 field_value = 0; struct x86_exception e; - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) + if (!nested_vmx_check_permission(vcpu)) return 1; + if (!nested_vmx_check_vmcs12(vcpu)) + return kvm_skip_emulated_instruction(vcpu); + if (vmx_instruction_info & (1u << 10)) field_value = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 3) & 0xf)); @@ -7530,19 +7666,16 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) if (vmcs_field_readonly(field)) { nested_vmx_failValid(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if (vmcs12_write_any(vcpu, field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the VMPTRLD instruction */ @@ -7563,8 +7696,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) page = nested_get_page(vcpu, vmptr); if (page == NULL) { nested_vmx_failInvalid(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } new_vmcs12 = kmap(page); if (new_vmcs12->revision_id != VMCS12_REVISION) { @@ -7572,8 +7704,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) nested_release_page_clean(page); nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } nested_release_vmcs12(vmx); @@ -7591,14 +7722,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, - __pa(vmx->nested.current_shadow_vmcs)); + __pa(vmx->vmcs01.shadow_vmcs)); vmx->nested.sync_shadow_vmcs = true; } } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the VMPTRST instruction */ @@ -7623,8 +7753,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return 1; } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the INVEPT instruction */ @@ -7659,11 +7788,10 @@ static int handle_invept(struct kvm_vcpu *vcpu) types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; - if (!(types & (1UL << type))) { + if (type >= 32 || !(types & (1 << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* According to the Intel VMX instruction reference, the memory @@ -7694,8 +7822,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) break; } - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_invvpid(struct kvm_vcpu *vcpu) @@ -7720,13 +7847,13 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; + types = (vmx->nested.nested_vmx_vpid_caps & + VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; - if (!(types & (1UL << type))) { + if (type >= 32 || !(types & (1 << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* according to the intel vmx instruction reference, the memory @@ -7742,23 +7869,26 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) } switch (type) { + case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: case VMX_VPID_EXTENT_SINGLE_CONTEXT: - /* - * Old versions of KVM use the single-context version so we - * have to support it; just treat it the same as all-context. - */ + case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: + if (!vpid) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return kvm_skip_emulated_instruction(vcpu); + } + break; case VMX_VPID_EXTENT_ALL_CONTEXT: - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); - nested_vmx_succeed(vcpu); break; default: - /* Trap individual address invalidation invvpid calls */ - BUG_ON(1); - break; + WARN_ON_ONCE(1); + return kvm_skip_emulated_instruction(vcpu); } - skip_emulated_instruction(vcpu); - return 1; + __vmx_flush_tlb(vcpu, vmx->nested.vpid02); + nested_vmx_succeed(vcpu); + + return kvm_skip_emulated_instruction(vcpu); } static int handle_pml_full(struct kvm_vcpu *vcpu) @@ -8097,6 +8227,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); case EXIT_REASON_IO_INSTRUCTION: return nested_vmx_exit_handled_io(vcpu, vmcs12); + case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); case EXIT_REASON_MSR_READ: case EXIT_REASON_MSR_WRITE: return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); @@ -8646,11 +8778,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); register void *__sp asm(_ASM_SP); - /* - * If external interrupt exists, IF bit is set in rflags/eflags on the - * interrupt stack frame, and interrupt will be enabled on a return - * from interrupt handler. - */ if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { unsigned int vector; @@ -8835,7 +8962,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host); } -void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) +static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tscl; @@ -9156,6 +9283,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->loaded_vmcs = &vmx->vmcs01; vmx->loaded_vmcs->vmcs = alloc_vmcs(); + vmx->loaded_vmcs->shadow_vmcs = NULL; if (!vmx->loaded_vmcs->vmcs) goto free_msrs; if (!vmm_exclusive) @@ -9304,6 +9432,50 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl) (new_ctl & ~mask) | (cur_ctl & mask)); } +/* + * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits + * (indicating "allowed-1") if they are supported in the guest's CPUID. + */ +static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_cpuid_entry2 *entry; + + vmx->nested.nested_vmx_cr0_fixed1 = 0xffffffff; + vmx->nested.nested_vmx_cr4_fixed1 = X86_CR4_PCE; + +#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ + if (entry && (entry->_reg & (_cpuid_mask))) \ + vmx->nested.nested_vmx_cr4_fixed1 |= (_cr4_mask); \ +} while (0) + + entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); + cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); + cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); + cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); + cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); + cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); + cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); + cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); + cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); + cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); + cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); + cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); + cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); + cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID)); + cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); + + entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); + cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); + cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); + cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); + cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); + /* TODO: Use X86_CR4_UMIP and X86_FEATURE_UMIP macros */ + cr4_fixed1_update(bit(11), ecx, bit(2)); + +#undef cr4_fixed1_update +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -9345,6 +9517,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) else to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + + if (nested_vmx_allowed(vcpu)) + nested_vmx_cr_fixed1_bits_update(vcpu); } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -9799,6 +9974,49 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) return 0; } +static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + unsigned long invalid_mask; + + invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu); + return (val & invalid_mask) == 0; +} + +/* + * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are + * emulating VM entry into a guest with EPT enabled. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. + */ +static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, + unsigned long *entry_failure_code) +{ + if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { + if (!nested_cr3_valid(vcpu, cr3)) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return 1; + } + + /* + * If PAE paging and EPT are both on, CR3 is not used by the CPU and + * must not be dereferenced. + */ + if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) && + !nested_ept) { + if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { + *entry_failure_code = ENTRY_FAIL_PDPTE; + return 1; + } + } + + vcpu->arch.cr3 = cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + } + + kvm_mmu_reset_context(vcpu); + return 0; +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -9807,11 +10025,15 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) * needs. In addition to modifying the active vmcs (which is vmcs02), this * function also has additional necessary side-effects, like setting various * vcpu->arch fields. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. */ -static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + unsigned long *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; + bool nested_ept_enabled = false; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -9976,6 +10198,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_intr_status); } + nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -9989,6 +10212,15 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx_set_constant_host_state(vmx); /* + * Set the MSR load/store lists to match L0's settings. + */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); + + /* * HOST_RSP is normally set correctly in vmx_vcpu_run() just before * entry, but only if the current (host) sp changed from the value * we wrote last (vmx->host_rsp). This cache is no longer relevant @@ -10061,9 +10293,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vmcs_write64(TSC_OFFSET, - vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); + vcpu->arch.tsc_offset + vmcs12->tsc_offset); else - vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); @@ -10094,15 +10326,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) nested_ept_init_mmu_context(vcpu); } - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) - vcpu->arch.efer = vmcs12->guest_ia32_efer; - else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) - vcpu->arch.efer |= (EFER_LMA | EFER_LME); - else - vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); - /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ - vmx_set_efer(vcpu, vcpu->arch.efer); - /* * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified * TS bit (for lazy fpu) and bits which we consider mandatory enabled. @@ -10117,8 +10340,20 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx_set_cr4(vcpu, vmcs12->guest_cr4); vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); - /* shadow page tables on either EPT or shadow page tables */ - kvm_set_cr3(vcpu, vmcs12->guest_cr3); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) + vcpu->arch.efer = vmcs12->guest_ia32_efer; + else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) + vcpu->arch.efer |= (EFER_LMA | EFER_LME); + else + vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); + /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ + vmx_set_efer(vcpu, vcpu->arch.efer); + + /* Shadow page tables on either EPT or shadow page tables. */ + if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_ept_enabled, + entry_failure_code)) + return 1; + kvm_mmu_reset_context(vcpu); if (!enable_ept) @@ -10136,6 +10371,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); + return 0; } /* @@ -10150,12 +10386,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) struct loaded_vmcs *vmcs02; bool ia32e; u32 msr_entry_idx; + unsigned long exit_qualification; - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) + if (!nested_vmx_check_permission(vcpu)) return 1; - skip_emulated_instruction(vcpu); + if (!nested_vmx_check_vmcs12(vcpu)) + goto out; + vmcs12 = get_vmcs12(vcpu); if (enable_shadow_vmcs) @@ -10175,37 +10413,37 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) nested_vmx_failValid(vcpu, launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); - return 1; + goto out; } if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - vmx->nested.nested_vmx_true_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_low, vmx->nested.nested_vmx_procbased_ctls_high) || !vmx_control_verify(vmcs12->secondary_vm_exec_control, vmx->nested.nested_vmx_secondary_ctls_low, @@ -10214,33 +10452,34 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx->nested.nested_vmx_pinbased_ctls_low, vmx->nested.nested_vmx_pinbased_ctls_high) || !vmx_control_verify(vmcs12->vm_exit_controls, - vmx->nested.nested_vmx_true_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_low, vmx->nested.nested_vmx_exit_ctls_high) || !vmx_control_verify(vmcs12->vm_entry_controls, - vmx->nested.nested_vmx_true_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_low, vmx->nested.nested_vmx_entry_ctls_high)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } - if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || - ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { + if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || + !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || + !nested_cr3_valid(vcpu, vmcs12->host_cr3)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); - return 1; + goto out; } - if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) || - ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { + if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || + !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); - return 1; + goto out; } if (vmcs12->vmcs_link_pointer != -1ull) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR); - return 1; + goto out; } /* @@ -10260,7 +10499,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); - return 1; + goto out; } } @@ -10278,7 +10517,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); - return 1; + goto out; } } @@ -10291,10 +10530,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (!vmcs02) return -ENOMEM; + /* + * After this point, the trap flag no longer triggers a singlestep trap + * on the vm entry instructions. Don't call + * kvm_skip_emulated_instruction. + */ + skip_emulated_instruction(vcpu); enter_guest_mode(vcpu); - vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); - if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); @@ -10307,7 +10550,13 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx_segment_cache_clear(vmx); - prepare_vmcs02(vcpu, vmcs12); + if (prepare_vmcs02(vcpu, vmcs12, &exit_qualification)) { + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_INVALID_STATE, exit_qualification); + return 1; + } msr_entry_idx = nested_vmx_load_msr(vcpu, vmcs12->vm_entry_msr_load_addr, @@ -10334,6 +10583,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * the success flag) when L2 exits (see nested_vmx_vmexit()). */ return 1; + +out: + return kvm_skip_emulated_instruction(vcpu); } /* @@ -10639,6 +10891,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct kvm_segment seg; + unsigned long entry_failure_code; if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; @@ -10676,8 +10929,12 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, nested_ept_uninit_mmu_context(vcpu); - kvm_set_cr3(vcpu, vmcs12->host_cr3); - kvm_mmu_reset_context(vcpu); + /* + * Only PDPTE load can fail as the value of cr3 was checked on entry and + * couldn't have changed. + */ + if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; @@ -10778,6 +11035,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 vm_inst_error = 0; /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); @@ -10790,6 +11048,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs12->vm_exit_msr_store_count)) nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + if (unlikely(vmx->fail)) + vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR); + vmx_load_vmcs01(vcpu); if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) @@ -10818,7 +11079,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, load_vmcs12_host_state(vcpu, vmcs12); /* Update any VMCS fields that might have changed while L2 ran */ - vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (vmx->hv_deadline_tsc == -1) vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_VMX_PREEMPTION_TIMER); @@ -10866,7 +11129,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ if (unlikely(vmx->fail)) { vmx->fail = 0; - nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); + nested_vmx_failValid(vcpu, vm_inst_error); } else nested_vmx_succeed(vcpu); if (enable_shadow_vmcs) @@ -11339,8 +11602,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, .write_tsc_offset = vmx_write_tsc_offset, - .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest, - .read_l1_tsc = vmx_read_l1_tsc, .set_tdp_cr3 = vmx_set_cr3, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6c633de84dd7..1f0d2383f5ee 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -210,7 +210,18 @@ static void kvm_on_user_return(struct user_return_notifier *urn) struct kvm_shared_msrs *locals = container_of(urn, struct kvm_shared_msrs, urn); struct kvm_shared_msr_values *values; + unsigned long flags; + /* + * Disabling irqs at this point since the following code could be + * interrupted and executed through kvm_arch_hardware_disable() + */ + local_irq_save(flags); + if (locals->registered) { + locals->registered = false; + user_return_notifier_unregister(urn); + } + local_irq_restore(flags); for (slot = 0; slot < shared_msrs_global.nr; ++slot) { values = &locals->values[slot]; if (values->host != values->curr) { @@ -218,8 +229,6 @@ static void kvm_on_user_return(struct user_return_notifier *urn) values->curr = values->host; } } - locals->registered = false; - user_return_notifier_unregister(urn); } static void shared_msr_update(unsigned slot, u32 msr) @@ -425,12 +434,14 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) } EXPORT_SYMBOL_GPL(kvm_requeue_exception); -void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) +int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { if (err) kvm_inject_gp(vcpu, 0); else - kvm_x86_ops->skip_emulated_instruction(vcpu); + return kvm_skip_emulated_instruction(vcpu); + + return 1; } EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); @@ -564,7 +575,7 @@ out: } EXPORT_SYMBOL_GPL(load_pdptrs); -static bool pdptrs_changed(struct kvm_vcpu *vcpu) +bool pdptrs_changed(struct kvm_vcpu *vcpu) { u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; bool changed = true; @@ -590,6 +601,7 @@ out: return changed; } +EXPORT_SYMBOL_GPL(pdptrs_changed); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { @@ -1409,7 +1421,7 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - return kvm_x86_ops->read_l1_tsc(vcpu, kvm_scale_tsc(vcpu, host_tsc)); + return vcpu->arch.tsc_offset + kvm_scale_tsc(vcpu, host_tsc); } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); @@ -1547,7 +1559,7 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc); static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { - kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment); + kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment); } static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) @@ -1555,7 +1567,7 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) WARN_ON(adjustment < 0); adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); - kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment); + adjust_tsc_offset_guest(vcpu, adjustment); } #ifdef CONFIG_X86_64 @@ -1724,18 +1736,23 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) static u64 __get_kvmclock_ns(struct kvm *kvm) { - struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0); struct kvm_arch *ka = &kvm->arch; - s64 ns; + struct pvclock_vcpu_time_info hv_clock; - if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) { - u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc); - } else { - ns = ktime_get_boot_ns() + ka->kvmclock_offset; + spin_lock(&ka->pvclock_gtod_sync_lock); + if (!ka->use_master_clock) { + spin_unlock(&ka->pvclock_gtod_sync_lock); + return ktime_get_boot_ns() + ka->kvmclock_offset; } - return ns; + hv_clock.tsc_timestamp = ka->master_cycle_now; + hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; + spin_unlock(&ka->pvclock_gtod_sync_lock); + + kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, + &hv_clock.tsc_shift, + &hv_clock.tsc_to_system_mul); + return __pvclock_read_cycles(&hv_clock, rdtsc()); } u64 get_kvmclock_ns(struct kvm *kvm) @@ -2057,6 +2074,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu) &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; + vcpu->arch.st.steal.preempted = 0; + if (vcpu->arch.st.steal.version & 1) vcpu->arch.st.steal.version += 1; /* first time write, random junk */ @@ -2162,7 +2181,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { - u64 gpa_offset; struct kvm_arch *ka = &vcpu->kvm->arch; kvmclock_reset(vcpu); @@ -2184,8 +2202,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - gpa_offset = data & ~(PAGE_MASK | 1); - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) @@ -2262,7 +2278,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. */ - vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); + vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has_osvw(vcpu)) @@ -2280,11 +2296,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (!ignore_msrs) { - vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", + vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n", msr, data); return 1; } else { - vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", + vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data); break; } @@ -2492,7 +2508,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); if (!ignore_msrs) { - vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index); + vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n", + msr_info->index); return 1; } else { vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index); @@ -2596,7 +2613,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PIT_STATE2: case KVM_CAP_SET_IDENTITY_MAP_ADDR: case KVM_CAP_XEN_HVM: - case KVM_CAP_ADJUST_CLOCK: case KVM_CAP_VCPU_EVENTS: case KVM_CAP_HYPERV: case KVM_CAP_HYPERV_VAPIC: @@ -2623,6 +2639,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #endif r = 1; break; + case KVM_CAP_ADJUST_CLOCK: + r = KVM_CLOCK_TSC_STABLE; + break; case KVM_CAP_X86_SMM: /* SMBASE is usually relocated above 1M on modern chipsets, * and SMM handlers might indeed rely on 4G segment limits, @@ -2794,7 +2813,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (kvm_lapic_hv_timer_in_use(vcpu) && kvm_x86_ops->set_hv_timer(vcpu, - kvm_get_lapic_tscdeadline_msr(vcpu))) + kvm_get_lapic_target_expiration_tsc(vcpu))) kvm_lapic_switch_to_sw_timer(vcpu); /* * On a host with synchronized TSC, there is no need to update @@ -2810,8 +2829,22 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); } +static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) + return; + + vcpu->arch.st.steal.preempted = 1; + + kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, + &vcpu->arch.st.steal.preempted, + offsetof(struct kvm_steal_time, preempted), + sizeof(vcpu->arch.st.steal.preempted)); +} + void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + kvm_steal_time_set_preempted(vcpu); kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); vcpu->arch.last_host_tsc = rdtsc(); @@ -3415,6 +3448,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, }; case KVM_SET_VAPIC_ADDR: { struct kvm_vapic_addr va; + int idx; r = -EINVAL; if (!lapic_in_kernel(vcpu)) @@ -3422,7 +3456,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&va, argp, sizeof va)) goto out; + idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } case KVM_X86_SETUP_MCE: { @@ -4103,9 +4139,11 @@ long kvm_arch_vm_ioctl(struct file *filp, struct kvm_clock_data user_ns; u64 now_ns; - now_ns = get_kvmclock_ns(kvm); + local_irq_disable(); + now_ns = __get_kvmclock_ns(kvm); user_ns.clock = now_ns; - user_ns.flags = 0; + user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0; + local_irq_enable(); memset(&user_ns.pad, 0, sizeof(user_ns.pad)); r = -EFAULT; @@ -4795,7 +4833,7 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); } -int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) +static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) { if (!need_emulate_wbinvd(vcpu)) return X86EMUL_CONTINUE; @@ -4815,8 +4853,8 @@ int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) { - kvm_x86_ops->skip_emulated_instruction(vcpu); - return kvm_emulate_wbinvd_noskip(vcpu); + kvm_emulate_wbinvd_noskip(vcpu); + return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); @@ -5060,11 +5098,6 @@ static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) { preempt_disable(); kvm_load_guest_fpu(emul_to_vcpu(ctxt)); - /* - * CR0.TS may reference the host fpu state, not the guest fpu state, - * so it may be clear at this point. - */ - clts(); } static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) @@ -5419,7 +5452,6 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag kvm_run->exit_reason = KVM_EXIT_DEBUG; *r = EMULATE_USER_EXIT; } else { - vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; /* * "Certain debug exceptions may clear bit 0-3. The * remaining contents of the DR6 register are never @@ -5432,6 +5464,17 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag } } +int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + int r = EMULATE_DONE; + + kvm_x86_ops->skip_emulated_instruction(vcpu); + kvm_vcpu_check_singlestep(vcpu, rflags, &r); + return r == EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); + static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) { if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && @@ -5617,6 +5660,49 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) } EXPORT_SYMBOL_GPL(kvm_fast_pio_out); +static int complete_fast_pio_in(struct kvm_vcpu *vcpu) +{ + unsigned long val; + + /* We should only ever be called with arch.pio.count equal to 1 */ + BUG_ON(vcpu->arch.pio.count != 1); + + /* For size less than 4 we merge, else we zero extend */ + val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) + : 0; + + /* + * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform + * the copy and tracing + */ + emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size, + vcpu->arch.pio.port, &val, 1); + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + + return 1; +} + +int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port) +{ + unsigned long val; + int ret; + + /* For size less than 4 we merge, else we zero extend */ + val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0; + + ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port, + &val, 1); + if (ret) { + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + return ret; + } + + vcpu->arch.complete_userspace_io = complete_fast_pio_in; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_fast_pio_in); + static int kvmclock_cpu_down_prep(unsigned int cpu) { __this_cpu_write(cpu_tsc_khz, 0); @@ -5733,13 +5819,13 @@ static int kvmclock_cpu_online(unsigned int cpu) static void kvm_timer_init(void) { - int cpu; - max_tsc_khz = tsc_khz; if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { #ifdef CONFIG_CPU_FREQ struct cpufreq_policy policy; + int cpu; + memset(&policy, 0, sizeof(policy)); cpu = get_cpu(); cpufreq_get_policy(&policy, cpu); @@ -5966,8 +6052,12 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_halt); int kvm_emulate_halt(struct kvm_vcpu *vcpu) { - kvm_x86_ops->skip_emulated_instruction(vcpu); - return kvm_vcpu_halt(vcpu); + int ret = kvm_skip_emulated_instruction(vcpu); + /* + * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + return kvm_vcpu_halt(vcpu) && ret; } EXPORT_SYMBOL_GPL(kvm_emulate_halt); @@ -5998,9 +6088,9 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; - int op_64_bit, r = 1; + int op_64_bit, r; - kvm_x86_ops->skip_emulated_instruction(vcpu); + r = kvm_skip_emulated_instruction(vcpu); if (kvm_hv_hypercall_enabled(vcpu->kvm)) return kvm_hv_hypercall(vcpu); @@ -7386,34 +7476,24 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - if (!vcpu->guest_fpu_loaded) { - vcpu->fpu_counter = 0; + if (!vcpu->guest_fpu_loaded) return; - } vcpu->guest_fpu_loaded = 0; copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); __kernel_fpu_end(); ++vcpu->stat.fpu_reload; - /* - * If using eager FPU mode, or if the guest is a frequent user - * of the FPU, just leave the FPU active for next time. - * Every 255 times fpu_counter rolls over to 0; a guest that uses - * the FPU in bursts will revert to loading it on demand. - */ - if (!use_eager_fpu()) { - if (++vcpu->fpu_counter < 5) - kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); - } trace_kvm_fpu(0); } void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { + void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; + kvmclock_reset(vcpu); - free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); kvm_x86_ops->vcpu_free(vcpu); + free_cpumask_var(wbinvd_dirty_mask); } struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, @@ -8153,7 +8233,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { - kvm_mmu_invalidate_zap_all_pages(kvm); + kvm_page_track_flush_slot(kvm, slot); } static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 25da5bc8d83d..4ca0d78adcf0 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -497,38 +497,24 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, * a whole series of functions like read_cr0() and write_cr0(). * * We start with cr0. cr0 allows you to turn on and off all kinds of basic - * features, but Linux only really cares about one: the horrifically-named Task - * Switched (TS) bit at bit 3 (ie. 8) + * features, but the only cr0 bit that Linux ever used at runtime was the + * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8) * * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if * the floating point unit is used. Which allows us to restore FPU state - * lazily after a task switch, and Linux uses that gratefully, but wouldn't a - * name like "FPUTRAP bit" be a little less cryptic? + * lazily after a task switch if we wanted to, but wouldn't a name like + * "FPUTRAP bit" be a little less cryptic? * - * We store cr0 locally because the Host never changes it. The Guest sometimes - * wants to read it and we'd prefer not to bother the Host unnecessarily. + * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore + * cr0. */ -static unsigned long current_cr0; static void lguest_write_cr0(unsigned long val) { - lazy_hcall1(LHCALL_TS, val & X86_CR0_TS); - current_cr0 = val; } static unsigned long lguest_read_cr0(void) { - return current_cr0; -} - -/* - * Intel provided a special instruction to clear the TS bit for people too cool - * to use write_cr0() to do it. This "clts" instruction is faster, because all - * the vowels have been optimized out. - */ -static void lguest_clts(void) -{ - lazy_hcall1(LHCALL_TS, 0); - current_cr0 &= ~X86_CR0_TS; + return 0; } /* @@ -1432,7 +1418,6 @@ __init void lguest_init(void) pv_cpu_ops.load_tls = lguest_load_tls; pv_cpu_ops.get_debugreg = lguest_get_debugreg; pv_cpu_ops.set_debugreg = lguest_set_debugreg; - pv_cpu_ops.clts = lguest_clts; pv_cpu_ops.read_cr0 = lguest_read_cr0; pv_cpu_ops.write_cr0 = lguest_write_cr0; pv_cpu_ops.read_cr4 = lguest_read_cr4; diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index d376e4b48f88..c5959576c315 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -16,53 +16,6 @@ #include <asm/smap.h> #include <asm/export.h> -/* Standard copy_to_user with segment limit checking */ -ENTRY(_copy_to_user) - mov PER_CPU_VAR(current_task), %rax - movq %rdi,%rcx - addq %rdx,%rcx - jc bad_to_user - cmpq TASK_addr_limit(%rax),%rcx - ja bad_to_user - ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ - "jmp copy_user_generic_string", \ - X86_FEATURE_REP_GOOD, \ - "jmp copy_user_enhanced_fast_string", \ - X86_FEATURE_ERMS -ENDPROC(_copy_to_user) -EXPORT_SYMBOL(_copy_to_user) - -/* Standard copy_from_user with segment limit checking */ -ENTRY(_copy_from_user) - mov PER_CPU_VAR(current_task), %rax - movq %rsi,%rcx - addq %rdx,%rcx - jc bad_from_user - cmpq TASK_addr_limit(%rax),%rcx - ja bad_from_user - ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ - "jmp copy_user_generic_string", \ - X86_FEATURE_REP_GOOD, \ - "jmp copy_user_enhanced_fast_string", \ - X86_FEATURE_ERMS -ENDPROC(_copy_from_user) -EXPORT_SYMBOL(_copy_from_user) - - - .section .fixup,"ax" - /* must zero dest */ -ENTRY(bad_from_user) -bad_from_user: - movl %edx,%ecx - xorl %eax,%eax - rep - stosb -bad_to_user: - movl %edx,%eax - ret -ENDPROC(bad_from_user) - .previous - /* * copy_user_generic_unrolled - memory copy with exception handling. * This version is for CPUs like P4 that don't have efficient micro diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index d1dee753b949..07764255b611 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -113,14 +113,14 @@ int msr_clear_bit(u32 msr, u8 bit) } #ifdef CONFIG_TRACEPOINTS -void do_trace_write_msr(unsigned msr, u64 val, int failed) +void do_trace_write_msr(unsigned int msr, u64 val, int failed) { trace_write_msr(msr, val, failed); } EXPORT_SYMBOL(do_trace_write_msr); EXPORT_TRACEPOINT_SYMBOL(write_msr); -void do_trace_read_msr(unsigned msr, u64 val, int failed) +void do_trace_read_msr(unsigned int msr, u64 val, int failed) { trace_read_msr(msr, val, failed); } diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index b4908789484e..c074799bddae 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -34,3 +34,52 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) return ret; } EXPORT_SYMBOL_GPL(copy_from_user_nmi); + +/** + * copy_to_user: - Copy a block of data into user space. + * @to: Destination address, in user space. + * @from: Source address, in kernel space. + * @n: Number of bytes to copy. + * + * Context: User context only. This function may sleep if pagefaults are + * enabled. + * + * Copy data from kernel space to user space. + * + * Returns number of bytes that could not be copied. + * On success, this will be zero. + */ +unsigned long _copy_to_user(void __user *to, const void *from, unsigned n) +{ + if (access_ok(VERIFY_WRITE, to, n)) + n = __copy_to_user(to, from, n); + return n; +} +EXPORT_SYMBOL(_copy_to_user); + +/** + * copy_from_user: - Copy a block of data from user space. + * @to: Destination address, in kernel space. + * @from: Source address, in user space. + * @n: Number of bytes to copy. + * + * Context: User context only. This function may sleep if pagefaults are + * enabled. + * + * Copy data from user space to kernel space. + * + * Returns number of bytes that could not be copied. + * On success, this will be zero. + * + * If some data could not be copied, this function will pad the copied + * data to the requested size using zero bytes. + */ +unsigned long _copy_from_user(void *to, const void __user *from, unsigned n) +{ + if (access_ok(VERIFY_READ, from, n)) + n = __copy_from_user(to, from, n); + else + memset(to, 0, n); + return n; +} +EXPORT_SYMBOL(_copy_from_user); diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 3bc7baf2a711..0b281217c890 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -640,52 +640,3 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr return n; } EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); - -/** - * copy_to_user: - Copy a block of data into user space. - * @to: Destination address, in user space. - * @from: Source address, in kernel space. - * @n: Number of bytes to copy. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Copy data from kernel space to user space. - * - * Returns number of bytes that could not be copied. - * On success, this will be zero. - */ -unsigned long _copy_to_user(void __user *to, const void *from, unsigned n) -{ - if (access_ok(VERIFY_WRITE, to, n)) - n = __copy_to_user(to, from, n); - return n; -} -EXPORT_SYMBOL(_copy_to_user); - -/** - * copy_from_user: - Copy a block of data from user space. - * @to: Destination address, in kernel space. - * @from: Source address, in user space. - * @n: Number of bytes to copy. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Copy data from user space to kernel space. - * - * Returns number of bytes that could not be copied. - * On success, this will be zero. - * - * If some data could not be copied, this function will pad the copied - * data to the requested size using zero bytes. - */ -unsigned long _copy_from_user(void *to, const void __user *from, unsigned n) -{ - if (access_ok(VERIFY_READ, from, n)) - n = __copy_from_user(to, from, n); - else - memset(to, 0, n); - return n; -} -EXPORT_SYMBOL(_copy_from_user); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 79ae939970d3..fcd06f7526de 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -135,7 +135,12 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) if (early_recursion_flag > 2) goto halt_loop; - if (regs->cs != __KERNEL_CS) + /* + * Old CPUs leave the high bits of CS on the stack + * undefined. I'm not sure which CPUs do this, but at least + * the 486 DX works this way. + */ + if ((regs->cs & 0xFFFF) != __KERNEL_CS) goto fail; /* diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9f72ca3b2669..e3254ca0eec4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -413,7 +413,7 @@ out: void vmalloc_sync_all(void) { - sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0); + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); } /* @@ -679,8 +679,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(KERN_CONT "paging request"); printk(KERN_CONT " at %p\n", (void *) address); - printk(KERN_ALERT "IP:"); - printk_address(regs->ip); + printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip); dump_pagetable(address); } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 14b9dd71d9e8..963895f9af7f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -89,10 +89,10 @@ static int __init nonx32_setup(char *str) __setup("noexec32=", nonx32_setup); /* - * When memory was added/removed make sure all the processes MM have + * When memory was added make sure all the processes MM have * suitable PGD entries in the local PGD level page. */ -void sync_global_pgds(unsigned long start, unsigned long end, int removed) +void sync_global_pgds(unsigned long start, unsigned long end) { unsigned long address; @@ -100,12 +100,7 @@ void sync_global_pgds(unsigned long start, unsigned long end, int removed) const pgd_t *pgd_ref = pgd_offset_k(address); struct page *page; - /* - * When it is called after memory hot remove, pgd_none() - * returns true. In this case (removed == 1), we must clear - * the PGD entries in the local PGD level page. - */ - if (pgd_none(*pgd_ref) && !removed) + if (pgd_none(*pgd_ref)) continue; spin_lock(&pgd_lock); @@ -122,13 +117,8 @@ void sync_global_pgds(unsigned long start, unsigned long end, int removed) BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - if (removed) { - if (pgd_none(*pgd_ref) && !pgd_none(*pgd)) - pgd_clear(pgd); - } else { - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - } + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); spin_unlock(pgt_lock); } @@ -596,7 +586,7 @@ kernel_physical_mapping_init(unsigned long paddr_start, } if (pgd_changed) - sync_global_pgds(vaddr_start, vaddr_end - 1, 0); + sync_global_pgds(vaddr_start, vaddr_end - 1); __flush_tlb_all(); @@ -1239,7 +1229,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) } else err = vmemmap_populate_basepages(start, end, node); if (!err) - sync_global_pgds(start, end - 1, 0); + sync_global_pgds(start, end - 1); return err; } diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index ddd2661c4502..887e57182716 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -104,10 +104,10 @@ void __init kernel_randomize_memory(void) * consistent with the vaddr_start/vaddr_end variables. */ BUILD_BUG_ON(vaddr_start >= vaddr_end); - BUILD_BUG_ON(config_enabled(CONFIG_X86_ESPFIX64) && + BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) && vaddr_end >= EFI_VA_START); - BUILD_BUG_ON((config_enabled(CONFIG_X86_ESPFIX64) || - config_enabled(CONFIG_EFI)) && + BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) || + IS_ENABLED(CONFIG_EFI)) && vaddr_end >= __START_KERNEL_map); BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index e4f800999b32..324e5713d386 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -350,12 +350,12 @@ int mpx_enable_management(void) * The copy_xregs_to_kernel() beneath get_xsave_field_ptr() is * expected to be relatively expensive. Storing the bounds * directory here means that we do not have to do xsave in the - * unmap path; we can just use mm->bd_addr instead. + * unmap path; we can just use mm->context.bd_addr instead. */ bd_base = mpx_get_bounds_dir(); down_write(&mm->mmap_sem); - mm->bd_addr = bd_base; - if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR) + mm->context.bd_addr = bd_base; + if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR) ret = -ENXIO; up_write(&mm->mmap_sem); @@ -370,7 +370,7 @@ int mpx_disable_management(void) return -ENXIO; down_write(&mm->mmap_sem); - mm->bd_addr = MPX_INVALID_BOUNDS_DIR; + mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR; up_write(&mm->mmap_sem); return 0; } @@ -947,7 +947,7 @@ static int try_unmap_single_bt(struct mm_struct *mm, end = bta_end_vaddr; } - bde_vaddr = mm->bd_addr + mpx_get_bd_entry_offset(mm, start); + bde_vaddr = mm->context.bd_addr + mpx_get_bd_entry_offset(mm, start); ret = get_bt_addr(mm, bde_vaddr, &bt_addr); /* * No bounds table there, so nothing to unmap. diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 3f35b48d1d9d..12dcad7297a5 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -19,7 +19,7 @@ #include "numa_internal.h" -int __initdata numa_off; +int numa_off; nodemask_t numa_nodes_parsed __initdata; struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 170cc4ff057b..efc32bc6862b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -730,6 +730,20 @@ void io_free_memtype(resource_size_t start, resource_size_t end) free_memtype(start, end); } +int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size) +{ + enum page_cache_mode type = _PAGE_CACHE_MODE_WC; + + return io_reserve_memtype(start, start + size, &type); +} +EXPORT_SYMBOL(arch_io_reserve_memtype_wc); + +void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) +{ + io_free_memtype(start, start + size); +} +EXPORT_SYMBOL(arch_io_free_memtype_wc); + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { @@ -972,20 +986,17 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, return 0; } -int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, - pfn_t pfn) +void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) { enum page_cache_mode pcm; if (!pat_enabled()) - return 0; + return; /* Set prot based on lookup */ pcm = lookup_memtype(pfn_t_to_phys(pfn)); *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); - - return 0; } /* diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index f88ce0e5efd9..2dab69a706ec 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -141,8 +141,7 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | * Called from the FPU code when creating a fresh set of FPU * registers. This is called from a very specific context where * we know the FPU regstiers are safe for use and we can use PKRU - * directly. The fact that PKRU is only available when we are - * using eagerfpu mode makes this possible. + * directly. */ void copy_init_pkru_to_fpregs(void) { diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index fe04a04dab8e..e76d1af60f7a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -853,7 +853,7 @@ xadd: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); if (seen_ld_abs) { - reload_skb_data = bpf_helper_changes_skb_data(func); + reload_skb_data = bpf_helper_changes_pkt_data(func); if (reload_skb_data) { EMIT1(0x57); /* push %rdi */ jmp_offset += 22; /* pop, mov, sub, mov */ diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 28c04123b6dd..ffdbc4836b4f 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -339,10 +339,11 @@ fail: return 0; } -static void nmi_cpu_setup(void *dummy) +static void nmi_cpu_setup(void) { int cpu = smp_processor_id(); struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + nmi_cpu_save_registers(msrs); raw_spin_lock(&oprofilefs_lock); model->setup_ctrs(model, msrs); @@ -369,7 +370,7 @@ static void nmi_cpu_restore_registers(struct op_msrs *msrs) } } -static void nmi_cpu_shutdown(void *dummy) +static void nmi_cpu_shutdown(void) { unsigned int v; int cpu = smp_processor_id(); @@ -387,20 +388,26 @@ static void nmi_cpu_shutdown(void *dummy) nmi_cpu_restore_registers(msrs); } -static void nmi_cpu_up(void *dummy) +static int nmi_cpu_online(unsigned int cpu) { + local_irq_disable(); if (nmi_enabled) - nmi_cpu_setup(dummy); + nmi_cpu_setup(); if (ctr_running) - nmi_cpu_start(dummy); + nmi_cpu_start(NULL); + local_irq_enable(); + return 0; } -static void nmi_cpu_down(void *dummy) +static int nmi_cpu_down_prep(unsigned int cpu) { + local_irq_disable(); if (ctr_running) - nmi_cpu_stop(dummy); + nmi_cpu_stop(NULL); if (nmi_enabled) - nmi_cpu_shutdown(dummy); + nmi_cpu_shutdown(); + local_irq_enable(); + return 0; } static int nmi_create_files(struct dentry *root) @@ -433,26 +440,7 @@ static int nmi_create_files(struct dentry *root) return 0; } -static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, - void *data) -{ - int cpu = (unsigned long)data; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_FAILED: - case CPU_ONLINE: - smp_call_function_single(cpu, nmi_cpu_up, NULL, 0); - break; - case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, nmi_cpu_down, NULL, 1); - break; - } - return NOTIFY_DONE; -} - -static struct notifier_block oprofile_cpu_nb = { - .notifier_call = oprofile_cpu_notifier -}; +static enum cpuhp_state cpuhp_nmi_online; static int nmi_setup(void) { @@ -495,20 +483,17 @@ static int nmi_setup(void) if (err) goto fail; - cpu_notifier_register_begin(); - - /* Use get/put_online_cpus() to protect 'nmi_enabled' */ - get_online_cpus(); nmi_enabled = 1; /* make nmi_enabled visible to the nmi handler: */ smp_mb(); - on_each_cpu(nmi_cpu_setup, NULL, 1); - __register_cpu_notifier(&oprofile_cpu_nb); - put_online_cpus(); - - cpu_notifier_register_done(); - + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/oprofile:online", + nmi_cpu_online, nmi_cpu_down_prep); + if (err < 0) + goto fail_nmi; + cpuhp_nmi_online = err; return 0; +fail_nmi: + unregister_nmi_handler(NMI_LOCAL, "oprofile"); fail: free_msrs(); return err; @@ -518,17 +503,9 @@ static void nmi_shutdown(void) { struct op_msrs *msrs; - cpu_notifier_register_begin(); - - /* Use get/put_online_cpus() to protect 'nmi_enabled' & 'ctr_running' */ - get_online_cpus(); - on_each_cpu(nmi_cpu_shutdown, NULL, 1); + cpuhp_remove_state(cpuhp_nmi_online); nmi_enabled = 0; ctr_running = 0; - __unregister_cpu_notifier(&oprofile_cpu_nb); - put_online_cpus(); - - cpu_notifier_register_done(); /* make variables visible to the nmi handler: */ smp_mb(); diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index c20d2cc7ef64..ae387e5ee6f7 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -327,35 +327,18 @@ static int __init early_root_info_init(void) #define ENABLE_CF8_EXT_CFG (1ULL << 46) -static void enable_pci_io_ecs(void *unused) +static int amd_bus_cpu_online(unsigned int cpu) { u64 reg; + rdmsrl(MSR_AMD64_NB_CFG, reg); if (!(reg & ENABLE_CF8_EXT_CFG)) { reg |= ENABLE_CF8_EXT_CFG; wrmsrl(MSR_AMD64_NB_CFG, reg); } + return 0; } -static int amd_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) -{ - int cpu = (long)hcpu; - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block amd_cpu_notifier = { - .notifier_call = amd_cpu_notify, -}; - static void __init pci_enable_pci_io_ecs(void) { #ifdef CONFIG_AMD_NB @@ -385,7 +368,7 @@ static void __init pci_enable_pci_io_ecs(void) static int __init pci_io_ecs_init(void) { - int cpu; + int ret; /* assume all cpus from fam10h have IO ECS */ if (boot_cpu_data.x86 < 0x10) @@ -395,12 +378,9 @@ static int __init pci_io_ecs_init(void) if (early_pci_allowed()) pci_enable_pci_io_ecs(); - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE, - (void *)(long)cpu); - __register_cpu_notifier(&amd_cpu_notifier); - cpu_notifier_register_done(); + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "pci/amd_bus:online", + amd_bus_cpu_online, NULL); + WARN_ON(ret < 0); pci_probe |= PCI_HAS_IO_ECS; diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index bedfab98077a..e1fb269c87af 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -264,8 +264,8 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 0; error: - dev_err(&dev->dev, - "Xen PCI frontend has not registered MSI/MSI-X support!\n"); + dev_err(&dev->dev, "Failed to create MSI%s! ret=%d!\n", + type == PCI_CAP_ID_MSI ? "" : "-X", irq); return irq; } diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index b27bccd4390f..821cb41f00e6 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -89,7 +89,7 @@ static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value) } static void ce4100_serial_fixup(int port, struct uart_port *up, - unsigned short *capabilites) + u32 *capabilites) { #ifdef CONFIG_EARLY_PRINTK /* diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index bf99aa7005eb..936a488d6cf6 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -861,7 +861,7 @@ static void __init __efi_enter_virtual_mode(void) int count = 0, pg_shift = 0; void *new_memmap = NULL; efi_status_t status; - phys_addr_t pa; + unsigned long pa; efi.systab = NULL; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 58b0f801f66f..319148bd4b05 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -31,6 +31,7 @@ #include <linux/io.h> #include <linux/reboot.h> #include <linux/slab.h> +#include <linux/ucs2_string.h> #include <asm/setup.h> #include <asm/page.h> @@ -211,6 +212,35 @@ void efi_sync_low_kernel_mappings(void) memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); } +/* + * Wrapper for slow_virt_to_phys() that handles NULL addresses. + */ +static inline phys_addr_t +virt_to_phys_or_null_size(void *va, unsigned long size) +{ + bool bad_size; + + if (!va) + return 0; + + if (virt_addr_valid(va)) + return virt_to_phys(va); + + /* + * A fully aligned variable on the stack is guaranteed not to + * cross a page bounary. Try to catch strings on the stack by + * checking that 'size' is a power of two. + */ + bad_size = size > PAGE_SIZE || !is_power_of_2(size); + + WARN_ON(!IS_ALIGNED((unsigned long)va, size) || bad_size); + + return slow_virt_to_phys(va); +} + +#define virt_to_phys_or_null(addr) \ + virt_to_phys_or_null_size((addr), sizeof(*(addr))) + int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { unsigned long pfn, text; @@ -494,8 +524,8 @@ static efi_status_t efi_thunk_get_time(efi_time_t *tm, efi_time_cap_t *tc) spin_lock(&rtc_lock); - phys_tm = virt_to_phys(tm); - phys_tc = virt_to_phys(tc); + phys_tm = virt_to_phys_or_null(tm); + phys_tc = virt_to_phys_or_null(tc); status = efi_thunk(get_time, phys_tm, phys_tc); @@ -511,7 +541,7 @@ static efi_status_t efi_thunk_set_time(efi_time_t *tm) spin_lock(&rtc_lock); - phys_tm = virt_to_phys(tm); + phys_tm = virt_to_phys_or_null(tm); status = efi_thunk(set_time, phys_tm); @@ -529,9 +559,9 @@ efi_thunk_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending, spin_lock(&rtc_lock); - phys_enabled = virt_to_phys(enabled); - phys_pending = virt_to_phys(pending); - phys_tm = virt_to_phys(tm); + phys_enabled = virt_to_phys_or_null(enabled); + phys_pending = virt_to_phys_or_null(pending); + phys_tm = virt_to_phys_or_null(tm); status = efi_thunk(get_wakeup_time, phys_enabled, phys_pending, phys_tm); @@ -549,7 +579,7 @@ efi_thunk_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) spin_lock(&rtc_lock); - phys_tm = virt_to_phys(tm); + phys_tm = virt_to_phys_or_null(tm); status = efi_thunk(set_wakeup_time, enabled, phys_tm); @@ -558,6 +588,10 @@ efi_thunk_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) return status; } +static unsigned long efi_name_size(efi_char16_t *name) +{ + return ucs2_strsize(name, EFI_VAR_NAME_LEN) + 1; +} static efi_status_t efi_thunk_get_variable(efi_char16_t *name, efi_guid_t *vendor, @@ -567,11 +601,11 @@ efi_thunk_get_variable(efi_char16_t *name, efi_guid_t *vendor, u32 phys_name, phys_vendor, phys_attr; u32 phys_data_size, phys_data; - phys_data_size = virt_to_phys(data_size); - phys_vendor = virt_to_phys(vendor); - phys_name = virt_to_phys(name); - phys_attr = virt_to_phys(attr); - phys_data = virt_to_phys(data); + phys_data_size = virt_to_phys_or_null(data_size); + phys_vendor = virt_to_phys_or_null(vendor); + phys_name = virt_to_phys_or_null_size(name, efi_name_size(name)); + phys_attr = virt_to_phys_or_null(attr); + phys_data = virt_to_phys_or_null_size(data, *data_size); status = efi_thunk(get_variable, phys_name, phys_vendor, phys_attr, phys_data_size, phys_data); @@ -586,9 +620,9 @@ efi_thunk_set_variable(efi_char16_t *name, efi_guid_t *vendor, u32 phys_name, phys_vendor, phys_data; efi_status_t status; - phys_name = virt_to_phys(name); - phys_vendor = virt_to_phys(vendor); - phys_data = virt_to_phys(data); + phys_name = virt_to_phys_or_null_size(name, efi_name_size(name)); + phys_vendor = virt_to_phys_or_null(vendor); + phys_data = virt_to_phys_or_null_size(data, data_size); /* If data_size is > sizeof(u32) we've got problems */ status = efi_thunk(set_variable, phys_name, phys_vendor, @@ -605,9 +639,9 @@ efi_thunk_get_next_variable(unsigned long *name_size, efi_status_t status; u32 phys_name_size, phys_name, phys_vendor; - phys_name_size = virt_to_phys(name_size); - phys_vendor = virt_to_phys(vendor); - phys_name = virt_to_phys(name); + phys_name_size = virt_to_phys_or_null(name_size); + phys_vendor = virt_to_phys_or_null(vendor); + phys_name = virt_to_phys_or_null_size(name, *name_size); status = efi_thunk(get_next_variable, phys_name_size, phys_name, phys_vendor); @@ -621,7 +655,7 @@ efi_thunk_get_next_high_mono_count(u32 *count) efi_status_t status; u32 phys_count; - phys_count = virt_to_phys(count); + phys_count = virt_to_phys_or_null(count); status = efi_thunk(get_next_high_mono_count, phys_count); return status; @@ -633,7 +667,7 @@ efi_thunk_reset_system(int reset_type, efi_status_t status, { u32 phys_data; - phys_data = virt_to_phys(data); + phys_data = virt_to_phys_or_null_size(data, data_size); efi_thunk(reset_system, reset_type, status, data_size, phys_data); } @@ -661,9 +695,9 @@ efi_thunk_query_variable_info(u32 attr, u64 *storage_space, if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - phys_storage = virt_to_phys(storage_space); - phys_remaining = virt_to_phys(remaining_space); - phys_max = virt_to_phys(max_variable_size); + phys_storage = virt_to_phys_or_null(storage_space); + phys_remaining = virt_to_phys_or_null(remaining_space); + phys_max = virt_to_phys_or_null(max_variable_size); status = efi_thunk(query_variable_info, attr, phys_storage, phys_remaining, phys_max); diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index 429d08be7848..dd6cfa4ad3ac 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -28,4 +28,4 @@ obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_pcal9555a.o obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o # MISC Devices obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o -obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_wdt.o +obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_mrfld_wdt.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c index de734134bc8d..3f1f1c77d090 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c @@ -1,5 +1,5 @@ /* - * platform_wdt.c: Watchdog platform library file + * Intel Merrifield watchdog platform device library file * * (C) Copyright 2014 Intel Corporation * Author: David Cohen <david.a.cohen@linux.intel.com> @@ -14,7 +14,9 @@ #include <linux/interrupt.h> #include <linux/platform_device.h> #include <linux/platform_data/intel-mid_wdt.h> + #include <asm/intel-mid.h> +#include <asm/intel_scu_ipc.h> #include <asm/io_apic.h> #define TANGIER_EXT_TIMER0_MSI 15 @@ -50,14 +52,34 @@ static struct intel_mid_wdt_pdata tangier_pdata = { .probe = tangier_probe, }; -static int __init register_mid_wdt(void) +static int wdt_scu_status_change(struct notifier_block *nb, + unsigned long code, void *data) { - if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER) { - wdt_dev.dev.platform_data = &tangier_pdata; - return platform_device_register(&wdt_dev); + if (code == SCU_DOWN) { + platform_device_unregister(&wdt_dev); + return 0; } - return -ENODEV; + return platform_device_register(&wdt_dev); } +static struct notifier_block wdt_scu_notifier = { + .notifier_call = wdt_scu_status_change, +}; + +static int __init register_mid_wdt(void) +{ + if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) + return -ENODEV; + + wdt_dev.dev.platform_data = &tangier_pdata; + + /* + * We need to be sure that the SCU IPC is ready before watchdog device + * can be registered: + */ + intel_scu_notifier_add(&wdt_scu_notifier); + + return 0; +} rootfs_initcall(register_mid_wdt); diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c index 1eb47b6298c2..e793fe509971 100644 --- a/arch/x86/platform/intel-mid/mfld.c +++ b/arch/x86/platform/intel-mid/mfld.c @@ -49,8 +49,13 @@ static unsigned long __init mfld_calibrate_tsc(void) fast_calibrate = ratio * fsb; pr_debug("read penwell tsc %lu khz\n", fast_calibrate); lapic_timer_frequency = fsb * 1000 / HZ; - /* mark tsc clocksource as reliable */ - set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); + + /* + * TSC on Intel Atom SoCs is reliable and of known frequency. + * See tsc_msr.c for details. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); return fast_calibrate; } diff --git a/arch/x86/platform/intel-mid/mrfld.c b/arch/x86/platform/intel-mid/mrfld.c index 59253db41bbc..e0607c77a1bd 100644 --- a/arch/x86/platform/intel-mid/mrfld.c +++ b/arch/x86/platform/intel-mid/mrfld.c @@ -78,8 +78,12 @@ static unsigned long __init tangier_calibrate_tsc(void) pr_debug("Setting lapic_timer_frequency = %d\n", lapic_timer_frequency); - /* mark tsc clocksource as reliable */ - set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); + /* + * TSC on Intel Atom SoCs is reliable and of known frequency. + * See tsc_msr.c for details. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); return fast_calibrate; } diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c index 5d3b45ad1c03..ef03852ea6e8 100644 --- a/arch/x86/platform/intel-mid/pwr.c +++ b/arch/x86/platform/intel-mid/pwr.c @@ -270,7 +270,25 @@ int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state) return 0; } -EXPORT_SYMBOL_GPL(intel_mid_pci_set_power_state); + +pci_power_t intel_mid_pci_get_power_state(struct pci_dev *pdev) +{ + struct mid_pwr *pwr = midpwr; + int id, reg, bit; + u32 power; + + if (!pwr || !pwr->available) + return PCI_UNKNOWN; + + id = intel_mid_pwr_get_lss_id(pdev); + if (id < 0) + return PCI_UNKNOWN; + + reg = (id * LSS_PWS_BITS) / 32; + bit = (id * LSS_PWS_BITS) % 32; + power = mid_pwr_get_state(pwr, reg); + return (__force pci_power_t)((power >> bit) & 3); +} void intel_mid_pwr_power_off(void) { diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 55130846ac87..c0533fbc39e3 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -196,6 +196,7 @@ static int xo15_sci_remove(struct acpi_device *device) return 0; } +#ifdef CONFIG_PM_SLEEP static int xo15_sci_resume(struct device *dev) { /* Enable all EC events */ @@ -207,6 +208,7 @@ static int xo15_sci_resume(struct device *dev) return 0; } +#endif static SIMPLE_DEV_PM_OPS(xo15_sci_pm, NULL, xo15_sci_resume); diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index b4d5e95fe4df..4a6a5a26c582 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -40,7 +40,15 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) */ return BIOS_STATUS_UNIMPLEMENTED; - ret = efi_call_virt_pointer(tab, function, (u64)which, a1, a2, a3, a4, a5); + /* + * If EFI_OLD_MEMMAP is set, we need to fall back to using our old EFI + * callback method, which uses efi_call() directly, with the kernel page tables: + */ + if (unlikely(test_bit(EFI_OLD_MEMMAP, &efi.flags))) + ret = efi_call((void *)__va(tab->function), (u64)which, a1, a2, a3, a4, a5); + else + ret = efi_call_virt_pointer(tab, function, (u64)which, a1, a2, a3, a4, a5); + return ret; } EXPORT_SYMBOL_GPL(uv_bios_call); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 9e42842e924a..766d4d3529a1 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -19,7 +19,6 @@ #include <asm/uv/uv_hub.h> #include <asm/uv/uv_bau.h> #include <asm/apic.h> -#include <asm/idle.h> #include <asm/tsc.h> #include <asm/irq_vectors.h> #include <asm/timer.h> diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index cd5173a2733f..8410e7d0a5b5 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -387,8 +387,8 @@ static void uv_nmi_dump_cpu_ip_hdr(void) /* Dump Instruction Pointer info */ static void uv_nmi_dump_cpu_ip(int cpu, struct pt_regs *regs) { - pr_info("UV: %4d %6d %-32.32s ", cpu, current->pid, current->comm); - printk_address(regs->ip); + pr_info("UV: %4d %6d %-32.32s %pS", + cpu, current->pid, current->comm, (void *)regs->ip); } /* diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 53cace2ec0e2..66ade16c7693 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -252,6 +252,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) fix_processor_context(); do_fpu_end(); + tsc_verify_tsc_adjust(true); x86_platform.restore_sched_clock_state(); mtrr_bp_restore(); perf_restore_debug_store(); diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 9634557a5444..ded2e8272382 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -11,6 +11,10 @@ #include <linux/gfp.h> #include <linux/smp.h> #include <linux/suspend.h> +#include <linux/scatterlist.h> +#include <linux/kdebug.h> + +#include <crypto/hash.h> #include <asm/init.h> #include <asm/proto.h> @@ -177,14 +181,86 @@ int pfn_is_nosave(unsigned long pfn) return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); } +#define MD5_DIGEST_SIZE 16 + struct restore_data_record { unsigned long jump_address; unsigned long jump_address_phys; unsigned long cr3; unsigned long magic; + u8 e820_digest[MD5_DIGEST_SIZE]; }; -#define RESTORE_MAGIC 0x123456789ABCDEF0UL +#define RESTORE_MAGIC 0x23456789ABCDEF01UL + +#if IS_BUILTIN(CONFIG_CRYPTO_MD5) +/** + * get_e820_md5 - calculate md5 according to given e820 map + * + * @map: the e820 map to be calculated + * @buf: the md5 result to be stored to + */ +static int get_e820_md5(struct e820map *map, void *buf) +{ + struct scatterlist sg; + struct crypto_ahash *tfm; + int size; + int ret = 0; + + tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return -ENOMEM; + + { + AHASH_REQUEST_ON_STACK(req, tfm); + size = offsetof(struct e820map, map) + + sizeof(struct e820entry) * map->nr_map; + ahash_request_set_tfm(req, tfm); + sg_init_one(&sg, (u8 *)map, size); + ahash_request_set_callback(req, 0, NULL, NULL); + ahash_request_set_crypt(req, &sg, buf, size); + + if (crypto_ahash_digest(req)) + ret = -EINVAL; + ahash_request_zero(req); + } + crypto_free_ahash(tfm); + + return ret; +} + +static void hibernation_e820_save(void *buf) +{ + get_e820_md5(e820_saved, buf); +} + +static bool hibernation_e820_mismatch(void *buf) +{ + int ret; + u8 result[MD5_DIGEST_SIZE]; + + memset(result, 0, MD5_DIGEST_SIZE); + /* If there is no digest in suspend kernel, let it go. */ + if (!memcmp(result, buf, MD5_DIGEST_SIZE)) + return false; + + ret = get_e820_md5(e820_saved, result); + if (ret) + return true; + + return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false; +} +#else +static void hibernation_e820_save(void *buf) +{ +} + +static bool hibernation_e820_mismatch(void *buf) +{ + /* If md5 is not builtin for restore kernel, let it go. */ + return false; +} +#endif /** * arch_hibernation_header_save - populate the architecture specific part @@ -201,6 +277,9 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) rdr->jump_address_phys = __pa_symbol(&restore_registers); rdr->cr3 = restore_cr3; rdr->magic = RESTORE_MAGIC; + + hibernation_e820_save(rdr->e820_digest); + return 0; } @@ -216,5 +295,16 @@ int arch_hibernation_header_restore(void *addr) restore_jump_address = rdr->jump_address; jump_address_phys = rdr->jump_address_phys; restore_cr3 = rdr->cr3; - return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; + + if (rdr->magic != RESTORE_MAGIC) { + pr_crit("Unrecognized hibernate image header format!\n"); + return -EINVAL; + } + + if (hibernation_e820_mismatch(rdr->e820_digest)) { + pr_crit("Hibernate inconsistent memory map detected!\n"); + return -ENODEV; + } + + return 0; } diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index ac58c1616408..555b9fa0ad43 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -16,6 +16,7 @@ KCOV_INSTRUMENT := n KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -MD -Os -mcmodel=large KBUILD_CFLAGS += -m$(BITS) +KBUILD_CFLAGS += $(call cc-option,-fno-PIE) $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE $(call if_changed,ld) diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index 1ac76479c266..8730c2882fff 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -275,6 +275,8 @@ static void do_inject(void) unsigned int cpu = i_mce.extcpu; u8 b = i_mce.bank; + rdtscll(i_mce.tsc); + if (i_mce.misc) i_mce.status |= MCI_STATUS_MISCV; diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 25012abc3409..4463fa72db94 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -69,7 +69,7 @@ $(obj)/realmode.relocs: $(obj)/realmode.elf FORCE # --------------------------------------------------------------------------- -KBUILD_CFLAGS := $(LINUXINCLUDE) $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \ +KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \ -I$(srctree)/arch/x86/boot KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c index ba70ff232917..1972565ab106 100644 --- a/arch/x86/tools/insn_sanity.c +++ b/arch/x86/tools/insn_sanity.c @@ -269,7 +269,8 @@ int main(int argc, char **argv) insns++; } - fprintf(stdout, "%s: %s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", + fprintf((errors) ? stderr : stdout, + "%s: %s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", prog, (errors) ? "Failure" : "Success", insns, diff --git a/arch/x86/tools/relocs.h b/arch/x86/tools/relocs.h index f59590645b68..1d23bf953a4a 100644 --- a/arch/x86/tools/relocs.h +++ b/arch/x86/tools/relocs.h @@ -16,7 +16,7 @@ #include <regex.h> #include <tools/le_byteshift.h> -void die(char *fmt, ...); +void die(char *fmt, ...) __attribute__((noreturn)); #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index 56f04db0c9c0..ecf31e0358c8 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -167,7 +167,7 @@ int main(int argc, char **argv) fprintf(stderr, "Warning: decoded and checked %d" " instructions with %d warnings\n", insns, warnings); else - fprintf(stderr, "Succeed: decoded and checked %d" + fprintf(stdout, "Success: decoded and checked %d" " instructions\n", insns); return 0; } diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index 233ee09c1ce8..c77db2288982 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -26,7 +26,6 @@ static inline void rep_nop(void) } #define cpu_relax() rep_nop() -#define cpu_relax_lowlatency() cpu_relax() #define task_pt_regs(t) (&(t)->thread.regs) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c0fdd57da7aa..ced7027b3fbc 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -980,17 +980,6 @@ static void xen_io_delay(void) { } -static void xen_clts(void) -{ - struct multicall_space mcs; - - mcs = xen_mc_entry(0); - - MULTI_fpu_taskswitch(mcs.mc, 0); - - xen_mc_issue(PARAVIRT_LAZY_CPU); -} - static DEFINE_PER_CPU(unsigned long, xen_cr0_value); static unsigned long xen_read_cr0(void) @@ -1233,8 +1222,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .set_debugreg = xen_set_debugreg, .get_debugreg = xen_get_debugreg, - .clts = xen_clts, - .read_cr0 = xen_read_cr0, .write_cr0 = xen_write_cr0, @@ -1837,6 +1824,7 @@ static void __init init_hvm_pv_info(void) xen_domain_type = XEN_HVM_DOMAIN; } +#endif static int xen_cpu_up_prepare(unsigned int cpu) { @@ -1887,6 +1875,7 @@ static int xen_cpu_up_online(unsigned int cpu) return 0; } +#ifdef CONFIG_XEN_PVHVM #ifdef CONFIG_KEXEC_CORE static void xen_hvm_shutdown(void) { diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 0e98e5d241d0..a9fafb5c8738 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -19,7 +19,6 @@ int xen_swiotlb __read_mostly; static struct dma_map_ops xen_swiotlb_dma_ops = { - .mapping_error = xen_swiotlb_dma_mapping_error, .alloc = xen_swiotlb_alloc_coherent, .free = xen_swiotlb_free_coherent, .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f8960fca0827..8c394e30e5fe 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -41,7 +41,7 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; unsigned long xen_released_pages; /* E820 map used during setting up memory. */ -static struct e820entry xen_e820_map[E820MAX] __initdata; +static struct e820entry xen_e820_map[E820_X_MAX] __initdata; static u32 xen_e820_map_entries __initdata; /* @@ -750,7 +750,7 @@ char * __init xen_memory_setup(void) max_pfn = min(max_pfn, xen_start_info->nr_pages); mem_end = PFN_PHYS(max_pfn); - memmap.nr_entries = E820MAX; + memmap.nr_entries = ARRAY_SIZE(xen_e820_map); set_xen_guest_handle(memmap.buffer, xen_e820_map); op = xen_initial_domain() ? @@ -923,7 +923,7 @@ char * __init xen_auto_xlated_memory_setup(void) int i; int rc; - memmap.nr_entries = E820MAX; + memmap.nr_entries = ARRAY_SIZE(xen_e820_map); set_xen_guest_handle(memmap.buffer, xen_e820_map); rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 9fa27ceeecfd..311acad7dad2 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -87,12 +87,6 @@ static void cpu_bringup(void) cpu_data(cpu).x86_max_cores = 1; set_cpu_sibling_map(cpu); - /* - * identify_cpu() may have set logical_pkg_id to -1 due - * to incorrect phys_proc_id. Let's re-comupte it. - */ - topology_update_package_map(apic->cpu_present_to_apicid(cpu), cpu); - xen_setup_cpu_clockevents(); notify_cpu_starting(cpu); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 3d6e0064cbfc..e8a9ea7d7a21 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -114,6 +114,7 @@ void xen_uninit_lock_cpu(int cpu) per_cpu(irq_name, cpu) = NULL; } +PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen); /* * Our init of PV spinlocks is split in two init functions due to us @@ -137,6 +138,7 @@ void __init xen_init_spinlocks(void) pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); pv_lock_ops.wait = xen_qlock_wait; pv_lock_ops.kick = xen_qlock_kick; + pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen); } /* |