diff options
Diffstat (limited to 'arch/x86')
345 files changed, 14433 insertions, 6797 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 96d058a87100..ab2ed5328f0a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,12 +24,14 @@ config X86 select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_SG_CHAIN + select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT @@ -82,6 +84,8 @@ config X86 select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP select HAVE_ARCH_KGDB select HAVE_ARCH_KMEMCHECK + select HAVE_ARCH_MMAP_RND_BITS if MMU + select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_SOFT_DIRTY if X86_64 select HAVE_ARCH_TRACEHOOK @@ -96,7 +100,6 @@ config X86 select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW select HAVE_DMA_API_DEBUG - select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS @@ -177,12 +180,23 @@ config LOCKDEP_SUPPORT config STACKTRACE_SUPPORT def_bool y -config HAVE_LATENCYTOP_SUPPORT - def_bool y - config MMU def_bool y +config ARCH_MMAP_RND_BITS_MIN + default 28 if 64BIT + default 8 + +config ARCH_MMAP_RND_BITS_MAX + default 32 if 64BIT + default 16 + +config ARCH_MMAP_RND_COMPAT_BITS_MIN + default 8 + +config ARCH_MMAP_RND_COMPAT_BITS_MAX + default 16 + config SBUS bool @@ -349,6 +363,17 @@ config X86_FEATURE_NAMES If in doubt, say Y. +config X86_FAST_FEATURE_TESTS + bool "Fast CPU feature tests" if EMBEDDED + default y + ---help--- + Some fast-paths in the kernel depend on the capabilities of the CPU. + Say Y here for the kernel to patch in the appropriate code at runtime + based on the capabilities of the CPU. The infrastructure for patching + code at runtime takes up some additional space; space-constrained + embedded systems may wish to say N here to produce smaller, slightly + slower code. + config X86_X2APIC bool "Support x2apic" depends on X86_LOCAL_APIC && X86_64 && (IRQ_REMAP || HYPERVISOR_GUEST) @@ -450,6 +475,7 @@ config X86_UV depends on X86_64 depends on X86_EXTENDED_PLATFORM depends on NUMA + depends on EFI depends on X86_X2APIC depends on PCI ---help--- @@ -484,11 +510,10 @@ config X86_INTEL_CE config X86_INTEL_MID bool "Intel MID platform support" - depends on X86_32 depends on X86_EXTENDED_PLATFORM depends on X86_PLATFORM_DEVICES depends on PCI - depends on PCI_GOANY + depends on X86_64 || (PCI_GOANY && X86_32) depends on X86_IO_APIC select SFI select I2C @@ -523,9 +548,10 @@ config X86_INTEL_QUARK config X86_INTEL_LPSS bool "Intel Low Power Subsystem Support" - depends on ACPI + depends on X86 && ACPI select COMMON_CLK select PINCTRL + select IOSF_MBI ---help--- Select to build support for Intel Low Power Subsystem such as found on Intel Lynxpoint PCH. Selecting this option enables @@ -687,6 +713,14 @@ config PARAVIRT_SPINLOCKS If you are unsure how to answer this question, answer Y. +config QUEUED_LOCK_STAT + bool "Paravirt queued spinlock statistics" + depends on PARAVIRT_SPINLOCKS && DEBUG_FS && QUEUED_SPINLOCKS + ---help--- + Enable the collection of statistical data on the slowpath + behavior of paravirtualized queued spinlocks and report + them on debugfs. + source "arch/x86/xen/Kconfig" config KVM_GUEST @@ -1123,8 +1157,10 @@ config X86_REBOOTFIXUPS Say N otherwise. config MICROCODE - tristate "CPU microcode loading support" + bool "CPU microcode loading support" + default y depends on CPU_SUP_AMD || CPU_SUP_INTEL + depends on BLK_DEV_INITRD select FW_LOADER ---help--- @@ -1166,24 +1202,6 @@ config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE -config MICROCODE_INTEL_EARLY - bool - -config MICROCODE_AMD_EARLY - bool - -config MICROCODE_EARLY - bool "Early load microcode" - depends on MICROCODE=y && BLK_DEV_INITRD - select MICROCODE_INTEL_EARLY if MICROCODE_INTEL - select MICROCODE_AMD_EARLY if MICROCODE_AMD - default y - help - This option provides functionality to read additional microcode data - at the beginning of initrd image. The data tells kernel to load - microcode to CPU's as early as possible. No functional change if no - microcode data is glued to the initrd, therefore it's safe to say Y. - config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" ---help--- @@ -2043,6 +2061,55 @@ config COMPAT_VDSO If unsure, say N: if you are compiling your own kernel, you are unlikely to be using a buggy version of glibc. +choice + prompt "vsyscall table for legacy applications" + depends on X86_64 + default LEGACY_VSYSCALL_EMULATE + help + Legacy user code that does not know how to find the vDSO expects + to be able to issue three syscalls by calling fixed addresses in + kernel space. Since this location is not randomized with ASLR, + it can be used to assist security vulnerability exploitation. + + This setting can be changed at boot time via the kernel command + line parameter vsyscall=[native|emulate|none]. + + On a system with recent enough glibc (2.14 or newer) and no + static binaries, you can say None without a performance penalty + to improve security. + + If unsure, select "Emulate". + + config LEGACY_VSYSCALL_NATIVE + bool "Native" + help + Actual executable code is located in the fixed vsyscall + address mapping, implementing time() efficiently. Since + this makes the mapping executable, it can be used during + security vulnerability exploitation (traditionally as + ROP gadgets). This configuration is not recommended. + + config LEGACY_VSYSCALL_EMULATE + bool "Emulate" + help + The kernel traps and emulates calls into the fixed + vsyscall address mapping. This makes the mapping + non-executable, but it still contains known contents, + which could be used in certain rare security vulnerability + exploits. This configuration is recommended when userspace + still uses the vsyscall area. + + config LEGACY_VSYSCALL_NONE + bool "None" + help + There will be no vsyscall mapping at all. This will + eliminate any risk of ASLR bypass due to the vsyscall + fixed address mapping. Attempts to use the vsyscalls + will be reported to dmesg, so that either old or + malicious userspace programs can be identified. + +endchoice + config CMDLINE_BOOL bool "Built-in kernel command line" ---help--- @@ -2632,6 +2699,19 @@ config PMC_ATOM def_bool y depends on PCI +config VMD + depends on PCI_MSI + tristate "Volume Management Device Driver" + default N + ---help--- + Adds support for the Intel Volume Management Device (VMD). VMD is a + secondary PCI host bridge that allows PCI Express root ports, + and devices attached to them, to be removed from the default + PCI domain and placed within the VMD domain. This provides + more bus resources than are otherwise possible with a + single domain. If you know your system provides one of these and + has devices attached to it, say Y; if you are not sure, say N. + source "net/Kconfig" source "drivers/Kconfig" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 6983314c8b37..3ba5ff2f2d08 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -3,10 +3,6 @@ choice prompt "Processor family" default M686 if X86_32 default GENERIC_CPU if X86_64 - -config M486 - bool "486" - depends on X86_32 ---help--- This is the processor type of your CPU. This information is used for optimizing purposes. In order to compile a kernel @@ -23,9 +19,9 @@ config M486 Here are the settings recommended for greatest speed: - "486" for the AMD/Cyrix/IBM/Intel 486DX/DX2/DX4 or - SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. + SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. - "586" for generic Pentium CPUs lacking the TSC - (time stamp counter) register. + (time stamp counter) register. - "Pentium-Classic" for the Intel Pentium. - "Pentium-MMX" for the Intel Pentium MMX. - "Pentium-Pro" for the Intel Pentium Pro. @@ -34,17 +30,31 @@ config M486 - "Pentium-4" for the Intel Pentium 4 or P4-based Celeron. - "K6" for the AMD K6, K6-II and K6-III (aka K6-3D). - "Athlon" for the AMD K7 family (Athlon/Duron/Thunderbird). + - "Opteron/Athlon64/Hammer/K8" for all K8 and newer AMD CPUs. - "Crusoe" for the Transmeta Crusoe series. - "Efficeon" for the Transmeta Efficeon series. - "Winchip-C6" for original IDT Winchip. - "Winchip-2" for IDT Winchips with 3dNow! capabilities. + - "AMD Elan" for the 32-bit AMD Elan embedded CPU. - "GeodeGX1" for Geode GX1 (Cyrix MediaGX). - "Geode GX/LX" For AMD Geode GX and LX processors. - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. - "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above). - "VIA C7" for VIA C7. + - "Intel P4" for the Pentium 4/Netburst microarchitecture. + - "Core 2/newer Xeon" for all core2 and newer Intel CPUs. + - "Intel Atom" for the Atom-microarchitecture CPUs. + - "Generic-x86-64" for a kernel which runs on any x86-64 CPU. + + See each option's help text for additional details. If you don't know + what to do, choose "486". - If you don't know what to do, choose "486". +config M486 + bool "486" + depends on X86_32 + ---help--- + Select this for an 486-class CPU such as AMD/Cyrix/IBM/Intel + 486DX/DX2/DX4 or SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. config M586 bool "586/K5/5x86/6x86/6x86MX" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index d8c0d3266173..9b18ed97a8a2 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -5,23 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" -config STRICT_DEVMEM - bool "Filter access to /dev/mem" - ---help--- - If this option is disabled, you allow userspace (root) access to all - of memory, including kernel and userspace memory. Accidental - access to this is obviously disastrous, but specific access can - be used by people debugging the kernel. Note that with PAT support - enabled, even in this case there are restrictions on /dev/mem - use due to the cache aliasing requirements. - - If this option is switched on, the /dev/mem file only allows - userspace access to PCI space and the BIOS code and data regions. - This is sufficient for dosemu and X and all common users of - /dev/mem. - - If in doubt, say Y. - config X86_VERBOSE_BOOTUP bool "Enable verbose x86 bootup info messages" default y @@ -65,10 +48,14 @@ config EARLY_PRINTK_EFI This is useful for kernel debugging when your machine crashes very early before the console code is initialized. +config X86_PTDUMP_CORE + def_bool n + config X86_PTDUMP - bool "Export kernel pagetable layout to userspace via debugfs" + tristate "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL select DEBUG_FS + select X86_PTDUMP_CORE ---help--- Say Y here if you want to show the kernel pagetable layout in a debugfs file. This information is only useful for kernel developers @@ -79,7 +66,8 @@ config X86_PTDUMP config EFI_PGT_DUMP bool "Dump the EFI pagetable" - depends on EFI && X86_PTDUMP + depends on EFI + select X86_PTDUMP_CORE ---help--- Enable this if you want to dump the EFI page table before enabling virtual mode. This can be used to debug miscellaneous @@ -105,6 +93,34 @@ config DEBUG_RODATA_TEST feature as well as for the change_page_attr() infrastructure. If in doubt, say "N" +config DEBUG_WX + bool "Warn on W+X mappings at boot" + depends on DEBUG_RODATA + select X86_PTDUMP_CORE + ---help--- + Generate a warning if any W+X mappings are found at boot. + + This is useful for discovering cases where the kernel is leaving + W+X mappings after applying NX, as such mappings are a security risk. + + Look for a message in dmesg output like this: + + x86/mm: Checked W+X mappings: passed, no W+X pages found. + + or like this, if the check failed: + + x86/mm: Checked W+X mappings: FAILED, <N> W+X pages found. + + Note that even if the check fails, your kernel is possibly + still fine, as W+X mappings are not a security hole in + themselves, what they do is that they make the exploitation + of other unfixed kernel bugs easier. + + There is no runtime or memory usage effect of this option + once the kernel has booted up - it's a one time check. + + If in doubt, say "Y". + config DEBUG_SET_MODULE_RONX bool "Set loadable kernel module data as NX and text as RO" depends on MODULES diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 747860c696e1..4086abca0b32 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -159,15 +159,23 @@ endif sp-$(CONFIG_X86_32) := esp sp-$(CONFIG_X86_64) := rsp +# do binutils support CFI? +cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1) +# is .cfi_signal_frame supported too? +cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) +cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) + # does binutils support specific instructions? asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) +sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1) +sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1) -KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) -KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) +KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr) +KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr) LDFLAGS := -m elf_$(UTS_MACHINE) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 0d553e54171b..bbe1a62efc02 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -9,13 +9,13 @@ # Changed by many, many contributors over the years. # +KASAN_SANITIZE := n + # If you want to preset the SVGA mode, uncomment the next line and # set SVGA_MODE to whatever number you want. # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. # The number is the same as you would ordinarily press at bootup. -KASAN_SANITIZE := n - SVGA_MODE := -DSVGA_MODE=NORMAL_VGA targets := vmlinux.bin setup.bin setup.elf bzImage @@ -60,6 +60,7 @@ clean-files += cpustr.h KBUILD_CFLAGS := $(USERINCLUDE) $(REALMODE_CFLAGS) -D_SETUP KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n +UBSAN_SANITIZE := n $(obj)/bzImage: asflags-y := $(SVGA_MODE) diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 0033e96c3f09..9011a88353de 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -23,7 +23,6 @@ #include <stdarg.h> #include <linux/types.h> #include <linux/edd.h> -#include <asm/boot.h> #include <asm/setup.h> #include "bitops.h" #include "ctype.h" diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 0a291cdfaf77..f9ce75d80101 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -33,6 +33,7 @@ KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n +UBSAN_SANITIZE :=n LDFLAGS := -m elf_$(UTS_MACHINE) LDFLAGS_vmlinux := -T diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index db51c1f27446..583d539a4197 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -624,7 +624,7 @@ setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line, static efi_status_t __gop_query32(struct efi_graphics_output_protocol_32 *gop32, struct efi_graphics_output_mode_info **info, - unsigned long *size, u32 *fb_base) + unsigned long *size, u64 *fb_base) { struct efi_graphics_output_protocol_mode_32 *mode; efi_status_t status; @@ -650,7 +650,8 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, unsigned long nr_gops; u16 width, height; u32 pixels_per_scan_line; - u32 fb_base; + u32 ext_lfb_base; + u64 fb_base; struct efi_pixel_bitmask pixel_info; int pixel_format; efi_status_t status; @@ -667,7 +668,7 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, bool conout_found = false; void *dummy = NULL; u32 h = handles[i]; - u32 current_fb_base; + u64 current_fb_base; status = efi_call_early(handle_protocol, h, proto, (void **)&gop32); @@ -715,6 +716,13 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, si->lfb_width = width; si->lfb_height = height; si->lfb_base = fb_base; + + ext_lfb_base = (u64)(unsigned long)fb_base >> 32; + if (ext_lfb_base) { + si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + si->ext_lfb_base = ext_lfb_base; + } + si->pages = 1; setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); @@ -729,7 +737,7 @@ out: static efi_status_t __gop_query64(struct efi_graphics_output_protocol_64 *gop64, struct efi_graphics_output_mode_info **info, - unsigned long *size, u32 *fb_base) + unsigned long *size, u64 *fb_base) { struct efi_graphics_output_protocol_mode_64 *mode; efi_status_t status; @@ -755,7 +763,8 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, unsigned long nr_gops; u16 width, height; u32 pixels_per_scan_line; - u32 fb_base; + u32 ext_lfb_base; + u64 fb_base; struct efi_pixel_bitmask pixel_info; int pixel_format; efi_status_t status; @@ -772,7 +781,7 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, bool conout_found = false; void *dummy = NULL; u64 h = handles[i]; - u32 current_fb_base; + u64 current_fb_base; status = efi_call_early(handle_protocol, h, proto, (void **)&gop64); @@ -820,6 +829,13 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, si->lfb_width = width; si->lfb_height = height; si->lfb_base = fb_base; + + ext_lfb_base = (u64)(unsigned long)fb_base >> 32; + if (ext_lfb_base) { + si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + si->ext_lfb_base = ext_lfb_base; + } + si->pages = 1; setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 2d6b309c8e9a..6236b9ec4b76 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -154,7 +154,7 @@ extra_header_fields: #else .quad 0 # ImageBase #endif - .long CONFIG_PHYSICAL_ALIGN # SectionAlignment + .long 0x20 # SectionAlignment .long 0x20 # FileAlignment .word 0 # MajorOperatingSystemVersion .word 0 # MinorOperatingSystemVersion diff --git a/arch/x86/boot/video-mode.c b/arch/x86/boot/video-mode.c index aa8a96b052e3..95c7a818c0ed 100644 --- a/arch/x86/boot/video-mode.c +++ b/arch/x86/boot/video-mode.c @@ -19,6 +19,8 @@ #include "video.h" #include "vesa.h" +#include <uapi/asm/boot.h> + /* * Common variables */ diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index 05111bb8d018..77780e386e9b 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -13,6 +13,8 @@ * Select video mode */ +#include <uapi/asm/boot.h> + #include "boot.h" #include "video.h" #include "vesa.h" diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 9a2838cf0591..b9b912a44d61 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -5,6 +5,8 @@ avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ $(comma)4)$(comma)%ymm2,yes,no) +sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no) +sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no) obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o @@ -91,9 +93,15 @@ ifeq ($(avx2_supported),yes) sha1-ssse3-y += sha1_avx2_x86_64_asm.o poly1305-x86_64-y += poly1305-avx2-x86_64.o endif +ifeq ($(sha1_ni_supported),yes) +sha1-ssse3-y += sha1_ni_asm.o +endif crc32c-intel-y := crc32c-intel_glue.o crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o +ifeq ($(sha256_ni_supported),yes) +sha256-ssse3-y += sha256_ni_asm.o +endif sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index 4c65c70e628b..d84456924563 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -567,7 +567,8 @@ static int __init camellia_aesni_init(void) return -ENODEV; } - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index bacaa13acac5..93d8f295784e 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -559,7 +559,8 @@ static int __init camellia_aesni_init(void) return -ENODEV; } - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index be00aa48b2b5..8648158f3916 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -469,7 +469,8 @@ static int __init cast5_init(void) { const char *feature_name; - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 5dbba7224221..fca459578c35 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -591,7 +591,8 @@ static int __init cast6_init(void) { const char *feature_name; - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S index 712b13047b41..3a33124e9112 100644 --- a/arch/x86/crypto/chacha20-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -157,7 +157,9 @@ ENTRY(chacha20_4block_xor_ssse3) # done with the slightly better performing SSSE3 byte shuffling, # 7/12-bit word rotation uses traditional shift+OR. - sub $0x40,%rsp + mov %rsp,%r11 + sub $0x80,%rsp + and $~63,%rsp # x0..15[0-3] = s0..3[0..3] movq 0x00(%rdi),%xmm1 @@ -620,6 +622,6 @@ ENTRY(chacha20_4block_xor_ssse3) pxor %xmm1,%xmm15 movdqu %xmm15,0xf0(%rsi) - add $0x40,%rsp + mov %r11,%rsp ret ENDPROC(chacha20_4block_xor_ssse3) diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index effe2160b7c5..8baaff5af0b5 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -125,12 +125,12 @@ static struct crypto_alg alg = { static int __init chacha20_simd_mod_init(void) { - if (!cpu_has_ssse3) + if (!boot_cpu_has(X86_FEATURE_SSSE3)) return -ENODEV; #ifdef CONFIG_AS_AVX2 chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 && - cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL); + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); #endif return crypto_register_alg(&alg); } diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 81a595d75cf5..0e9871693f24 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -257,7 +257,7 @@ static int __init crc32c_intel_mod_init(void) if (!x86_match_cpu(crc32c_cpu_id)) return -ENODEV; #ifdef CONFIG_X86_64 - if (cpu_has_pclmulqdq) { + if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { alg.update = crc32c_pcl_intel_update; alg.finup = crc32c_pcl_intel_finup; alg.digest = crc32c_pcl_intel_digest; diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 225be06edc80..4fe27e074194 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -330,7 +330,7 @@ ENDPROC(crc_pcl) ## PCLMULQDQ tables ## Table is 128 entries x 2 words (8 bytes) each ################################################################ -.section .rotata, "a", %progbits +.section .rodata, "a", %progbits .align 8 K_table: .long 0x493c7d27, 0x00000001 diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 440df0c7a2ee..a69321a77783 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -219,6 +219,29 @@ static int ghash_async_final(struct ahash_request *req) } } +static int ghash_async_import(struct ahash_request *req, const void *in) +{ + struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + + ghash_async_init(req); + memcpy(dctx, in, sizeof(*dctx)); + return 0; + +} + +static int ghash_async_export(struct ahash_request *req, void *out) +{ + struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + + memcpy(out, dctx, sizeof(*dctx)); + return 0; + +} + static int ghash_async_digest(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); @@ -288,8 +311,11 @@ static struct ahash_alg ghash_async_alg = { .final = ghash_async_final, .setkey = ghash_async_setkey, .digest = ghash_async_digest, + .export = ghash_async_export, + .import = ghash_async_import, .halg = { .digestsize = GHASH_DIGEST_SIZE, + .statesize = sizeof(struct ghash_desc_ctx), .base = { .cra_name = "ghash", .cra_driver_name = "ghash-clmulni", diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index f7170d764f32..4264a3d59589 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -184,7 +184,7 @@ static int __init poly1305_simd_mod_init(void) #ifdef CONFIG_AS_AVX2 poly1305_use_avx2 = cpu_has_avx && cpu_has_avx2 && - cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL); + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); alg.descsize = sizeof(struct poly1305_simd_desc_ctx); if (poly1305_use_avx2) alg.descsize += 10 * sizeof(u32); diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 7d838dc4d888..6d198342e2de 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -542,7 +542,8 @@ static int __init init(void) pr_info("AVX2 instructions are not detected.\n"); return -ENODEV; } - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index da7dafc9b16d..5dc37026c7ce 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -597,7 +597,8 @@ static int __init serpent_init(void) { const char *feature_name; - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S new file mode 100644 index 000000000000..874a651b9e7d --- /dev/null +++ b/arch/x86/crypto/sha1_ni_asm.S @@ -0,0 +1,302 @@ +/* + * Intel SHA Extensions optimized implementation of a SHA-1 update function + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Sean Gulley <sean.m.gulley@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/linkage.h> + +#define DIGEST_PTR %rdi /* 1st arg */ +#define DATA_PTR %rsi /* 2nd arg */ +#define NUM_BLKS %rdx /* 3rd arg */ + +#define RSPSAVE %rax + +/* gcc conversion */ +#define FRAME_SIZE 32 /* space for 2x16 bytes */ + +#define ABCD %xmm0 +#define E0 %xmm1 /* Need two E's b/c they ping pong */ +#define E1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 +#define SHUF_MASK %xmm7 + + +/* + * Intel SHA Extensions optimized implementation of a SHA-1 update function + * + * The function takes a pointer to the current hash values, a pointer to the + * input data, and a number of 64 byte blocks to process. Once all blocks have + * been processed, the digest pointer is updated with the resulting hash value. + * The function only processes complete blocks, there is no functionality to + * store partial blocks. All message padding and hash value initialization must + * be done outside the update function. + * + * The indented lines in the loop are instructions related to rounds processing. + * The non-indented lines are instructions related to the message schedule. + * + * void sha1_ni_transform(uint32_t *digest, const void *data, + uint32_t numBlocks) + * digest : pointer to digest + * data: pointer to input data + * numBlocks: Number of blocks to process + */ +.text +.align 32 +ENTRY(sha1_ni_transform) + mov %rsp, RSPSAVE + sub $FRAME_SIZE, %rsp + and $~0xF, %rsp + + shl $6, NUM_BLKS /* convert to bytes */ + jz .Ldone_hash + add DATA_PTR, NUM_BLKS /* pointer to end of data */ + + /* load initial hash values */ + pinsrd $3, 1*16(DIGEST_PTR), E0 + movdqu 0*16(DIGEST_PTR), ABCD + pand UPPER_WORD_MASK(%rip), E0 + pshufd $0x1B, ABCD, ABCD + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + +.Lloop0: + /* Save hash values for addition after rounds */ + movdqa E0, (0*16)(%rsp) + movdqa ABCD, (1*16)(%rsp) + + /* Rounds 0-3 */ + movdqu 0*16(DATA_PTR), MSG0 + pshufb SHUF_MASK, MSG0 + paddd MSG0, E0 + movdqa ABCD, E1 + sha1rnds4 $0, E0, ABCD + + /* Rounds 4-7 */ + movdqu 1*16(DATA_PTR), MSG1 + pshufb SHUF_MASK, MSG1 + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG1, MSG0 + + /* Rounds 8-11 */ + movdqu 2*16(DATA_PTR), MSG2 + pshufb SHUF_MASK, MSG2 + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 12-15 */ + movdqu 3*16(DATA_PTR), MSG3 + pshufb SHUF_MASK, MSG3 + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 16-19 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 20-23 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 24-27 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 28-31 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 32-35 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 36-39 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 40-43 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 44-47 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 48-51 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 52-55 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 56-59 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 60-63 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $3, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 64-67 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $3, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 68-71 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $3, E1, ABCD + pxor MSG1, MSG3 + + /* Rounds 72-75 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $3, E0, ABCD + + /* Rounds 76-79 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1rnds4 $3, E1, ABCD + + /* Add current hash values with previously saved */ + sha1nexte (0*16)(%rsp), E0 + paddd (1*16)(%rsp), ABCD + + /* Increment data pointer and loop if more to process */ + add $64, DATA_PTR + cmp NUM_BLKS, DATA_PTR + jne .Lloop0 + + /* Write hash values back in the correct order */ + pshufd $0x1B, ABCD, ABCD + movdqu ABCD, 0*16(DIGEST_PTR) + pextrd $3, E0, 1*16(DIGEST_PTR) + +.Ldone_hash: + mov RSPSAVE, %rsp + + ret +ENDPROC(sha1_ni_transform) + +.data + +.align 64 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x000102030405060708090a0b0c0d0e0f +UPPER_WORD_MASK: + .octa 0xFFFFFFFF000000000000000000000000 diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 7c48e8b20848..dd14616b7739 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -31,24 +31,11 @@ #include <crypto/sha1_base.h> #include <asm/fpu/api.h> +typedef void (sha1_transform_fn)(u32 *digest, const char *data, + unsigned int rounds); -asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, - unsigned int rounds); -#ifdef CONFIG_AS_AVX -asmlinkage void sha1_transform_avx(u32 *digest, const char *data, - unsigned int rounds); -#endif -#ifdef CONFIG_AS_AVX2 -#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ - -asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, - unsigned int rounds); -#endif - -static void (*sha1_transform_asm)(u32 *, const char *, unsigned int); - -static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha1_update(struct shash_desc *desc, const u8 *data, + unsigned int len, sha1_transform_fn *sha1_xform) { struct sha1_state *sctx = shash_desc_ctx(desc); @@ -61,14 +48,14 @@ static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); sha1_base_do_update(desc, data, len, - (sha1_block_fn *)sha1_transform_asm); + (sha1_block_fn *)sha1_xform); kernel_fpu_end(); return 0; } -static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int sha1_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out, sha1_transform_fn *sha1_xform) { if (!irq_fpu_usable()) return crypto_sha1_finup(desc, data, len, out); @@ -76,32 +63,37 @@ static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); if (len) sha1_base_do_update(desc, data, len, - (sha1_block_fn *)sha1_transform_asm); - sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_asm); + (sha1_block_fn *)sha1_xform); + sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_xform); kernel_fpu_end(); return sha1_base_finish(desc, out); } -/* Add padding and return the message digest. */ -static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) +asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, + unsigned int rounds); + +static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - return sha1_ssse3_finup(desc, NULL, 0, out); + return sha1_update(desc, data, len, + (sha1_transform_fn *) sha1_transform_ssse3); } -#ifdef CONFIG_AS_AVX2 -static void sha1_apply_transform_avx2(u32 *digest, const char *data, - unsigned int rounds) +static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) { - /* Select the optimal transform based on data block size */ - if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE) - sha1_transform_avx2(digest, data, rounds); - else - sha1_transform_avx(digest, data, rounds); + return sha1_finup(desc, data, len, out, + (sha1_transform_fn *) sha1_transform_ssse3); +} + +/* Add padding and return the message digest. */ +static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) +{ + return sha1_ssse3_finup(desc, NULL, 0, out); } -#endif -static struct shash_alg alg = { +static struct shash_alg sha1_ssse3_alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = sha1_ssse3_update, @@ -110,7 +102,7 @@ static struct shash_alg alg = { .descsize = sizeof(struct sha1_state), .base = { .cra_name = "sha1", - .cra_driver_name= "sha1-ssse3", + .cra_driver_name = "sha1-ssse3", .cra_priority = 150, .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA1_BLOCK_SIZE, @@ -118,10 +110,62 @@ static struct shash_alg alg = { } }; +static int register_sha1_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + return crypto_register_shash(&sha1_ssse3_alg); + return 0; +} + +static void unregister_sha1_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_shash(&sha1_ssse3_alg); +} + #ifdef CONFIG_AS_AVX -static bool __init avx_usable(void) +asmlinkage void sha1_transform_avx(u32 *digest, const char *data, + unsigned int rounds); + +static int sha1_avx_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) { + return sha1_update(desc, data, len, + (sha1_transform_fn *) sha1_transform_avx); +} + +static int sha1_avx_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha1_finup(desc, data, len, out, + (sha1_transform_fn *) sha1_transform_avx); +} + +static int sha1_avx_final(struct shash_desc *desc, u8 *out) +{ + return sha1_avx_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha1_avx_alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_base_init, + .update = sha1_avx_update, + .final = sha1_avx_final, + .finup = sha1_avx_finup, + .descsize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name = "sha1-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static bool avx_usable(void) +{ + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { if (cpu_has_avx) pr_info("AVX detected but unusable.\n"); return false; @@ -130,55 +174,197 @@ static bool __init avx_usable(void) return true; } -#ifdef CONFIG_AS_AVX2 -static bool __init avx2_usable(void) +static int register_sha1_avx(void) +{ + if (avx_usable()) + return crypto_register_shash(&sha1_avx_alg); + return 0; +} + +static void unregister_sha1_avx(void) { - if (avx_usable() && cpu_has_avx2 && boot_cpu_has(X86_FEATURE_BMI1) && - boot_cpu_has(X86_FEATURE_BMI2)) + if (avx_usable()) + crypto_unregister_shash(&sha1_avx_alg); +} + +#else /* CONFIG_AS_AVX */ +static inline int register_sha1_avx(void) { return 0; } +static inline void unregister_sha1_avx(void) { } +#endif /* CONFIG_AS_AVX */ + + +#if defined(CONFIG_AS_AVX2) && (CONFIG_AS_AVX) +#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ + +asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, + unsigned int rounds); + +static bool avx2_usable(void) +{ + if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) + && boot_cpu_has(X86_FEATURE_BMI1) + && boot_cpu_has(X86_FEATURE_BMI2)) return true; return false; } + +static void sha1_apply_transform_avx2(u32 *digest, const char *data, + unsigned int rounds) +{ + /* Select the optimal transform based on data block size */ + if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE) + sha1_transform_avx2(digest, data, rounds); + else + sha1_transform_avx(digest, data, rounds); +} + +static int sha1_avx2_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha1_update(desc, data, len, + (sha1_transform_fn *) sha1_apply_transform_avx2); +} + +static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha1_finup(desc, data, len, out, + (sha1_transform_fn *) sha1_apply_transform_avx2); +} + +static int sha1_avx2_final(struct shash_desc *desc, u8 *out) +{ + return sha1_avx2_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha1_avx2_alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_base_init, + .update = sha1_avx2_update, + .final = sha1_avx2_final, + .finup = sha1_avx2_finup, + .descsize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name = "sha1-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int register_sha1_avx2(void) +{ + if (avx2_usable()) + return crypto_register_shash(&sha1_avx2_alg); + return 0; +} + +static void unregister_sha1_avx2(void) +{ + if (avx2_usable()) + crypto_unregister_shash(&sha1_avx2_alg); +} + +#else +static inline int register_sha1_avx2(void) { return 0; } +static inline void unregister_sha1_avx2(void) { } #endif + +#ifdef CONFIG_AS_SHA1_NI +asmlinkage void sha1_ni_transform(u32 *digest, const char *data, + unsigned int rounds); + +static int sha1_ni_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha1_update(desc, data, len, + (sha1_transform_fn *) sha1_ni_transform); +} + +static int sha1_ni_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha1_finup(desc, data, len, out, + (sha1_transform_fn *) sha1_ni_transform); +} + +static int sha1_ni_final(struct shash_desc *desc, u8 *out) +{ + return sha1_ni_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha1_ni_alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_base_init, + .update = sha1_ni_update, + .final = sha1_ni_final, + .finup = sha1_ni_finup, + .descsize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name = "sha1-ni", + .cra_priority = 250, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int register_sha1_ni(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) + return crypto_register_shash(&sha1_ni_alg); + return 0; +} + +static void unregister_sha1_ni(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) + crypto_unregister_shash(&sha1_ni_alg); +} + +#else +static inline int register_sha1_ni(void) { return 0; } +static inline void unregister_sha1_ni(void) { } #endif static int __init sha1_ssse3_mod_init(void) { - char *algo_name; + if (register_sha1_ssse3()) + goto fail; - /* test for SSSE3 first */ - if (cpu_has_ssse3) { - sha1_transform_asm = sha1_transform_ssse3; - algo_name = "SSSE3"; + if (register_sha1_avx()) { + unregister_sha1_ssse3(); + goto fail; } -#ifdef CONFIG_AS_AVX - /* allow AVX to override SSSE3, it's a little faster */ - if (avx_usable()) { - sha1_transform_asm = sha1_transform_avx; - algo_name = "AVX"; -#ifdef CONFIG_AS_AVX2 - /* allow AVX2 to override AVX, it's a little faster */ - if (avx2_usable()) { - sha1_transform_asm = sha1_apply_transform_avx2; - algo_name = "AVX2"; - } -#endif + if (register_sha1_avx2()) { + unregister_sha1_avx(); + unregister_sha1_ssse3(); + goto fail; } -#endif - if (sha1_transform_asm) { - pr_info("Using %s optimized SHA-1 implementation\n", algo_name); - return crypto_register_shash(&alg); + if (register_sha1_ni()) { + unregister_sha1_avx2(); + unregister_sha1_avx(); + unregister_sha1_ssse3(); + goto fail; } - pr_info("Neither AVX nor AVX2 nor SSSE3 is available/usable.\n"); + return 0; +fail: return -ENODEV; } static void __exit sha1_ssse3_mod_fini(void) { - crypto_unregister_shash(&alg); + unregister_sha1_ni(); + unregister_sha1_avx2(); + unregister_sha1_avx(); + unregister_sha1_ssse3(); } module_init(sha1_ssse3_mod_init); diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S new file mode 100644 index 000000000000..748cdf21a938 --- /dev/null +++ b/arch/x86/crypto/sha256_ni_asm.S @@ -0,0 +1,353 @@ +/* + * Intel SHA Extensions optimized implementation of a SHA-256 update function + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Sean Gulley <sean.m.gulley@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/linkage.h> + +#define DIGEST_PTR %rdi /* 1st arg */ +#define DATA_PTR %rsi /* 2nd arg */ +#define NUM_BLKS %rdx /* 3rd arg */ + +#define SHA256CONSTANTS %rax + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 +#define MSGTMP4 %xmm7 + +#define SHUF_MASK %xmm8 + +#define ABEF_SAVE %xmm9 +#define CDGH_SAVE %xmm10 + +/* + * Intel SHA Extensions optimized implementation of a SHA-256 update function + * + * The function takes a pointer to the current hash values, a pointer to the + * input data, and a number of 64 byte blocks to process. Once all blocks have + * been processed, the digest pointer is updated with the resulting hash value. + * The function only processes complete blocks, there is no functionality to + * store partial blocks. All message padding and hash value initialization must + * be done outside the update function. + * + * The indented lines in the loop are instructions related to rounds processing. + * The non-indented lines are instructions related to the message schedule. + * + * void sha256_ni_transform(uint32_t *digest, const void *data, + uint32_t numBlocks); + * digest : pointer to digest + * data: pointer to input data + * numBlocks: Number of blocks to process + */ + +.text +.align 32 +ENTRY(sha256_ni_transform) + + shl $6, NUM_BLKS /* convert to bytes */ + jz .Ldone_hash + add DATA_PTR, NUM_BLKS /* pointer to end of data */ + + /* + * load initial hash values + * Need to reorder these appropriately + * DCBA, HGFE -> ABEF, CDGH + */ + movdqu 0*16(DIGEST_PTR), STATE0 + movdqu 1*16(DIGEST_PTR), STATE1 + + pshufd $0xB1, STATE0, STATE0 /* CDAB */ + pshufd $0x1B, STATE1, STATE1 /* EFGH */ + movdqa STATE0, MSGTMP4 + palignr $8, STATE1, STATE0 /* ABEF */ + pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + lea K256(%rip), SHA256CONSTANTS + +.Lloop0: + /* Save hash values for addition after rounds */ + movdqa STATE0, ABEF_SAVE + movdqa STATE1, CDGH_SAVE + + /* Rounds 0-3 */ + movdqu 0*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP0 + paddd 0*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 4-7 */ + movdqu 1*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP1 + paddd 1*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movdqu 2*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP2 + paddd 2*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 */ + movdqu 3*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP3 + paddd 3*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + movdqa MSGTMP0, MSG + paddd 4*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + movdqa MSGTMP1, MSG + paddd 5*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + movdqa MSGTMP2, MSG + paddd 6*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + movdqa MSGTMP3, MSG + paddd 7*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + movdqa MSGTMP0, MSG + paddd 8*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + movdqa MSGTMP1, MSG + paddd 9*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + movdqa MSGTMP2, MSG + paddd 10*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + movdqa MSGTMP3, MSG + paddd 11*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + movdqa MSGTMP0, MSG + paddd 12*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + movdqa MSGTMP1, MSG + paddd 13*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 56-59 */ + movdqa MSGTMP2, MSG + paddd 14*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 60-63 */ + movdqa MSGTMP3, MSG + paddd 15*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Add current hash values with previously saved */ + paddd ABEF_SAVE, STATE0 + paddd CDGH_SAVE, STATE1 + + /* Increment data pointer and loop if more to process */ + add $64, DATA_PTR + cmp NUM_BLKS, DATA_PTR + jne .Lloop0 + + /* Write hash values back in the correct order */ + pshufd $0x1B, STATE0, STATE0 /* FEBA */ + pshufd $0xB1, STATE1, STATE1 /* DCHG */ + movdqa STATE0, MSGTMP4 + pblendw $0xF0, STATE1, STATE0 /* DCBA */ + palignr $8, MSGTMP4, STATE1 /* HGFE */ + + movdqu STATE0, 0*16(DIGEST_PTR) + movdqu STATE1, 1*16(DIGEST_PTR) + +.Ldone_hash: + + ret +ENDPROC(sha256_ni_transform) + +.data +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index f8097fc0d1d1..5f4d6086dc59 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -42,19 +42,10 @@ asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, u64 rounds); -#ifdef CONFIG_AS_AVX -asmlinkage void sha256_transform_avx(u32 *digest, const char *data, - u64 rounds); -#endif -#ifdef CONFIG_AS_AVX2 -asmlinkage void sha256_transform_rorx(u32 *digest, const char *data, - u64 rounds); -#endif - -static void (*sha256_transform_asm)(u32 *, const char *, u64); +typedef void (sha256_transform_fn)(u32 *digest, const char *data, u64 rounds); -static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha256_update(struct shash_desc *desc, const u8 *data, + unsigned int len, sha256_transform_fn *sha256_xform) { struct sha256_state *sctx = shash_desc_ctx(desc); @@ -67,14 +58,14 @@ static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); sha256_base_do_update(desc, data, len, - (sha256_block_fn *)sha256_transform_asm); + (sha256_block_fn *)sha256_xform); kernel_fpu_end(); return 0; } -static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int sha256_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out, sha256_transform_fn *sha256_xform) { if (!irq_fpu_usable()) return crypto_sha256_finup(desc, data, len, out); @@ -82,20 +73,32 @@ static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); if (len) sha256_base_do_update(desc, data, len, - (sha256_block_fn *)sha256_transform_asm); - sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_transform_asm); + (sha256_block_fn *)sha256_xform); + sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_xform); kernel_fpu_end(); return sha256_base_finish(desc, out); } +static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha256_update(desc, data, len, sha256_transform_ssse3); +} + +static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_finup(desc, data, len, out, sha256_transform_ssse3); +} + /* Add padding and return the message digest. */ static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) { return sha256_ssse3_finup(desc, NULL, 0, out); } -static struct shash_alg algs[] = { { +static struct shash_alg sha256_ssse3_algs[] = { { .digestsize = SHA256_DIGEST_SIZE, .init = sha256_base_init, .update = sha256_ssse3_update, @@ -127,10 +130,77 @@ static struct shash_alg algs[] = { { } } }; +static int register_sha256_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + return crypto_register_shashes(sha256_ssse3_algs, + ARRAY_SIZE(sha256_ssse3_algs)); + return 0; +} + +static void unregister_sha256_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_shashes(sha256_ssse3_algs, + ARRAY_SIZE(sha256_ssse3_algs)); +} + #ifdef CONFIG_AS_AVX -static bool __init avx_usable(void) +asmlinkage void sha256_transform_avx(u32 *digest, const char *data, + u64 rounds); + +static int sha256_avx_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha256_update(desc, data, len, sha256_transform_avx); +} + +static int sha256_avx_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_finup(desc, data, len, out, sha256_transform_avx); +} + +static int sha256_avx_final(struct shash_desc *desc, u8 *out) +{ + return sha256_avx_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha256_avx_algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = sha256_avx_update, + .final = sha256_avx_final, + .finup = sha256_avx_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = sha256_avx_update, + .final = sha256_avx_final, + .finup = sha256_avx_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static bool avx_usable(void) { - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { if (cpu_has_avx) pr_info("AVX detected but unusable.\n"); return false; @@ -138,47 +208,216 @@ static bool __init avx_usable(void) return true; } -#endif -static int __init sha256_ssse3_mod_init(void) +static int register_sha256_avx(void) { - /* test for SSSE3 first */ - if (cpu_has_ssse3) - sha256_transform_asm = sha256_transform_ssse3; + if (avx_usable()) + return crypto_register_shashes(sha256_avx_algs, + ARRAY_SIZE(sha256_avx_algs)); + return 0; +} -#ifdef CONFIG_AS_AVX - /* allow AVX to override SSSE3, it's a little faster */ - if (avx_usable()) { -#ifdef CONFIG_AS_AVX2 - if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) - sha256_transform_asm = sha256_transform_rorx; - else +static void unregister_sha256_avx(void) +{ + if (avx_usable()) + crypto_unregister_shashes(sha256_avx_algs, + ARRAY_SIZE(sha256_avx_algs)); +} + +#else +static inline int register_sha256_avx(void) { return 0; } +static inline void unregister_sha256_avx(void) { } #endif - sha256_transform_asm = sha256_transform_avx; + +#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX) +asmlinkage void sha256_transform_rorx(u32 *digest, const char *data, + u64 rounds); + +static int sha256_avx2_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha256_update(desc, data, len, sha256_transform_rorx); +} + +static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_finup(desc, data, len, out, sha256_transform_rorx); +} + +static int sha256_avx2_final(struct shash_desc *desc, u8 *out) +{ + return sha256_avx2_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha256_avx2_algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = sha256_avx2_update, + .final = sha256_avx2_final, + .finup = sha256_avx2_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, } -#endif +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = sha256_avx2_update, + .final = sha256_avx2_final, + .finup = sha256_avx2_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; - if (sha256_transform_asm) { -#ifdef CONFIG_AS_AVX - if (sha256_transform_asm == sha256_transform_avx) - pr_info("Using AVX optimized SHA-256 implementation\n"); -#ifdef CONFIG_AS_AVX2 - else if (sha256_transform_asm == sha256_transform_rorx) - pr_info("Using AVX2 optimized SHA-256 implementation\n"); +static bool avx2_usable(void) +{ + if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) + return true; + + return false; +} + +static int register_sha256_avx2(void) +{ + if (avx2_usable()) + return crypto_register_shashes(sha256_avx2_algs, + ARRAY_SIZE(sha256_avx2_algs)); + return 0; +} + +static void unregister_sha256_avx2(void) +{ + if (avx2_usable()) + crypto_unregister_shashes(sha256_avx2_algs, + ARRAY_SIZE(sha256_avx2_algs)); +} + +#else +static inline int register_sha256_avx2(void) { return 0; } +static inline void unregister_sha256_avx2(void) { } #endif - else + +#ifdef CONFIG_AS_SHA256_NI +asmlinkage void sha256_ni_transform(u32 *digest, const char *data, + u64 rounds); /*unsigned int rounds);*/ + +static int sha256_ni_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha256_update(desc, data, len, sha256_ni_transform); +} + +static int sha256_ni_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_finup(desc, data, len, out, sha256_ni_transform); +} + +static int sha256_ni_final(struct shash_desc *desc, u8 *out) +{ + return sha256_ni_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha256_ni_algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = sha256_ni_update, + .final = sha256_ni_final, + .finup = sha256_ni_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-ni", + .cra_priority = 250, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = sha256_ni_update, + .final = sha256_ni_final, + .finup = sha256_ni_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-ni", + .cra_priority = 250, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static int register_sha256_ni(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) + return crypto_register_shashes(sha256_ni_algs, + ARRAY_SIZE(sha256_ni_algs)); + return 0; +} + +static void unregister_sha256_ni(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) + crypto_unregister_shashes(sha256_ni_algs, + ARRAY_SIZE(sha256_ni_algs)); +} + +#else +static inline int register_sha256_ni(void) { return 0; } +static inline void unregister_sha256_ni(void) { } #endif - pr_info("Using SSSE3 optimized SHA-256 implementation\n"); - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); + +static int __init sha256_ssse3_mod_init(void) +{ + if (register_sha256_ssse3()) + goto fail; + + if (register_sha256_avx()) { + unregister_sha256_ssse3(); + goto fail; } - pr_info("Neither AVX nor SSSE3 is available/usable.\n"); + if (register_sha256_avx2()) { + unregister_sha256_avx(); + unregister_sha256_ssse3(); + goto fail; + } + + if (register_sha256_ni()) { + unregister_sha256_avx2(); + unregister_sha256_avx(); + unregister_sha256_ssse3(); + goto fail; + } + + return 0; +fail: return -ENODEV; } static void __exit sha256_ssse3_mod_fini(void) { - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); + unregister_sha256_ni(); + unregister_sha256_avx2(); + unregister_sha256_avx(); + unregister_sha256_ssse3(); } module_init(sha256_ssse3_mod_init); diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 2edad7b81870..34e5083d6f36 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -41,19 +41,11 @@ asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data, u64 rounds); -#ifdef CONFIG_AS_AVX -asmlinkage void sha512_transform_avx(u64 *digest, const char *data, - u64 rounds); -#endif -#ifdef CONFIG_AS_AVX2 -asmlinkage void sha512_transform_rorx(u64 *digest, const char *data, - u64 rounds); -#endif -static void (*sha512_transform_asm)(u64 *, const char *, u64); +typedef void (sha512_transform_fn)(u64 *digest, const char *data, u64 rounds); -static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha512_update(struct shash_desc *desc, const u8 *data, + unsigned int len, sha512_transform_fn *sha512_xform) { struct sha512_state *sctx = shash_desc_ctx(desc); @@ -66,14 +58,14 @@ static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); sha512_base_do_update(desc, data, len, - (sha512_block_fn *)sha512_transform_asm); + (sha512_block_fn *)sha512_xform); kernel_fpu_end(); return 0; } -static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int sha512_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out, sha512_transform_fn *sha512_xform) { if (!irq_fpu_usable()) return crypto_sha512_finup(desc, data, len, out); @@ -81,20 +73,32 @@ static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); if (len) sha512_base_do_update(desc, data, len, - (sha512_block_fn *)sha512_transform_asm); - sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_transform_asm); + (sha512_block_fn *)sha512_xform); + sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_xform); kernel_fpu_end(); return sha512_base_finish(desc, out); } +static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha512_update(desc, data, len, sha512_transform_ssse3); +} + +static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha512_finup(desc, data, len, out, sha512_transform_ssse3); +} + /* Add padding and return the message digest. */ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) { return sha512_ssse3_finup(desc, NULL, 0, out); } -static struct shash_alg algs[] = { { +static struct shash_alg sha512_ssse3_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_ssse3_update, @@ -126,10 +130,27 @@ static struct shash_alg algs[] = { { } } }; +static int register_sha512_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + return crypto_register_shashes(sha512_ssse3_algs, + ARRAY_SIZE(sha512_ssse3_algs)); + return 0; +} + +static void unregister_sha512_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_shashes(sha512_ssse3_algs, + ARRAY_SIZE(sha512_ssse3_algs)); +} + #ifdef CONFIG_AS_AVX -static bool __init avx_usable(void) +asmlinkage void sha512_transform_avx(u64 *digest, const char *data, + u64 rounds); +static bool avx_usable(void) { - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { if (cpu_has_avx) pr_info("AVX detected but unusable.\n"); return false; @@ -137,47 +158,185 @@ static bool __init avx_usable(void) return true; } -#endif -static int __init sha512_ssse3_mod_init(void) +static int sha512_avx_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - /* test for SSSE3 first */ - if (cpu_has_ssse3) - sha512_transform_asm = sha512_transform_ssse3; + return sha512_update(desc, data, len, sha512_transform_avx); +} -#ifdef CONFIG_AS_AVX - /* allow AVX to override SSSE3, it's a little faster */ - if (avx_usable()) { -#ifdef CONFIG_AS_AVX2 - if (boot_cpu_has(X86_FEATURE_AVX2)) - sha512_transform_asm = sha512_transform_rorx; - else -#endif - sha512_transform_asm = sha512_transform_avx; +static int sha512_avx_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha512_finup(desc, data, len, out, sha512_transform_avx); +} + +/* Add padding and return the message digest. */ +static int sha512_avx_final(struct shash_desc *desc, u8 *out) +{ + return sha512_avx_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha512_avx_algs[] = { { + .digestsize = SHA512_DIGEST_SIZE, + .init = sha512_base_init, + .update = sha512_avx_update, + .final = sha512_avx_final, + .finup = sha512_avx_finup, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha512", + .cra_driver_name = "sha512-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA512_BLOCK_SIZE, + .cra_module = THIS_MODULE, } -#endif +}, { + .digestsize = SHA384_DIGEST_SIZE, + .init = sha384_base_init, + .update = sha512_avx_update, + .final = sha512_avx_final, + .finup = sha512_avx_finup, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha384", + .cra_driver_name = "sha384-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA384_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; - if (sha512_transform_asm) { -#ifdef CONFIG_AS_AVX - if (sha512_transform_asm == sha512_transform_avx) - pr_info("Using AVX optimized SHA-512 implementation\n"); -#ifdef CONFIG_AS_AVX2 - else if (sha512_transform_asm == sha512_transform_rorx) - pr_info("Using AVX2 optimized SHA-512 implementation\n"); +static int register_sha512_avx(void) +{ + if (avx_usable()) + return crypto_register_shashes(sha512_avx_algs, + ARRAY_SIZE(sha512_avx_algs)); + return 0; +} + +static void unregister_sha512_avx(void) +{ + if (avx_usable()) + crypto_unregister_shashes(sha512_avx_algs, + ARRAY_SIZE(sha512_avx_algs)); +} +#else +static inline int register_sha512_avx(void) { return 0; } +static inline void unregister_sha512_avx(void) { } #endif - else + +#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX) +asmlinkage void sha512_transform_rorx(u64 *digest, const char *data, + u64 rounds); + +static int sha512_avx2_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha512_update(desc, data, len, sha512_transform_rorx); +} + +static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha512_finup(desc, data, len, out, sha512_transform_rorx); +} + +/* Add padding and return the message digest. */ +static int sha512_avx2_final(struct shash_desc *desc, u8 *out) +{ + return sha512_avx2_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha512_avx2_algs[] = { { + .digestsize = SHA512_DIGEST_SIZE, + .init = sha512_base_init, + .update = sha512_avx2_update, + .final = sha512_avx2_final, + .finup = sha512_avx2_finup, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha512", + .cra_driver_name = "sha512-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA512_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA384_DIGEST_SIZE, + .init = sha384_base_init, + .update = sha512_avx2_update, + .final = sha512_avx2_final, + .finup = sha512_avx2_finup, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha384", + .cra_driver_name = "sha384-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA384_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static bool avx2_usable(void) +{ + if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) + return true; + + return false; +} + +static int register_sha512_avx2(void) +{ + if (avx2_usable()) + return crypto_register_shashes(sha512_avx2_algs, + ARRAY_SIZE(sha512_avx2_algs)); + return 0; +} + +static void unregister_sha512_avx2(void) +{ + if (avx2_usable()) + crypto_unregister_shashes(sha512_avx2_algs, + ARRAY_SIZE(sha512_avx2_algs)); +} +#else +static inline int register_sha512_avx2(void) { return 0; } +static inline void unregister_sha512_avx2(void) { } #endif - pr_info("Using SSSE3 optimized SHA-512 implementation\n"); - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); + +static int __init sha512_ssse3_mod_init(void) +{ + + if (register_sha512_ssse3()) + goto fail; + + if (register_sha512_avx()) { + unregister_sha512_ssse3(); + goto fail; } - pr_info("Neither AVX nor SSSE3 is available/usable.\n"); + if (register_sha512_avx2()) { + unregister_sha512_avx(); + unregister_sha512_ssse3(); + goto fail; + } + + return 0; +fail: return -ENODEV; } static void __exit sha512_ssse3_mod_fini(void) { - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); + unregister_sha512_avx2(); + unregister_sha512_avx(); + unregister_sha512_ssse3(); } module_init(sha512_ssse3_mod_init); diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index c2bd0ce718ee..b7a3904b953c 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -558,7 +558,7 @@ static int __init twofish_init(void) { const char *feature_name; - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 3c71dd947c7b..e32206e09868 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -1,3 +1,5 @@ +#include <linux/jump_label.h> + /* x86 function call convention, 64-bit: @@ -232,3 +234,16 @@ For 32-bit we have the following conventions - kernel is built with #endif /* CONFIG_X86_64 */ +/* + * This does 'call enter_from_user_mode' unless we can avoid it based on + * kernel config or using the static jump infrastructure. + */ +.macro CALL_enter_from_user_mode +#ifdef CONFIG_CONTEXT_TRACKING +#ifdef HAVE_JUMP_LABEL + STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0 +#endif + call enter_from_user_mode +.Lafter_call_\@: +#endif +.endm diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 80dcc9261ca3..03663740c866 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -24,10 +24,19 @@ #include <asm/desc.h> #include <asm/traps.h> +#include <asm/vdso.h> +#include <asm/uaccess.h> #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> +static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) +{ + unsigned long top_of_stack = + (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING; + return (struct thread_info *)(top_of_stack - THREAD_SIZE); +} + #ifdef CONFIG_CONTEXT_TRACKING /* Called on entry from user mode with IRQs off. */ __visible void enter_from_user_mode(void) @@ -66,13 +75,14 @@ static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) */ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) { + struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned long ret = 0; u32 work; - BUG_ON(regs != task_pt_regs(current)); + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + BUG_ON(regs != task_pt_regs(current)); - work = ACCESS_ONCE(current_thread_info()->flags) & - _TIF_WORK_SYSCALL_ENTRY; + work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; #ifdef CONFIG_CONTEXT_TRACKING /* @@ -154,11 +164,12 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, unsigned long phase1_result) { + struct thread_info *ti = pt_regs_to_thread_info(regs); long ret = 0; - u32 work = ACCESS_ONCE(current_thread_info()->flags) & - _TIF_WORK_SYSCALL_ENTRY; + u32 work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; - BUG_ON(regs != task_pt_regs(current)); + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + BUG_ON(regs != task_pt_regs(current)); /* * If we stepped into a sysenter/syscall insn, it trapped in @@ -207,19 +218,12 @@ long syscall_trace_enter(struct pt_regs *regs) return syscall_trace_enter_phase2(regs, arch, phase1_result); } -static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) -{ - unsigned long top_of_stack = - (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING; - return (struct thread_info *)(top_of_stack - THREAD_SIZE); -} +#define EXIT_TO_USERMODE_LOOP_FLAGS \ + (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY) -/* Called with IRQs disabled. */ -__visible void prepare_exit_to_usermode(struct pt_regs *regs) +static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) { - if (WARN_ON(!irqs_disabled())) - local_irq_disable(); - /* * In order to return to user mode, we need to have IRQs off with * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY, @@ -229,14 +233,6 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs) * work to clear some of the flags can sleep. */ while (true) { - u32 cached_flags = - READ_ONCE(pt_regs_to_thread_info(regs)->flags); - - if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | - _TIF_UPROBE | _TIF_NEED_RESCHED | - _TIF_USER_RETURN_NOTIFY))) - break; - /* We have work to do. */ local_irq_enable(); @@ -260,50 +256,81 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs) /* Disable IRQs and retry */ local_irq_disable(); + + cached_flags = READ_ONCE(pt_regs_to_thread_info(regs)->flags); + + if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) + break; + } +} + +/* Called with IRQs disabled. */ +__visible inline void prepare_exit_to_usermode(struct pt_regs *regs) +{ + u32 cached_flags; + + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) + local_irq_disable(); + + lockdep_sys_exit(); + + cached_flags = + READ_ONCE(pt_regs_to_thread_info(regs)->flags); + + if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) + exit_to_usermode_loop(regs, cached_flags); user_enter(); } +#define SYSCALL_EXIT_WORK_FLAGS \ + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) + +static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags) +{ + bool step; + + audit_syscall_exit(regs); + + if (cached_flags & _TIF_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, regs->ax); + + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). + * We already reported this syscall instruction in + * syscall_trace_enter(). + */ + step = unlikely( + (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) + == _TIF_SINGLESTEP); + if (step || cached_flags & _TIF_SYSCALL_TRACE) + tracehook_report_syscall_exit(regs, step); +} + /* * Called with IRQs on and fully valid regs. Returns with IRQs off in a * state such that we can immediately switch to user mode. */ -__visible void syscall_return_slowpath(struct pt_regs *regs) +__visible inline void syscall_return_slowpath(struct pt_regs *regs) { struct thread_info *ti = pt_regs_to_thread_info(regs); u32 cached_flags = READ_ONCE(ti->flags); - bool step; CT_WARN_ON(ct_state() != CONTEXT_KERNEL); - if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled", - regs->orig_ax)) + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && + WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax)) local_irq_enable(); /* * First do one-time work. If these work items are enabled, we * want to run them exactly once per syscall exit with IRQs on. */ - if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | - _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) { - audit_syscall_exit(regs); - - if (cached_flags & _TIF_SYSCALL_TRACEPOINT) - trace_sys_exit(regs, regs->ax); - - /* - * If TIF_SYSCALL_EMU is set, we only get here because of - * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). - * We already reported this syscall instruction in - * syscall_trace_enter(). - */ - step = unlikely( - (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) - == _TIF_SINGLESTEP); - if (step || cached_flags & _TIF_SYSCALL_TRACE) - tracehook_report_syscall_exit(regs, step); - } + if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) + syscall_slow_exit_work(regs, cached_flags); #ifdef CONFIG_COMPAT /* @@ -316,3 +343,144 @@ __visible void syscall_return_slowpath(struct pt_regs *regs) local_irq_disable(); prepare_exit_to_usermode(regs); } + +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +/* + * Does a 32-bit syscall. Called with IRQs on and does all entry and + * exit work and returns with IRQs off. This function is extremely hot + * in workloads that use it, and it's usually called from + * do_fast_syscall_32, so forcibly inline it to improve performance. + */ +#ifdef CONFIG_X86_32 +/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */ +__visible +#else +/* 64-bit kernels use do_syscall_32_irqs_off() instead. */ +static +#endif +__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) +{ + struct thread_info *ti = pt_regs_to_thread_info(regs); + unsigned int nr = (unsigned int)regs->orig_ax; + +#ifdef CONFIG_IA32_EMULATION + ti->status |= TS_COMPAT; +#endif + + if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) { + /* + * Subtlety here: if ptrace pokes something larger than + * 2^32-1 into orig_ax, this truncates it. This may or + * may not be necessary, but it matches the old asm + * behavior. + */ + nr = syscall_trace_enter(regs); + } + + if (likely(nr < IA32_NR_syscalls)) { + /* + * It's possible that a 32-bit syscall implementation + * takes a 64-bit parameter but nonetheless assumes that + * the high bits are zero. Make sure we zero-extend all + * of the args. + */ + regs->ax = ia32_sys_call_table[nr]( + (unsigned int)regs->bx, (unsigned int)regs->cx, + (unsigned int)regs->dx, (unsigned int)regs->si, + (unsigned int)regs->di, (unsigned int)regs->bp); + } + + syscall_return_slowpath(regs); +} + +#ifdef CONFIG_X86_64 +/* Handles INT80 on 64-bit kernels */ +__visible void do_syscall_32_irqs_off(struct pt_regs *regs) +{ + local_irq_enable(); + do_syscall_32_irqs_on(regs); +} +#endif + +/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ +__visible long do_fast_syscall_32(struct pt_regs *regs) +{ + /* + * Called using the internal vDSO SYSENTER/SYSCALL32 calling + * convention. Adjust regs so it looks like we entered using int80. + */ + + unsigned long landing_pad = (unsigned long)current->mm->context.vdso + + vdso_image_32.sym_int80_landing_pad; + + /* + * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward + * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. + * Fix it up. + */ + regs->ip = landing_pad; + + /* + * Fetch EBP from where the vDSO stashed it. + * + * WARNING: We are in CONTEXT_USER and RCU isn't paying attention! + */ + local_irq_enable(); + if ( +#ifdef CONFIG_X86_64 + /* + * Micro-optimization: the pointer we're following is explicitly + * 32 bits, so it can't be out of range. + */ + __get_user(*(u32 *)®s->bp, + (u32 __user __force *)(unsigned long)(u32)regs->sp) +#else + get_user(*(u32 *)®s->bp, + (u32 __user __force *)(unsigned long)(u32)regs->sp) +#endif + ) { + + /* User code screwed up. */ + local_irq_disable(); + regs->ax = -EFAULT; +#ifdef CONFIG_CONTEXT_TRACKING + enter_from_user_mode(); +#endif + prepare_exit_to_usermode(regs); + return 0; /* Keep it simple: use IRET. */ + } + + /* Now this is just like a normal syscall. */ + do_syscall_32_irqs_on(regs); + +#ifdef CONFIG_X86_64 + /* + * Opportunistic SYSRETL: if possible, try to return using SYSRETL. + * SYSRETL is available on all 64-bit CPUs, so we don't need to + * bother with SYSEXIT. + * + * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, + * because the ECX fixup above will ensure that this is essentially + * never the case. + */ + return regs->cs == __USER32_CS && regs->ss == __USER_DS && + regs->ip == landing_pad && + (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0; +#else + /* + * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT. + * + * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, + * because the ECX fixup above will ensure that this is essentially + * never the case. + * + * We don't allow syscalls at all from VM86 mode, but we still + * need to check VM, because we might be returning from sys_vm86. + */ + return static_cpu_has(X86_FEATURE_SEP) && + regs->cs == __USER_CS && regs->ss == __USER_DS && + regs->ip == landing_pad && + (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0; +#endif +} +#endif diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b2909bf8cf70..77d8c5112900 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -3,7 +3,7 @@ * * entry_32.S contains the system-call and low-level fault and trap handling routines. * - * Stack layout in 'syscall_exit': + * Stack layout while running C code: * ptrace needs to have all registers on the stack. * If the order here is changed, it needs to be * updated in fork.c:copy_process(), signal.c:do_signal(), @@ -153,13 +153,13 @@ #endif /* CONFIG_X86_32_LAZY_GS */ -.macro SAVE_ALL +.macro SAVE_ALL pt_regs_ax=%eax cld PUSH_GS pushl %fs pushl %es pushl %ds - pushl %eax + pushl \pt_regs_ax pushl %ebp pushl %edi pushl %esi @@ -211,7 +211,11 @@ ENTRY(ret_from_fork) popl %eax pushl $0x0202 # Reset kernel eflags popfl - jmp syscall_exit + + /* When we fork, we trace the syscall return in the child, too. */ + movl %esp, %eax + call syscall_return_slowpath + jmp restore_all END(ret_from_fork) ENTRY(ret_from_kernel_thread) @@ -224,7 +228,15 @@ ENTRY(ret_from_kernel_thread) movl PT_EBP(%esp), %eax call *PT_EBX(%esp) movl $0, PT_EAX(%esp) - jmp syscall_exit + + /* + * Kernel threads return to userspace as if returning from a syscall. + * We should check whether anything actually uses this path and, if so, + * consider switching it over to ret_from_fork. + */ + movl %esp, %eax + call syscall_return_slowpath + jmp restore_all ENDPROC(ret_from_kernel_thread) /* @@ -255,7 +267,6 @@ ret_from_intr: jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) - LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl %esp, %eax @@ -276,77 +287,50 @@ need_resched: END(resume_kernel) #endif -/* - * SYSENTER_RETURN points to after the SYSENTER instruction - * in the vsyscall page. See vsyscall-sysentry.S, which defines - * the symbol. - */ - # SYSENTER call handler stub ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp), %esp sysenter_past_esp: + pushl $__USER_DS /* pt_regs->ss */ + pushl %ebp /* pt_regs->sp (stashed in bp) */ + pushfl /* pt_regs->flags (except IF = 0) */ + orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ + pushl $__USER_CS /* pt_regs->cs */ + pushl $0 /* pt_regs->ip = 0 (placeholder) */ + pushl %eax /* pt_regs->orig_ax */ + SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ + /* - * Interrupts are disabled here, but we can't trace it until - * enough kernel state to call TRACE_IRQS_OFF can be called - but - * we immediately enable interrupts at that point anyway. - */ - pushl $__USER_DS - pushl %ebp - pushfl - orl $X86_EFLAGS_IF, (%esp) - pushl $__USER_CS - /* - * Push current_thread_info()->sysenter_return to the stack. - * A tiny bit of offset fixup is necessary: TI_sysenter_return - * is relative to thread_info, which is at the bottom of the - * kernel stack page. 4*4 means the 4 words pushed above; - * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; - * and THREAD_SIZE takes us to the bottom. + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. */ - pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) - - pushl %eax - SAVE_ALL - ENABLE_INTERRUPTS(CLBR_NONE) - -/* - * Load the potential sixth argument from user stack. - * Careful about security. - */ - cmpl $__PAGE_OFFSET-3, %ebp - jae syscall_fault - ASM_STAC -1: movl (%ebp), %ebp - ASM_CLAC - movl %ebp, PT_EBP(%esp) - _ASM_EXTABLE(1b, syscall_fault) - - GET_THREAD_INFO(%ebp) - - testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) - jnz syscall_trace_entry -sysenter_do_call: - cmpl $(NR_syscalls), %eax - jae sysenter_badsys - call *sys_call_table(, %eax, 4) -sysenter_after_call: - movl %eax, PT_EAX(%esp) - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx - jnz syscall_exit_work_irqs_off -sysenter_exit: -/* if something modifies registers it must also disable sysexit */ - movl PT_EIP(%esp), %edx - movl PT_OLDESP(%esp), %ecx - xorl %ebp, %ebp - TRACE_IRQS_ON + + movl %esp, %eax + call do_fast_syscall_32 + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV + +/* Opportunistic SYSEXIT */ + TRACE_IRQS_ON /* User mode traces as IRQs on. */ + movl PT_EIP(%esp), %edx /* pt_regs->ip */ + movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */ 1: mov PT_FS(%esp), %fs PTGS_TO_GS - ENABLE_INTERRUPTS_SYSEXIT + popl %ebx /* pt_regs->bx */ + addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */ + popl %esi /* pt_regs->si */ + popl %edi /* pt_regs->di */ + popl %ebp /* pt_regs->bp */ + popl %eax /* pt_regs->ax */ + + /* + * Return back to the vDSO, which will pop ecx and edx. + * Don't bother with DS and ES (they already contain __USER_DS). + */ + sti + sysexit .pushsection .fixup, "ax" 2: movl $0, PT_FS(%esp) @@ -359,21 +343,18 @@ ENDPROC(entry_SYSENTER_32) # system call handler stub ENTRY(entry_INT80_32) ASM_CLAC - pushl %eax # save orig_eax - SAVE_ALL - GET_THREAD_INFO(%ebp) - # system call tracing in operation / emulation - testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) - jnz syscall_trace_entry - cmpl $(NR_syscalls), %eax - jae syscall_badsys -syscall_call: - call *sys_call_table(, %eax, 4) -syscall_after_call: - movl %eax, PT_EAX(%esp) # store the return value -syscall_exit: - LOCKDEP_SYS_EXIT - jmp syscall_exit_work + pushl %eax /* pt_regs->orig_ax */ + SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ + + /* + * User mode is traced as though IRQs are on. Unlike the 64-bit + * case, INT80 is a trap gate on 32-bit kernels, so interrupts + * are already on (unless user code is messing around with iopl). + */ + + movl %esp, %eax + call do_syscall_32_irqs_on +.Lsyscall_32_done: restore_all: TRACE_IRQS_IRET @@ -450,47 +431,6 @@ ldt_ss: #endif ENDPROC(entry_INT80_32) - # perform syscall exit tracing - ALIGN -syscall_trace_entry: - movl $-ENOSYS, PT_EAX(%esp) - movl %esp, %eax - call syscall_trace_enter - /* What it returned is what we'll actually use. */ - cmpl $(NR_syscalls), %eax - jnae syscall_call - jmp syscall_exit -END(syscall_trace_entry) - - # perform syscall exit tracing - ALIGN -syscall_exit_work_irqs_off: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) - -syscall_exit_work: - movl %esp, %eax - call syscall_return_slowpath - jmp restore_all -END(syscall_exit_work) - -syscall_fault: - ASM_CLAC - GET_THREAD_INFO(%ebp) - movl $-EFAULT, PT_EAX(%esp) - jmp resume_userspace -END(syscall_fault) - -syscall_badsys: - movl $-ENOSYS, %eax - jmp syscall_after_call -END(syscall_badsys) - -sysenter_badsys: - movl $-ENOSYS, %eax - jmp sysenter_after_call -END(sysenter_badsys) - .macro FIXUP_ESPFIX_STACK /* * Switch back for ESPFIX stack to the normal zerobased stack @@ -613,11 +553,6 @@ ENTRY(native_iret) iret _ASM_EXTABLE(native_iret, iret_exc) END(native_iret) - -ENTRY(native_irq_enable_sysexit) - sti - sysexit -END(native_irq_enable_sysexit) #endif ENTRY(overflow) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 055a01de7c8d..9d34d3cfceb6 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -391,20 +391,16 @@ GLOBAL(stub_execveat) jmp return_from_execve END(stub_execveat) -#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) +#if defined(CONFIG_X86_X32_ABI) .align 8 GLOBAL(stub_x32_execve) -GLOBAL(stub32_execve) call compat_sys_execve jmp return_from_execve -END(stub32_execve) END(stub_x32_execve) .align 8 GLOBAL(stub_x32_execveat) -GLOBAL(stub32_execveat) call compat_sys_execveat jmp return_from_execve -END(stub32_execveat) END(stub_x32_execveat) #endif @@ -513,9 +509,18 @@ END(irq_entries_start) * tracking that we're in kernel mode. */ SWAPGS -#ifdef CONFIG_CONTEXT_TRACKING - call enter_from_user_mode -#endif + + /* + * We need to tell lockdep that IRQs are off. We can't do this until + * we fix gsbase, and we should do it before enter_from_user_mode + * (which can take locks). Since TRACE_IRQS_OFF idempotent, + * the simplest way to handle it is to just call it twice if + * we enter from user mode. There's no reason to optimize this since + * TRACE_IRQS_OFF is a no-op if lockdep is off. + */ + TRACE_IRQS_OFF + + CALL_enter_from_user_mode 1: /* @@ -557,7 +562,6 @@ ret_from_intr: jz retint_kernel /* Interrupt came from user space */ - LOCKDEP_SYS_EXIT_IRQ GLOBAL(retint_user) mov %rsp,%rdi call prepare_exit_to_usermode @@ -587,7 +591,7 @@ retint_kernel: * At this label, code paths which return to kernel and to user, * which come from interrupts/exception and from syscalls, merge. */ -restore_regs_and_iret: +GLOBAL(restore_regs_and_iret) RESTORE_EXTRA_REGS restore_c_regs_and_iret: RESTORE_C_REGS @@ -1054,12 +1058,16 @@ ENTRY(error_entry) SWAPGS .Lerror_entry_from_usermode_after_swapgs: -#ifdef CONFIG_CONTEXT_TRACKING - call enter_from_user_mode -#endif + /* + * We need to tell lockdep that IRQs are off. We can't do this until + * we fix gsbase, and we should do it before enter_from_user_mode + * (which can take locks). + */ + TRACE_IRQS_OFF + CALL_enter_from_user_mode + ret .Lerror_entry_done: - TRACE_IRQS_OFF ret diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index a9360d40fb7f..ff1c6d61f332 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -16,25 +16,8 @@ #include <linux/linkage.h> #include <linux/err.h> -/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ -#include <linux/elf-em.h> -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -# define sysexit_audit ia32_ret_from_sys_call_irqs_off -# define sysretl_audit ia32_ret_from_sys_call_irqs_off -#endif - .section .entry.text, "ax" -#ifdef CONFIG_PARAVIRT -ENTRY(native_usergs_sysret32) - swapgs - sysretl -ENDPROC(native_usergs_sysret32) -#endif - /* * 32-bit SYSENTER instruction entry. * @@ -58,219 +41,88 @@ ENDPROC(native_usergs_sysret32) * with the int 0x80 path. */ ENTRY(entry_SYSENTER_compat) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ + /* Interrupts are off on entry. */ SWAPGS_UNSAFE_STACK movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - ENABLE_INTERRUPTS(CLBR_NONE) - /* Zero-extending 32-bit regs, do not remove */ - movl %ebp, %ebp + /* + * User tracing code (ptrace or signal handlers) might assume that + * the saved RAX contains a 32-bit number when we're invoking a 32-bit + * syscall. Just in case the high bits are nonzero, zero-extend + * the syscall number. (This could almost certainly be deleted + * with no ill effects.) + */ movl %eax, %eax - movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d - /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ - pushq %rbp /* pt_regs->sp */ - pushfq /* pt_regs->flags */ + pushq %rbp /* pt_regs->sp (stashed in bp) */ + + /* + * Push flags. This is nasty. First, interrupts are currently + * off, but we need pt_regs->flags to have IF set. Second, even + * if TF was set when SYSENTER started, it's clear by now. We fix + * that later using TIF_SINGLESTEP. + */ + pushfq /* pt_regs->flags (except IF = 0) */ + orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ + ASM_CLAC /* Clear AC after saving FLAGS */ + pushq $__USER32_CS /* pt_regs->cs */ - pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ + xorq %r8,%r8 + pushq %r8 /* pt_regs->ip = 0 (placeholder) */ pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 = 0 */ + pushq %r8 /* pt_regs->r9 = 0 */ + pushq %r8 /* pt_regs->r10 = 0 */ + pushq %r8 /* pt_regs->r11 = 0 */ + pushq %rbx /* pt_regs->rbx */ + pushq %rbp /* pt_regs->rbp (will be overwritten) */ + pushq %r8 /* pt_regs->r12 = 0 */ + pushq %r8 /* pt_regs->r13 = 0 */ + pushq %r8 /* pt_regs->r14 = 0 */ + pushq %r8 /* pt_regs->r15 = 0 */ cld - sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ - - /* - * no need to do an access_ok check here because rbp has been - * 32-bit zero extended - */ - ASM_STAC -1: movl (%rbp), %ebp - _ASM_EXTABLE(1b, ia32_badarg) - ASM_CLAC /* * Sysenter doesn't filter flags, so we need to clear NT * ourselves. To save a few cycles, we can check whether * NT was set instead of doing an unconditional popfq. - */ - testl $X86_EFLAGS_NT, EFLAGS(%rsp) - jnz sysenter_fix_flags -sysenter_flags_fixed: - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysenter_tracesys - -sysenter_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ -sysenter_dispatch: - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - call *ia32_sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysexit_audit -sysexit_from_sys_call: - /* - * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an - * NMI between STI and SYSEXIT has poorly specified behavior, - * and and NMI followed by an IRQ with usergs is fatal. So - * we just pretend we're using SYSEXIT but we really use - * SYSRETL instead. - * - * This code path is still called 'sysexit' because it pairs - * with 'sysenter' and it uses the SYSENTER calling convention. - */ - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RIP(%rsp), %ecx /* User %eip */ - movq RAX(%rsp), %rax - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - xorl %edx, %edx /* Do not leak kernel information */ - xorq %r8, %r8 - xorq %r9, %r9 - xorq %r10, %r10 - movl EFLAGS(%rsp), %r11d /* User eflags */ - TRACE_IRQS_ON - - /* - * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, - * since it avoids a dicey window with interrupts enabled. - */ - movl RSP(%rsp), %esp - - /* - * USERGS_SYSRET32 does: - * gsbase = user's gs base - * eip = ecx - * rflags = r11 - * cs = __USER32_CS - * ss = __USER_DS - * - * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: - * - * pop %ebp - * pop %edx - * pop %ecx - * - * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to - * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's - * address (already known to user code), and R12-R15 are - * callee-saved and therefore don't contain any interesting - * kernel data. - */ - USERGS_SYSRET32 - -#ifdef CONFIG_AUDITSYSCALL - .macro auditsys_entry_common - /* - * At this point, registers hold syscall args in the 32-bit syscall ABI: - * EAX is syscall number, the 6 args are in EBX,ECX,EDX,ESI,EDI,EBP. + * This needs to happen before enabling interrupts so that + * we don't get preempted with NT set. * - * We want to pass them to __audit_syscall_entry(), which is a 64-bit - * C function with 5 parameters, so shuffle them to match what - * the function expects: RDI,RSI,RDX,RCX,R8. + * NB.: .Lsysenter_fix_flags is a label with the code under it moved + * out-of-line as an optimization: NT is unlikely to be set in the + * majority of the cases and instead of polluting the I$ unnecessarily, + * we're keeping that code behind a branch which will predict as + * not-taken and therefore its instructions won't be fetched. */ - movl %esi, %r8d /* arg5 (R8 ) <= 4th syscall arg (ESI) */ - xchg %ecx, %edx /* arg4 (RCX) <= 3rd syscall arg (EDX) */ - /* arg3 (RDX) <= 2nd syscall arg (ECX) */ - movl %ebx, %esi /* arg2 (RSI) <= 1st syscall arg (EBX) */ - movl %eax, %edi /* arg1 (RDI) <= syscall number (EAX) */ - call __audit_syscall_entry + testl $X86_EFLAGS_NT, EFLAGS(%rsp) + jnz .Lsysenter_fix_flags +.Lsysenter_flags_fixed: /* - * We are going to jump back to the syscall dispatch code. - * Prepare syscall args as required by the 64-bit C ABI. - * Registers clobbered by __audit_syscall_entry() are - * loaded from pt_regs on stack: + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. */ - movl ORIG_RAX(%rsp), %eax /* syscall number */ - movl %ebx, %edi /* arg1 */ - movl RCX(%rsp), %esi /* arg2 */ - movl RDX(%rsp), %edx /* arg3 */ - movl RSI(%rsp), %ecx /* arg4 */ - movl RDI(%rsp), %r8d /* arg5 */ - .endm - - .macro auditsys_exit exit - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call - movl %eax, %esi /* second arg, syscall return value */ - cmpl $-MAX_ERRNO, %eax /* is it an error ? */ - jbe 1f - movslq %eax, %rsi /* if error sign extend to 64 bits */ -1: setbe %al /* 1 if error, 0 if not */ - movzbl %al, %edi /* zero-extend that into %edi */ - call __audit_syscall_exit - movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi - DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz \exit - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - jmp int_ret_from_sys_call_irqs_off - .endm - -sysenter_auditsys: - auditsys_entry_common - movl %ebp, %r9d /* reload 6th syscall arg */ - jmp sysenter_dispatch -sysexit_audit: - auditsys_exit sysexit_from_sys_call -#endif + movq %rsp, %rdi + call do_fast_syscall_32 + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV + jmp sysret32_from_system_call -sysenter_fix_flags: - pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) +.Lsysenter_fix_flags: + pushq $X86_EFLAGS_FIXED popfq - jmp sysenter_flags_fixed - -sysenter_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz sysenter_auditsys -#endif - SAVE_EXTRA_REGS - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - - /* Reload arg registers from stack. (see sysenter_tracesys) */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - - RESTORE_EXTRA_REGS - jmp sysenter_do_call + jmp .Lsysenter_flags_fixed ENDPROC(entry_SYSENTER_compat) /* @@ -298,21 +150,14 @@ ENDPROC(entry_SYSENTER_compat) * edi arg5 * esp user stack * 0(%esp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. */ ENTRY(entry_SYSCALL_compat) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ + /* Interrupts are off on entry. */ SWAPGS_UNSAFE_STACK + + /* Stash user ESP and switch to the kernel stack. */ movl %esp, %r8d movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - ENABLE_INTERRUPTS(CLBR_NONE) /* Zero-extending 32-bit regs, do not remove */ movl %eax, %eax @@ -327,162 +172,69 @@ ENTRY(entry_SYSCALL_compat) pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ - pushq %rbp /* pt_regs->cx */ - movl %ebp, %ecx + pushq %rbp /* pt_regs->cx (stashed in bp) */ pushq $-ENOSYS /* pt_regs->ax */ - sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ + xorq %r8,%r8 + pushq %r8 /* pt_regs->r8 = 0 */ + pushq %r8 /* pt_regs->r9 = 0 */ + pushq %r8 /* pt_regs->r10 = 0 */ + pushq %r8 /* pt_regs->r11 = 0 */ + pushq %rbx /* pt_regs->rbx */ + pushq %rbp /* pt_regs->rbp (will be overwritten) */ + pushq %r8 /* pt_regs->r12 = 0 */ + pushq %r8 /* pt_regs->r13 = 0 */ + pushq %r8 /* pt_regs->r14 = 0 */ + pushq %r8 /* pt_regs->r15 = 0 */ /* - * No need to do an access_ok check here because r8 has been - * 32-bit zero extended: + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. */ - ASM_STAC -1: movl (%r8), %r9d - _ASM_EXTABLE(1b, ia32_badarg) - ASM_CLAC - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz cstar_tracesys - -cstar_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - /* r9 already loaded */ /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ - -cstar_dispatch: - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - - call *ia32_sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysretl_audit -sysretl_from_sys_call: - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl RIP(%rsp), %ecx - movl EFLAGS(%rsp), %r11d - movq RAX(%rsp), %rax - xorq %r10, %r10 - xorq %r9, %r9 - xorq %r8, %r8 - TRACE_IRQS_ON - movl RSP(%rsp), %esp - /* - * 64-bit->32-bit SYSRET restores eip from ecx, - * eflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * (Note: 32-bit->32-bit SYSRET is different: since r11 - * does not exist, it merely sets eflags.IF=1). + movq %rsp, %rdi + call do_fast_syscall_32 + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV + + /* Opportunistic SYSRET */ +sysret32_from_system_call: + TRACE_IRQS_ON /* User mode traces as IRQs on. */ + movq RBX(%rsp), %rbx /* pt_regs->rbx */ + movq RBP(%rsp), %rbp /* pt_regs->rbp */ + movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ + movq RIP(%rsp), %rcx /* pt_regs->ip (in rcx) */ + addq $RAX, %rsp /* Skip r8-r15 */ + popq %rax /* pt_regs->rax */ + popq %rdx /* Skip pt_regs->cx */ + popq %rdx /* pt_regs->dx */ + popq %rsi /* pt_regs->si */ + popq %rdi /* pt_regs->di */ + + /* + * USERGS_SYSRET32 does: + * GSBASE = user's GS base + * EIP = ECX + * RFLAGS = R11 + * CS = __USER32_CS + * SS = __USER_DS + * + * ECX will not match pt_regs->cx, but we're returning to a vDSO + * trampoline that will fix up RCX, so this is okay. * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we must - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) We prevent that - * from happening by reloading SS in __switch_to. - */ - USERGS_SYSRET32 - -#ifdef CONFIG_AUDITSYSCALL -cstar_auditsys: - movl %r9d, R9(%rsp) /* register to be clobbered by call */ - auditsys_entry_common - movl R9(%rsp), %r9d /* reload 6th syscall arg */ - jmp cstar_dispatch - -sysretl_audit: - auditsys_exit sysretl_from_sys_call -#endif - -cstar_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz cstar_auditsys -#endif - xchgl %r9d, %ebp - SAVE_EXTRA_REGS - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %r9, R9(%rsp) - movq %rax, R8(%rsp) - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - movl R9(%rsp), %r9d - - /* Reload arg registers from stack. (see sysenter_tracesys) */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - - RESTORE_EXTRA_REGS - xchgl %ebp, %r9d - jmp cstar_do_call + * R12-R15 are callee-saved, so they contain whatever was in them + * when the system call started, which is already known to user + * code. We zero R8-R10 to avoid info leaks. + */ + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 + movq RSP-ORIG_RAX(%rsp), %rsp + swapgs + sysretl END(entry_SYSCALL_compat) -ia32_badarg: - /* - * So far, we've entered kernel mode, set AC, turned on IRQs, and - * saved C regs except r8-r11. We haven't done any of the other - * standard entry work, though. We want to bail, but we shouldn't - * treat this as a syscall entry since we don't even know what the - * args are. Instead, treat this as a non-syscall entry, finish - * the entry work, and immediately exit after setting AX = -EFAULT. - * - * We're really just being polite here. Killing the task outright - * would be a reasonable action, too. Given that the only valid - * way to have gotten here is through the vDSO, and we already know - * that the stack pointer is bad, the task isn't going to survive - * for long no matter what we do. - */ - - ASM_CLAC /* undo STAC */ - movq $-EFAULT, RAX(%rsp) /* return -EFAULT if possible */ - - /* Fill in the rest of pt_regs */ - xorl %eax, %eax - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - SAVE_EXTRA_REGS - - /* Turn IRQs back off. */ - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - - /* Now finish entering normal kernel mode. */ -#ifdef CONFIG_CONTEXT_TRACKING - call enter_from_user_mode -#endif - - /* And exit again. */ - jmp retint_user - -ia32_ret_from_sys_call_irqs_off: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - -ia32_ret_from_sys_call: - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - jmp int_ret_from_sys_call - /* * Emulated IA32 system calls via int 0x80. * @@ -507,14 +259,17 @@ ia32_ret_from_sys_call: ENTRY(entry_INT80_compat) /* * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. */ PARAVIRT_ADJUST_EXCEPTION_FRAME SWAPGS - ENABLE_INTERRUPTS(CLBR_NONE) - /* Zero-extending 32-bit regs, do not remove */ + /* + * User tracing code (ptrace or signal handlers) might assume that + * the saved RAX contains a 32-bit number when we're invoking a 32-bit + * syscall. Just in case the high bits are nonzero, zero-extend + * the syscall number. (This could almost certainly be deleted + * with no ill effects.) + */ movl %eax, %eax /* Construct struct pt_regs on stack (iret frame is already on stack) */ @@ -524,67 +279,37 @@ ENTRY(entry_INT80_compat) pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ - pushq $0 /* pt_regs->r8 */ - pushq $0 /* pt_regs->r9 */ - pushq $0 /* pt_regs->r10 */ - pushq $0 /* pt_regs->r11 */ + xorq %r8,%r8 + pushq %r8 /* pt_regs->r8 = 0 */ + pushq %r8 /* pt_regs->r9 = 0 */ + pushq %r8 /* pt_regs->r10 = 0 */ + pushq %r8 /* pt_regs->r11 = 0 */ + pushq %rbx /* pt_regs->rbx */ + pushq %rbp /* pt_regs->rbp */ + pushq %r12 /* pt_regs->r12 */ + pushq %r13 /* pt_regs->r13 */ + pushq %r14 /* pt_regs->r14 */ + pushq %r15 /* pt_regs->r15 */ cld - sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_tracesys - -ia32_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - call *ia32_sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) -1: - jmp int_ret_from_sys_call - -ia32_tracesys: - SAVE_EXTRA_REGS - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter /* - * Reload arg registers from stack in case ptrace changed them. - * Don't reload %eax because syscall_trace_enter() returned - * the %rax value we should see. But do truncate it to 32 bits. - * If it's -1 to make us punt the syscall, then (u32)-1 is still - * an appropriately invalid value. + * User mode is traced as though IRQs are on, and the interrupt + * gate turned them off. */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - RESTORE_EXTRA_REGS - jmp ia32_do_call -END(entry_INT80_compat) + TRACE_IRQS_OFF - .macro PTREGSCALL label, func - ALIGN -GLOBAL(\label) - leaq \func(%rip), %rax - jmp ia32_ptregs_common - .endm + movq %rsp, %rdi + call do_syscall_32_irqs_off +.Lsyscall_32_done: - PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn - PTREGSCALL stub32_sigreturn, sys32_sigreturn - PTREGSCALL stub32_fork, sys_fork - PTREGSCALL stub32_vfork, sys_vfork + /* Go back to user mode. */ + TRACE_IRQS_ON + SWAPGS + jmp restore_regs_and_iret +END(entry_INT80_compat) ALIGN GLOBAL(stub32_clone) - leaq sys_clone(%rip), %rax /* * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). @@ -593,12 +318,4 @@ GLOBAL(stub32_clone) * so we need to swap arguments here before calling it: */ xchg %r8, %rcx - jmp ia32_ptregs_common - - ALIGN -ia32_ptregs_common: - SAVE_EXTRA_REGS 8 - call *%rax - RESTORE_EXTRA_REGS 8 - ret -END(ia32_ptregs_common) + jmp sys_clone diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 8ea34f94e973..9a6649857106 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -4,24 +4,21 @@ #include <linux/sys.h> #include <linux/cache.h> #include <asm/asm-offsets.h> +#include <asm/syscall.h> #ifdef CONFIG_IA32_EMULATION #define SYM(sym, compat) compat #else #define SYM(sym, compat) sym -#define ia32_sys_call_table sys_call_table -#define __NR_syscall_compat_max __NR_syscall_max #endif -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include <asm/syscalls_32.h> #undef __SYSCALL_I386 #define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), -typedef asmlinkage void (*sys_call_ptr_t)(void); - -extern asmlinkage void sys_ni_syscall(void); +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); __visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = { /* diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 4ac730b37f0b..41283d22be7a 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -14,13 +14,13 @@ # define __SYSCALL_X32(nr, sym, compat) /* nothing */ #endif -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include <asm/syscalls_64.h> #undef __SYSCALL_64 #define __SYSCALL_64(nr, sym, compat) [nr] = sym, -extern void sys_ni_syscall(void); +extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 7663c455b9f6..cb713df81180 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -8,7 +8,7 @@ # 0 i386 restart_syscall sys_restart_syscall 1 i386 exit sys_exit -2 i386 fork sys_fork stub32_fork +2 i386 fork sys_fork sys_fork 3 i386 read sys_read 4 i386 write sys_write 5 i386 open sys_open compat_sys_open @@ -17,7 +17,7 @@ 8 i386 creat sys_creat 9 i386 link sys_link 10 i386 unlink sys_unlink -11 i386 execve sys_execve stub32_execve +11 i386 execve sys_execve compat_sys_execve 12 i386 chdir sys_chdir 13 i386 time sys_time compat_sys_time 14 i386 mknod sys_mknod @@ -125,7 +125,7 @@ 116 i386 sysinfo sys_sysinfo compat_sys_sysinfo 117 i386 ipc sys_ipc compat_sys_ipc 118 i386 fsync sys_fsync -119 i386 sigreturn sys_sigreturn stub32_sigreturn +119 i386 sigreturn sys_sigreturn sys32_sigreturn 120 i386 clone sys_clone stub32_clone 121 i386 setdomainname sys_setdomainname 122 i386 uname sys_newuname @@ -179,7 +179,7 @@ 170 i386 setresgid sys_setresgid16 171 i386 getresgid sys_getresgid16 172 i386 prctl sys_prctl -173 i386 rt_sigreturn sys_rt_sigreturn stub32_rt_sigreturn +173 i386 rt_sigreturn sys_rt_sigreturn sys32_rt_sigreturn 174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction 175 i386 rt_sigprocmask sys_rt_sigprocmask 176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending @@ -196,7 +196,7 @@ 187 i386 sendfile sys_sendfile compat_sys_sendfile 188 i386 getpmsg 189 i386 putpmsg -190 i386 vfork sys_vfork stub32_vfork +190 i386 vfork sys_vfork sys_vfork 191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit 192 i386 mmap2 sys_mmap_pgoff 193 i386 truncate64 sys_truncate64 sys32_truncate64 @@ -364,7 +364,7 @@ 355 i386 getrandom sys_getrandom 356 i386 memfd_create sys_memfd_create 357 i386 bpf sys_bpf -358 i386 execveat sys_execveat stub32_execveat +358 i386 execveat sys_execveat compat_sys_execveat 359 i386 socket sys_socket 360 i386 socketpair sys_socketpair 361 i386 bind sys_bind @@ -382,3 +382,5 @@ 373 i386 shutdown sys_shutdown 374 i386 userfaultfd sys_userfaultfd 375 i386 membarrier sys_membarrier +376 i386 mlock2 sys_mlock2 +377 i386 copy_file_range sys_copy_file_range diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 278842fdf1f6..dc1040a50bdc 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -331,6 +331,8 @@ 322 64 execveat stub_execveat 323 common userfaultfd sys_userfaultfd 324 common membarrier sys_membarrier +325 common mlock2 sys_mlock2 +326 common copy_file_range sys_copy_file_range # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index a3d0767a6b29..c854541d93ff 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -4,6 +4,7 @@ KBUILD_CFLAGS += $(DISABLE_LTO) KASAN_SANITIZE := n +UBSAN_SANITIZE := n VDSO64-$(CONFIG_X86_64) := y VDSOX32-$(CONFIG_X86_X32_ABI) := y @@ -19,9 +20,7 @@ obj-y += vma.o # vDSO images to build vdso_img-$(VDSO64-y) += 64 vdso_img-$(VDSOX32-y) += x32 -vdso_img-$(VDSO32-y) += 32-int80 -vdso_img-$(CONFIG_IA32_EMULATION) += 32-syscall -vdso_img-$(VDSO32-y) += 32-sysenter +vdso_img-$(VDSO32-y) += 32 obj-$(VDSO32-y) += vdso32-setup.o @@ -69,7 +68,7 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ -fno-omit-frame-pointer -foptimize-sibling-calls \ - -DDISABLE_BRANCH_PROFILING + -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO $(vobjs): KBUILD_CFLAGS += $(CFL) @@ -122,15 +121,6 @@ $(obj)/%.so: $(obj)/%.so.dbg $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE $(call if_changed,vdso) -# -# Build multiple 32-bit vDSO images to choose from at boot time. -# -vdso32.so-$(VDSO32-y) += int80 -vdso32.so-$(CONFIG_IA32_EMULATION) += syscall -vdso32.so-$(VDSO32-y) += sysenter - -vdso32-images = $(vdso32.so-y:%=vdso32-%.so) - CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 @@ -139,14 +129,12 @@ VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ targets += vdso32/vdso32.lds -targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o) +targets += vdso32/note.o vdso32/vclock_gettime.o vdso32/system_call.o targets += vdso32/vclock_gettime.o -$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%) - -KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) -$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32 +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO +$(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) +$(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32 KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) @@ -157,13 +145,13 @@ KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING -$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) +$(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) -$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ - $(obj)/vdso32/vdso32.lds \ - $(obj)/vdso32/vclock_gettime.o \ - $(obj)/vdso32/note.o \ - $(obj)/vdso32/%.o +$(obj)/vdso32.so.dbg: FORCE \ + $(obj)/vdso32/vdso32.lds \ + $(obj)/vdso32/vclock_gettime.o \ + $(obj)/vdso32/note.o \ + $(obj)/vdso32/system_call.o $(call if_changed,vdso) # @@ -206,4 +194,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE PHONY += vdso_install $(vdso_img_insttargets) vdso_install: $(vdso_img_insttargets) FORCE -clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so* +clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so* diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index ca94fa649251..1a50e09c945b 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -17,8 +17,10 @@ #include <asm/vvar.h> #include <asm/unistd.h> #include <asm/msr.h> +#include <asm/pvclock.h> #include <linux/math64.h> #include <linux/time.h> +#include <linux/kernel.h> #define gtod (&VVAR(vsyscall_gtod_data)) @@ -36,12 +38,12 @@ static notrace cycle_t vread_hpet(void) } #endif -#ifndef BUILD_VDSO32 +#ifdef CONFIG_PARAVIRT_CLOCK +extern u8 pvclock_page + __attribute__((visibility("hidden"))); +#endif -#include <linux/kernel.h> -#include <asm/vsyscall.h> -#include <asm/fixmap.h> -#include <asm/pvclock.h> +#ifndef BUILD_VDSO32 notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { @@ -60,75 +62,6 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) return ret; } -#ifdef CONFIG_PARAVIRT_CLOCK - -static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) -{ - const struct pvclock_vsyscall_time_info *pvti_base; - int idx = cpu / (PAGE_SIZE/PVTI_SIZE); - int offset = cpu % (PAGE_SIZE/PVTI_SIZE); - - BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END); - - pvti_base = (struct pvclock_vsyscall_time_info *) - __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx); - - return &pvti_base[offset]; -} - -static notrace cycle_t vread_pvclock(int *mode) -{ - const struct pvclock_vsyscall_time_info *pvti; - cycle_t ret; - u64 last; - u32 version; - u8 flags; - unsigned cpu, cpu1; - - - /* - * Note: hypervisor must guarantee that: - * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. - * 2. that per-CPU pvclock time info is updated if the - * underlying CPU changes. - * 3. that version is increased whenever underlying CPU - * changes. - * - */ - do { - cpu = __getcpu() & VGETCPU_CPU_MASK; - /* TODO: We can put vcpu id into higher bits of pvti.version. - * This will save a couple of cycles by getting rid of - * __getcpu() calls (Gleb). - */ - - pvti = get_pvti(cpu); - - version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); - - /* - * Test we're still on the cpu as well as the version. - * We could have been migrated just after the first - * vgetcpu but before fetching the version, so we - * wouldn't notice a version change. - */ - cpu1 = __getcpu() & VGETCPU_CPU_MASK; - } while (unlikely(cpu != cpu1 || - (pvti->pvti.version & 1) || - pvti->pvti.version != version)); - - if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) - *mode = VCLOCK_NONE; - - /* refer to tsc.c read_tsc() comment for rationale */ - last = gtod->cycle_last; - - if (likely(ret >= last)) - return ret; - - return last; -} -#endif #else @@ -162,15 +95,77 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) return ret; } +#endif + #ifdef CONFIG_PARAVIRT_CLOCK +static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void) +{ + return (const struct pvclock_vsyscall_time_info *)&pvclock_page; +} static notrace cycle_t vread_pvclock(int *mode) { - *mode = VCLOCK_NONE; - return 0; -} -#endif + const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti; + cycle_t ret; + u64 tsc, pvti_tsc; + u64 last, delta, pvti_system_time; + u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift; + /* + * Note: The kernel and hypervisor must guarantee that cpu ID + * number maps 1:1 to per-CPU pvclock time info. + * + * Because the hypervisor is entirely unaware of guest userspace + * preemption, it cannot guarantee that per-CPU pvclock time + * info is updated if the underlying CPU changes or that that + * version is increased whenever underlying CPU changes. + * + * On KVM, we are guaranteed that pvti updates for any vCPU are + * atomic as seen by *all* vCPUs. This is an even stronger + * guarantee than we get with a normal seqlock. + * + * On Xen, we don't appear to have that guarantee, but Xen still + * supplies a valid seqlock using the version field. + * + * We only do pvclock vdso timing at all if + * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to + * mean that all vCPUs have matching pvti and that the TSC is + * synced, so we can just look at vCPU 0's pvti. + */ + + do { + version = pvti->version; + + smp_rmb(); + + if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) { + *mode = VCLOCK_NONE; + return 0; + } + + tsc = rdtsc_ordered(); + pvti_tsc_to_system_mul = pvti->tsc_to_system_mul; + pvti_tsc_shift = pvti->tsc_shift; + pvti_system_time = pvti->system_time; + pvti_tsc = pvti->tsc_timestamp; + + /* Make sure that the version double-check is last. */ + smp_rmb(); + } while (unlikely((version & 1) || version != pvti->version)); + + delta = tsc - pvti_tsc; + ret = pvti_system_time + + pvclock_scale_delta(delta, pvti_tsc_to_system_mul, + pvti_tsc_shift); + + /* refer to vread_tsc() comment for rationale */ + last = gtod->cycle_last; + + if (likely(ret >= last)) + return ret; + + return last; +} #endif notrace static cycle_t vread_tsc(void) diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index de2c921025f5..4158acc17df0 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -25,7 +25,7 @@ SECTIONS * segment. */ - vvar_start = . - 2 * PAGE_SIZE; + vvar_start = . - 3 * PAGE_SIZE; vvar_page = vvar_start; /* Place all vvars at the offsets in asm/vvar.h. */ @@ -36,6 +36,7 @@ SECTIONS #undef EMIT_VVAR hpet_page = vvar_start + PAGE_SIZE; + pvclock_page = vvar_start + 2 * PAGE_SIZE; . = SIZEOF_HEADERS; diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 8627db24a7f6..491020b2826d 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -73,6 +73,7 @@ enum { sym_vvar_start, sym_vvar_page, sym_hpet_page, + sym_pvclock_page, sym_VDSO_FAKE_SECTION_TABLE_START, sym_VDSO_FAKE_SECTION_TABLE_END, }; @@ -80,6 +81,7 @@ enum { const int special_pages[] = { sym_vvar_page, sym_hpet_page, + sym_pvclock_page, }; struct vdso_sym { @@ -91,6 +93,7 @@ struct vdso_sym required_syms[] = { [sym_vvar_start] = {"vvar_start", true}, [sym_vvar_page] = {"vvar_page", true}, [sym_hpet_page] = {"hpet_page", true}, + [sym_pvclock_page] = {"pvclock_page", true}, [sym_VDSO_FAKE_SECTION_TABLE_START] = { "VDSO_FAKE_SECTION_TABLE_START", false }, @@ -98,10 +101,10 @@ struct vdso_sym required_syms[] = { "VDSO_FAKE_SECTION_TABLE_END", false }, {"VDSO32_NOTE_MASK", true}, - {"VDSO32_SYSENTER_RETURN", true}, {"__kernel_vsyscall", true}, {"__kernel_sigreturn", true}, {"__kernel_rt_sigreturn", true}, + {"int80_landing_pad", true}, }; __attribute__((format(printf, 1, 2))) __attribute__((noreturn)) diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index e904c270573b..08a317a9ae4b 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -48,35 +48,9 @@ __setup("vdso32=", vdso32_setup); __setup_param("vdso=", vdso_setup, vdso32_setup, 0); #endif -#ifdef CONFIG_X86_64 - -#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) -#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32)) - -#else /* CONFIG_X86_32 */ - -#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) -#define vdso32_syscall() (0) - -#endif /* CONFIG_X86_64 */ - -#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) -const struct vdso_image *selected_vdso32; -#endif - int __init sysenter_setup(void) { -#ifdef CONFIG_COMPAT - if (vdso32_syscall()) - selected_vdso32 = &vdso_image_32_syscall; - else -#endif - if (vdso32_sysenter()) - selected_vdso32 = &vdso_image_32_sysenter; - else - selected_vdso32 = &vdso_image_32_int80; - - init_vdso_image(selected_vdso32); + init_vdso_image(&vdso_image_32); return 0; } diff --git a/arch/x86/entry/vdso/vdso32/int80.S b/arch/x86/entry/vdso/vdso32/int80.S deleted file mode 100644 index b15b7c01aedb..000000000000 --- a/arch/x86/entry/vdso/vdso32/int80.S +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Code for the vDSO. This version uses the old int $0x80 method. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#include "sigreturn.S" - - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - int $0x80 - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI: - .long .LENDCIEDLSI-.LSTARTCIEDLSI -.LSTARTCIEDLSI: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIEDLSI: - .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ -.LSTARTFDEDLSI: - .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 - .align 4 -.LENDFDEDLSI: - .previous - - /* - * Pad out the segment to match the size of the sysenter.S version. - */ -VDSO32_vsyscall_eh_frame_size = 0x40 - .section .data,"aw",@progbits - .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0 - .previous diff --git a/arch/x86/entry/vdso/vdso32/syscall.S b/arch/x86/entry/vdso/vdso32/syscall.S deleted file mode 100644 index 6b286bb5251c..000000000000 --- a/arch/x86/entry/vdso/vdso32/syscall.S +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Code for the vDSO. This version uses the syscall instruction. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#define SYSCALL_ENTER_KERNEL syscall -#include "sigreturn.S" - -#include <asm/segment.h> - - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - push %ebp -.Lpush_ebp: - movl %ecx, %ebp - syscall - movl %ebp, %ecx - popl %ebp -.Lpop_ebp: - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long .LENDCIE-.LSTARTCIE -.LSTARTCIE: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIE: - - .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ -.LSTARTFDE1: - .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 /* Augmentation length */ - /* What follows are the instructions for the table generation. - We have to record all changes of the stack pointer. */ - .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .uleb128 8 - .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */ - .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */ - .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .uleb128 4 - .align 4 -.LENDFDE1: - .previous - - /* - * Pad out the segment to match the size of the sysenter.S version. - */ -VDSO32_vsyscall_eh_frame_size = 0x40 - .section .data,"aw",@progbits - .space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0 - .previous diff --git a/arch/x86/entry/vdso/vdso32/sysenter.S b/arch/x86/entry/vdso/vdso32/sysenter.S deleted file mode 100644 index e354bceee0e0..000000000000 --- a/arch/x86/entry/vdso/vdso32/sysenter.S +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Code for the vDSO. This version uses the sysenter instruction. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#include "sigreturn.S" - -/* - * The caller puts arg2 in %ecx, which gets pushed. The kernel will use - * %ecx itself for arg2. The pushing is because the sysexit instruction - * (found in entry.S) requires that we clobber %ecx with the desired %esp. - * User code might expect that %ecx is unclobbered though, as it would be - * for returning via the iret instruction, so we must push and pop. - * - * The caller puts arg3 in %edx, which the sysexit instruction requires - * for %eip. Thus, exactly as for arg2, we must push and pop. - * - * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter - * instruction clobbers %esp, the user's %esp won't even survive entry - * into the kernel. We store %esp in %ebp. Code in entry.S must fetch - * arg6 from the stack. - * - * You can not use this vsyscall for the clone() syscall because the - * three words on the parent stack do not get copied to the child. - */ - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - push %ecx -.Lpush_ecx: - push %edx -.Lpush_edx: - push %ebp -.Lenter_kernel: - movl %esp,%ebp - sysenter - - /* 7: align return point with nop's to make disassembly easier */ - .space 7,0x90 - - /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ - int $0x80 - /* 16: System call normal return point is here! */ -VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */ - pop %ebp -.Lpop_ebp: - pop %edx -.Lpop_edx: - pop %ecx -.Lpop_ecx: - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI: - .long .LENDCIEDLSI-.LSTARTCIEDLSI -.LSTARTCIEDLSI: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIEDLSI: - .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ -.LSTARTFDEDLSI: - .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 - /* What follows are the instructions for the table generation. - We have to record all changes of the stack pointer. */ - .byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x0c /* RA at offset 12 now */ - .byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x10 /* RA at offset 16 now */ - .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ - /* Finally the epilogue. */ - .byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x0c /* RA at offset 12 now */ - .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x04 /* RA at offset 4 now */ - .align 4 -.LENDFDEDLSI: - .previous - - /* - * Emit a symbol with the size of this .eh_frame data, - * to verify it matches the other versions. - */ -VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI) diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S new file mode 100644 index 000000000000..3a1d9297074b --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -0,0 +1,89 @@ +/* + * AT_SYSINFO entry point +*/ + +#include <asm/dwarf2.h> +#include <asm/cpufeature.h> +#include <asm/alternative-asm.h> + +/* + * First get the common code for the sigreturn entry points. + * This must come first. + */ +#include "sigreturn.S" + + .text + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function + ALIGN +__kernel_vsyscall: + CFI_STARTPROC + /* + * Reshuffle regs so that all of any of the entry instructions + * will preserve enough state. + * + * A really nice entry sequence would be: + * pushl %edx + * pushl %ecx + * movl %esp, %ecx + * + * Unfortunately, naughty Android versions between July and December + * 2015 actually hardcode the traditional Linux SYSENTER entry + * sequence. That is severely broken for a number of reasons (ask + * anyone with an AMD CPU, for example). Nonetheless, we try to keep + * it working approximately as well as it ever worked. + * + * This link may eludicate some of the history: + * https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7 + * personally, I find it hard to understand what's going on there. + * + * Note to future user developers: DO NOT USE SYSENTER IN YOUR CODE. + * Execute an indirect call to the address in the AT_SYSINFO auxv + * entry. That is the ONLY correct way to make a fast 32-bit system + * call on Linux. (Open-coding int $0x80 is also fine, but it's + * slow.) + */ + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + + #define SYSENTER_SEQUENCE "movl %esp, %ebp; sysenter" + #define SYSCALL_SEQUENCE "movl %ecx, %ebp; syscall" + +#ifdef CONFIG_X86_64 + /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */ + ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \ + SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32 +#else + ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP +#endif + + /* Enter using int $0x80 */ + int $0x80 +GLOBAL(int80_landing_pad) + + /* + * Restore EDX and ECX in case they were clobbered. EBP is not + * clobbered (the kernel restores it), but it's cleaner and + * probably faster to pop it than to adjust ESP using addl. + */ + popl %ebp + CFI_RESTORE ebp + CFI_ADJUST_CFA_OFFSET -4 + popl %edx + CFI_RESTORE edx + CFI_ADJUST_CFA_OFFSET -4 + popl %ecx + CFI_RESTORE ecx + CFI_ADJUST_CFA_OFFSET -4 + ret + CFI_ENDPROC + + .size __kernel_vsyscall,.-__kernel_vsyscall + .previous diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c index 175cc72c0f68..87a86e017f0e 100644 --- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c +++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c @@ -14,11 +14,13 @@ */ #undef CONFIG_64BIT #undef CONFIG_X86_64 +#undef CONFIG_PGTABLE_LEVELS #undef CONFIG_ILLEGAL_POINTER_VALUE #undef CONFIG_SPARSEMEM_VMEMMAP #undef CONFIG_NR_CPUS #define CONFIG_X86_32 1 +#define CONFIG_PGTABLE_LEVELS 2 #define CONFIG_PAGE_OFFSET 0 #define CONFIG_ILLEGAL_POINTER_VALUE 0 #define CONFIG_NR_CPUS 1 diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 434543145d78..b8f69e264ac4 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -12,6 +12,7 @@ #include <linux/random.h> #include <linux/elf.h> #include <linux/cpu.h> +#include <asm/pvclock.h> #include <asm/vgtod.h> #include <asm/proto.h> #include <asm/vdso.h> @@ -100,6 +101,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) .name = "[vvar]", .pages = no_pages, }; + struct pvclock_vsyscall_time_info *pvti; if (calculate_addr) { addr = vdso_addr(current->mm->start_stack, @@ -169,6 +171,18 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) } #endif + pvti = pvclock_pvti_cpu0_va(); + if (pvti && image->sym_pvclock_page) { + ret = remap_pfn_range(vma, + text_start + image->sym_pvclock_page, + __pa(pvti) >> PAGE_SHIFT, + PAGE_SIZE, + PAGE_READONLY); + + if (ret) + goto up_fail; + } + up_fail: if (ret) current->mm->context.vdso = NULL; @@ -180,21 +194,10 @@ up_fail: #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static int load_vdso32(void) { - int ret; - if (vdso32_enabled != 1) /* Other values all mean "disabled" */ return 0; - ret = map_vdso(selected_vdso32, false); - if (ret) - return ret; - - if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN) - current_thread_info()->sysenter_return = - current->mm->context.vdso + - selected_vdso32->sym_VDSO32_SYSENTER_RETURN; - - return 0; + return map_vdso(&vdso_image_32, false); } #endif diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index b160c0c6baed..174c2549939d 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -38,7 +38,14 @@ #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" -static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = +#if defined(CONFIG_LEGACY_VSYSCALL_NATIVE) + NATIVE; +#elif defined(CONFIG_LEGACY_VSYSCALL_NONE) + NONE; +#else + EMULATE; +#endif static int __init vsyscall_setup(char *str) { diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index a0a19b7ba22d..0552884da18d 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -26,7 +26,7 @@ #include <asm/ptrace.h> #include <asm/ia32_unistd.h> #include <asm/user32.h> -#include <asm/sigcontext32.h> +#include <uapi/asm/sigcontext.h> #include <asm/proto.h> #include <asm/vdso.h> #include <asm/sigframe.h> @@ -68,7 +68,7 @@ } static int ia32_restore_sigcontext(struct pt_regs *regs, - struct sigcontext_ia32 __user *sc) + struct sigcontext_32 __user *sc) { unsigned int tmpflags, err = 0; void __user *buf; @@ -170,7 +170,7 @@ badframe: * Set up a signal frame. */ -static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, +static int ia32_setup_sigcontext(struct sigcontext_32 __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned int mask) { @@ -234,7 +234,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, unsigned long fx_aligned, math_size; sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); - *fpstate = (struct _fpstate_ia32 __user *) sp; + *fpstate = (struct _fpstate_32 __user *) sp; if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, math_size) < 0) return (void __user *) -1L; @@ -289,7 +289,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, /* Return stub is in 32bit vsyscall page */ if (current->mm->context.vdso) restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_sigreturn; + vdso_image_32.sym___kernel_sigreturn; else restorer = &frame->retcode; } @@ -368,7 +368,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, restorer = ksig->ka.sa.sa_restorer; else restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_rt_sigreturn; + vdso_image_32.sym___kernel_rt_sigreturn; put_user_ex(ptr_to_compat(restorer), &frame->pretcode); /* diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 3a45668f6dc3..94c18ebfd68c 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -32,6 +32,10 @@ #include <asm/mpspec.h> #include <asm/realmode.h> +#ifdef CONFIG_ACPI_APEI +# include <asm/pgtable_types.h> +#endif + #ifdef CONFIG_ACPI extern int acpi_lapic; extern int acpi_ioapic; @@ -147,4 +151,23 @@ extern int x86_acpi_numa_init(void); #define acpi_unlazy_tlb(x) leave_mm(x) +#ifdef CONFIG_ACPI_APEI +static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) +{ + /* + * We currently have no way to look up the EFI memory map + * attributes for a region in a consistent way, because the + * memmap is discarded after efi_free_boot_services(). So if + * you call efi_mem_attributes() during boot and at runtime, + * you could theoretically see different attributes. + * + * Since we are yet to see any x86 platforms that require + * anything other than PAGE_KERNEL (some arm64 platforms + * require the equivalent of PAGE_KERNEL_NOCACHE), return that + * until we know differently. + */ + return PAGE_KERNEL; +} +#endif + #endif /* _ASM_X86_ACPI_H */ diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 1a5da2e63aee..3c56ef1ae068 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -81,7 +81,7 @@ static inline struct amd_northbridge *node_to_amd_nb(int node) return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL; } -static inline u16 amd_get_node_id(struct pci_dev *pdev) +static inline u16 amd_pci_dev_to_node_id(struct pci_dev *pdev) { struct pci_dev *misc; int i; diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index ebf6d5e5668c..c80f6b6f3da2 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -23,6 +23,11 @@ #define APIC_VERBOSE 1 #define APIC_DEBUG 2 +/* Macros for apic_extnmi which controls external NMI masking */ +#define APIC_EXTNMI_BSP 0 /* Default */ +#define APIC_EXTNMI_ALL 1 +#define APIC_EXTNMI_NONE 2 + /* * Define the default level of output to be very little * This can be turned up by using apic=verbose for more @@ -115,6 +120,59 @@ static inline bool apic_is_x2apic_enabled(void) return msr & X2APIC_ENABLE; } +extern void enable_IR_x2apic(void); + +extern int get_physical_broadcast(void); + +extern int lapic_get_maxlvt(void); +extern void clear_local_APIC(void); +extern void disconnect_bsp_APIC(int virt_wire_setup); +extern void disable_local_APIC(void); +extern void lapic_shutdown(void); +extern void sync_Arb_IDs(void); +extern void init_bsp_APIC(void); +extern void setup_local_APIC(void); +extern void init_apic_mappings(void); +void register_lapic_address(unsigned long address); +extern void setup_boot_APIC_clock(void); +extern void setup_secondary_APIC_clock(void); +extern int APIC_init_uniprocessor(void); + +#ifdef CONFIG_X86_64 +static inline int apic_force_enable(unsigned long addr) +{ + return -1; +} +#else +extern int apic_force_enable(unsigned long addr); +#endif + +extern int apic_bsp_setup(bool upmode); +extern void apic_ap_setup(void); + +/* + * On 32bit this is mach-xxx local + */ +#ifdef CONFIG_X86_64 +extern int apic_is_clustered_box(void); +#else +static inline int apic_is_clustered_box(void) +{ + return 0; +} +#endif + +extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); + +#else /* !CONFIG_X86_LOCAL_APIC */ +static inline void lapic_shutdown(void) { } +#define local_apic_timer_c2_ok 1 +static inline void init_apic_mappings(void) { } +static inline void disable_local_APIC(void) { } +# define setup_boot_APIC_clock x86_init_noop +# define setup_secondary_APIC_clock x86_init_noop +#endif /* !CONFIG_X86_LOCAL_APIC */ + #ifdef CONFIG_X86_X2APIC /* * Make previous memory operations globally visible before @@ -186,67 +244,14 @@ static inline int x2apic_enabled(void) } #define x2apic_supported() (cpu_has_x2apic) -#else +#else /* !CONFIG_X86_X2APIC */ static inline void check_x2apic(void) { } static inline void x2apic_setup(void) { } static inline int x2apic_enabled(void) { return 0; } #define x2apic_mode (0) #define x2apic_supported() (0) -#endif - -extern void enable_IR_x2apic(void); - -extern int get_physical_broadcast(void); - -extern int lapic_get_maxlvt(void); -extern void clear_local_APIC(void); -extern void disconnect_bsp_APIC(int virt_wire_setup); -extern void disable_local_APIC(void); -extern void lapic_shutdown(void); -extern void sync_Arb_IDs(void); -extern void init_bsp_APIC(void); -extern void setup_local_APIC(void); -extern void init_apic_mappings(void); -void register_lapic_address(unsigned long address); -extern void setup_boot_APIC_clock(void); -extern void setup_secondary_APIC_clock(void); -extern int APIC_init_uniprocessor(void); - -#ifdef CONFIG_X86_64 -static inline int apic_force_enable(unsigned long addr) -{ - return -1; -} -#else -extern int apic_force_enable(unsigned long addr); -#endif - -extern int apic_bsp_setup(bool upmode); -extern void apic_ap_setup(void); - -/* - * On 32bit this is mach-xxx local - */ -#ifdef CONFIG_X86_64 -extern int apic_is_clustered_box(void); -#else -static inline int apic_is_clustered_box(void) -{ - return 0; -} -#endif - -extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); - -#else /* !CONFIG_X86_LOCAL_APIC */ -static inline void lapic_shutdown(void) { } -#define local_apic_timer_c2_ok 1 -static inline void init_apic_mappings(void) { } -static inline void disable_local_APIC(void) { } -# define setup_boot_APIC_clock x86_init_noop -# define setup_secondary_APIC_clock x86_init_noop -#endif /* !CONFIG_X86_LOCAL_APIC */ +#endif /* !CONFIG_X86_X2APIC */ #ifdef CONFIG_X86_64 #define SET_APIC_ID(x) (apic->set_apic_id(x)) @@ -303,6 +308,7 @@ struct apic { unsigned int *apicid); /* ipi */ + void (*send_IPI)(int cpu, int vector); void (*send_IPI_mask)(const struct cpumask *mask, int vector); void (*send_IPI_mask_allbutself)(const struct cpumask *mask, int vector); diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index fb52aa644aab..3e8674288198 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -3,7 +3,6 @@ #include <linux/compiler.h> #include <linux/types.h> -#include <asm/processor.h> #include <asm/alternative.h> #include <asm/cmpxchg.h> #include <asm/rmwcc.h> @@ -24,7 +23,7 @@ */ static __always_inline int atomic_read(const atomic_t *v) { - return ACCESS_ONCE((v)->counter); + return READ_ONCE((v)->counter); } /** @@ -36,7 +35,7 @@ static __always_inline int atomic_read(const atomic_t *v) */ static __always_inline void atomic_set(atomic_t *v, int i) { - v->counter = i; + WRITE_ONCE(v->counter, i); } /** diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index a11c30b77fb5..a984111135b1 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -3,7 +3,6 @@ #include <linux/compiler.h> #include <linux/types.h> -#include <asm/processor.h> //#include <asm/cmpxchg.h> /* An 64bit atomic type */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 50e33eff58de..037351022f54 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -18,7 +18,7 @@ */ static inline long atomic64_read(const atomic64_t *v) { - return ACCESS_ONCE((v)->counter); + return READ_ONCE((v)->counter); } /** @@ -30,7 +30,7 @@ static inline long atomic64_read(const atomic64_t *v) */ static inline void atomic64_set(atomic64_t *v, long i) { - v->counter = i; + WRITE_ONCE(v->counter, i); } /** diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 0681d2532527..a584e1c50918 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -31,20 +31,10 @@ #endif #define dma_wmb() barrier() -#ifdef CONFIG_SMP -#define smp_mb() mb() -#define smp_rmb() dma_rmb() -#define smp_wmb() barrier() -#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) -#else /* !SMP */ -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) -#endif /* SMP */ - -#define read_barrier_depends() do { } while (0) -#define smp_read_barrier_depends() do { } while (0) +#define __smp_mb() mb() +#define __smp_rmb() dma_rmb() +#define __smp_wmb() barrier() +#define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) #if defined(CONFIG_X86_PPRO_FENCE) @@ -53,31 +43,31 @@ * model and we should fall back to full barriers. */ -#define smp_store_release(p, v) \ +#define __smp_store_release(p, v) \ do { \ compiletime_assert_atomic_type(*p); \ - smp_mb(); \ + __smp_mb(); \ WRITE_ONCE(*p, v); \ } while (0) -#define smp_load_acquire(p) \ +#define __smp_load_acquire(p) \ ({ \ typeof(*p) ___p1 = READ_ONCE(*p); \ compiletime_assert_atomic_type(*p); \ - smp_mb(); \ + __smp_mb(); \ ___p1; \ }) #else /* regular x86 TSO memory ordering */ -#define smp_store_release(p, v) \ +#define __smp_store_release(p, v) \ do { \ compiletime_assert_atomic_type(*p); \ barrier(); \ WRITE_ONCE(*p, v); \ } while (0) -#define smp_load_acquire(p) \ +#define __smp_load_acquire(p) \ ({ \ typeof(*p) ___p1 = READ_ONCE(*p); \ compiletime_assert_atomic_type(*p); \ @@ -88,7 +78,9 @@ do { \ #endif /* Atomic operations are already serializing on x86 */ -#define smp_mb__before_atomic() barrier() -#define smp_mb__after_atomic() barrier() +#define __smp_mb__before_atomic() barrier() +#define __smp_mb__after_atomic() barrier() + +#include <asm-generic/barrier.h> #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 4fa687a47a62..6b8d6e8cd449 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -27,7 +27,7 @@ #define BOOT_HEAP_SIZE 0x400000 #else /* !CONFIG_KERNEL_BZIP2 */ -#define BOOT_HEAP_SIZE 0x8000 +#define BOOT_HEAP_SIZE 0x10000 #endif /* !CONFIG_KERNEL_BZIP2 */ diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h index 0d467b338835..a8303ebe089f 100644 --- a/arch/x86/include/asm/calgary.h +++ b/arch/x86/include/asm/calgary.h @@ -31,7 +31,7 @@ #include <asm/types.h> struct iommu_table { - struct cal_chipset_ops *chip_ops; /* chipset specific funcs */ + const struct cal_chipset_ops *chip_ops; /* chipset specific funcs */ unsigned long it_base; /* mapped address of tce table */ unsigned long it_hint; /* Hint for next alloc */ unsigned long *it_map; /* A simple allocation bitmap for now */ diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index f7e142926481..e4959d023af8 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -109,6 +109,6 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) #endif -#define system_has_cmpxchg_double() cpu_has_cx8 +#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8) #endif /* _ASM_X86_CMPXCHG_32_H */ diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 1af94697aae5..caa23a34c963 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -18,6 +18,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val) cmpxchg_local((ptr), (o), (n)); \ }) -#define system_has_cmpxchg_double() cpu_has_cx16 +#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16) #endif /* _ASM_X86_CMPXCHG_64_H */ diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index bf2caa1dedc5..678637ad7476 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -36,4 +36,7 @@ extern int _debug_hotplug_cpu(int cpu, int action); int mwait_usable(const struct cpuinfo_x86 *); +unsigned int x86_family(unsigned int sig); +unsigned int x86_model(unsigned int sig); +unsigned int x86_stepping(unsigned int sig); #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 9727b3b48bd1..7ad8c9464297 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -12,7 +12,7 @@ #include <asm/disabled-features.h> #endif -#define NCAPINTS 13 /* N 32-bit words worth of info */ +#define NCAPINTS 16 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -181,22 +181,17 @@ /* * Auxiliary flags: Linux defined - For features scattered in various - * CPUID levels like 0x6, 0xA etc, word 7 + * CPUID levels like 0x6, 0xA etc, word 7. + * + * Reuse free bits when adding new feature flags! */ -#define X86_FEATURE_IDA ( 7*32+ 0) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */ + #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -#define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */ + #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -#define X86_FEATURE_HWP ( 7*32+ 10) /* "hwp" Intel HWP */ -#define X86_FEATURE_HWP_NOTIFY ( 7*32+ 11) /* Intel HWP_NOTIFY */ -#define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */ -#define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */ -#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ + #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ /* Virtualization flags: Linux defined, word 8 */ @@ -205,17 +200,9 @@ #define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ #define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ #define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ -#define X86_FEATURE_NPT ( 8*32+ 5) /* AMD Nested Page Table support */ -#define X86_FEATURE_LBRV ( 8*32+ 6) /* AMD LBR Virtualization support */ -#define X86_FEATURE_SVML ( 8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ -#define X86_FEATURE_NRIPS ( 8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ -#define X86_FEATURE_TSCRATEMSR ( 8*32+ 9) /* "tsc_scale" AMD TSC scaling support */ -#define X86_FEATURE_VMCBCLEAN ( 8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID ( 8*32+11) /* AMD flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS ( 8*32+12) /* AMD Decode Assists support */ -#define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */ -#define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */ + #define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ +#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ @@ -255,6 +242,33 @@ /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ #define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ +/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ +#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ + +/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ +#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ +#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ + +/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ +#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ +#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ +#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ + /* * BUG word(s) */ @@ -275,6 +289,26 @@ #include <asm/asm.h> #include <linux/bitops.h> +enum cpuid_leafs +{ + CPUID_1_EDX = 0, + CPUID_8000_0001_EDX, + CPUID_8086_0001_EDX, + CPUID_LNX_1, + CPUID_1_ECX, + CPUID_C000_0001_EDX, + CPUID_8000_0001_ECX, + CPUID_LNX_2, + CPUID_LNX_3, + CPUID_7_0_EBX, + CPUID_D_1_EAX, + CPUID_F_0_EDX, + CPUID_F_1_EDX, + CPUID_8000_0008_EBX, + CPUID_6_EAX, + CPUID_8000_000A_EDX, +}; + #ifdef CONFIG_X86_FEATURE_NAMES extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; @@ -352,60 +386,31 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; } while (0) #define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) -#define cpu_has_de boot_cpu_has(X86_FEATURE_DE) #define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE) #define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC) #define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE) #define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) -#define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP) -#define cpu_has_mtrr boot_cpu_has(X86_FEATURE_MTRR) -#define cpu_has_mmx boot_cpu_has(X86_FEATURE_MMX) #define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) #define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) #define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2) -#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3) -#define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3) #define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) #define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) -#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) -#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) -#define cpu_has_xstore boot_cpu_has(X86_FEATURE_XSTORE) -#define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN) -#define cpu_has_xcrypt boot_cpu_has(X86_FEATURE_XCRYPT) -#define cpu_has_xcrypt_enabled boot_cpu_has(X86_FEATURE_XCRYPT_EN) -#define cpu_has_ace2 boot_cpu_has(X86_FEATURE_ACE2) -#define cpu_has_ace2_enabled boot_cpu_has(X86_FEATURE_ACE2_EN) -#define cpu_has_phe boot_cpu_has(X86_FEATURE_PHE) -#define cpu_has_phe_enabled boot_cpu_has(X86_FEATURE_PHE_EN) -#define cpu_has_pmm boot_cpu_has(X86_FEATURE_PMM) -#define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN) -#define cpu_has_ds boot_cpu_has(X86_FEATURE_DS) -#define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS) #define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLUSH) -#define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS) #define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) #define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) #define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) -#define cpu_has_xmm4_1 boot_cpu_has(X86_FEATURE_XMM4_1) -#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) -#define cpu_has_xsaveopt boot_cpu_has(X86_FEATURE_XSAVEOPT) #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) #define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) -#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) -#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE) -#define cpu_has_perfctr_nb boot_cpu_has(X86_FEATURE_PERFCTR_NB) -#define cpu_has_perfctr_l2 boot_cpu_has(X86_FEATURE_PERFCTR_L2) -#define cpu_has_cx8 boot_cpu_has(X86_FEATURE_CX8) -#define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16) -#define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU) -#define cpu_has_topoext boot_cpu_has(X86_FEATURE_TOPOEXT) -#define cpu_has_bpext boot_cpu_has(X86_FEATURE_BPEXT) - -#if __GNUC__ >= 4 +/* + * Do not add any more of those clumsy macros - use static_cpu_has_safe() for + * fast paths and boot_cpu_has() otherwise! + */ + +#if __GNUC__ >= 4 && defined(CONFIG_X86_FAST_FEATURE_TESTS) extern void warn_pre_alternatives(void); extern bool __static_cpu_has_safe(u16 bit); diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 03dd72957d2f..684ed6c3aa67 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -10,6 +10,16 @@ struct dev_archdata { #endif }; +#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS) +struct dma_domain { + struct list_head node; + struct dma_map_ops *dma_ops; + int domain_nr; +}; +void add_dma_domain(struct dma_domain *domain); +void del_dma_domain(struct dma_domain *domain); +#endif + struct pdev_archdata { }; diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 953b7263f844..3a27b93e6261 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -46,8 +46,6 @@ bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp); #define HAVE_ARCH_DMA_SUPPORTED 1 extern int dma_supported(struct device *hwdev, u64 mask); -#include <asm-generic/dma-mapping-common.h> - extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, struct dma_attrs *attrs); diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h new file mode 100644 index 000000000000..b7a1ab865d68 --- /dev/null +++ b/arch/x86/include/asm/dwarf2.h @@ -0,0 +1,84 @@ +#ifndef _ASM_X86_DWARF2_H +#define _ASM_X86_DWARF2_H + +#ifndef __ASSEMBLY__ +#warning "asm/dwarf2.h should be only included in pure assembly files" +#endif + +/* + * Macros for dwarf2 CFI unwind table entries. + * See "as.info" for details on these pseudo ops. Unfortunately + * they are only supported in very new binutils, so define them + * away for older version. + */ + +#ifdef CONFIG_AS_CFI + +#define CFI_STARTPROC .cfi_startproc +#define CFI_ENDPROC .cfi_endproc +#define CFI_DEF_CFA .cfi_def_cfa +#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register +#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset +#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset +#define CFI_OFFSET .cfi_offset +#define CFI_REL_OFFSET .cfi_rel_offset +#define CFI_REGISTER .cfi_register +#define CFI_RESTORE .cfi_restore +#define CFI_REMEMBER_STATE .cfi_remember_state +#define CFI_RESTORE_STATE .cfi_restore_state +#define CFI_UNDEFINED .cfi_undefined +#define CFI_ESCAPE .cfi_escape + +#ifdef CONFIG_AS_CFI_SIGNAL_FRAME +#define CFI_SIGNAL_FRAME .cfi_signal_frame +#else +#define CFI_SIGNAL_FRAME +#endif + +#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__) +#ifndef BUILD_VDSO + /* + * Emit CFI data in .debug_frame sections, not .eh_frame sections. + * The latter we currently just discard since we don't do DWARF + * unwinding at runtime. So only the offline DWARF information is + * useful to anyone. Note we should not use this directive if + * vmlinux.lds.S gets changed so it doesn't discard .eh_frame. + */ + .cfi_sections .debug_frame +#else + /* + * For the vDSO, emit both runtime unwind information and debug + * symbols for the .dbg file. + */ + .cfi_sections .eh_frame, .debug_frame +#endif +#endif + +#else + +/* + * Due to the structure of pre-exisiting code, don't use assembler line + * comment character # to ignore the arguments. Instead, use a dummy macro. + */ +.macro cfi_ignore a=0, b=0, c=0, d=0 +.endm + +#define CFI_STARTPROC cfi_ignore +#define CFI_ENDPROC cfi_ignore +#define CFI_DEF_CFA cfi_ignore +#define CFI_DEF_CFA_REGISTER cfi_ignore +#define CFI_DEF_CFA_OFFSET cfi_ignore +#define CFI_ADJUST_CFA_OFFSET cfi_ignore +#define CFI_OFFSET cfi_ignore +#define CFI_REL_OFFSET cfi_ignore +#define CFI_REGISTER cfi_ignore +#define CFI_RESTORE cfi_ignore +#define CFI_REMEMBER_STATE cfi_ignore +#define CFI_RESTORE_STATE cfi_ignore +#define CFI_UNDEFINED cfi_ignore +#define CFI_ESCAPE cfi_ignore +#define CFI_SIGNAL_FRAME cfi_ignore + +#endif + +#endif /* _ASM_X86_DWARF2_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index ae68be92f755..0010c78c4998 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -105,6 +105,7 @@ extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable); extern int __init efi_memblock_x86_reserve_range(void); extern pgd_t * __init efi_call_phys_prolog(void); extern void __init efi_call_phys_epilog(pgd_t *save_pgd); +extern void __init efi_print_memmap(void); extern void __init efi_unmap_memmap(void); extern void __init efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 141c561f4664..1514753fd435 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -171,11 +171,11 @@ do { \ static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) { - /* Commented-out registers are cleared in stub_execve */ - /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0; - regs->si = regs->di /*= regs->bp*/ = 0; + /* ax gets execve's return value. */ + /*regs->ax = */ regs->bx = regs->cx = regs->dx = 0; + regs->si = regs->di = regs->bp = 0; regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; - /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/ + regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; t->fs = t->gs = 0; t->fsindex = t->gsindex = 0; t->ds = t->es = ds; @@ -328,7 +328,7 @@ else \ #define VDSO_ENTRY \ ((unsigned long)current->mm->context.vdso + \ - selected_vdso32->sym___kernel_vsyscall) + vdso_image_32.sym___kernel_vsyscall) struct linux_binprm; diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index f80d70009ff8..6d7d0e52ed5a 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -19,7 +19,6 @@ #include <asm/acpi.h> #include <asm/apicdef.h> #include <asm/page.h> -#include <asm/pvclock.h> #ifdef CONFIG_X86_32 #include <linux/threads.h> #include <asm/kmap_types.h> @@ -72,10 +71,6 @@ enum fixed_addresses { #ifdef CONFIG_X86_VSYSCALL_EMULATION VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT, #endif -#ifdef CONFIG_PARAVIRT_CLOCK - PVCLOCK_FIXMAP_BEGIN, - PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1, -#endif #endif FIX_DBGP_BASE, FIX_EARLYCON_MEM_BASE, diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 3c3550c3a4a3..0fd440df63f1 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -42,6 +42,7 @@ extern void fpu__init_cpu_xstate(void); extern void fpu__init_system(struct cpuinfo_x86 *c); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); +extern u64 fpu__get_supported_xfeatures_mask(void); /* * Debugging facility: @@ -224,18 +225,67 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) #define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" #define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" -/* xstate instruction fault handler: */ -#define xstate_fault(__err) \ - \ - ".section .fixup,\"ax\"\n" \ - \ - "3: movl $-2,%[_err]\n" \ - " jmp 2b\n" \ - \ - ".previous\n" \ - \ - _ASM_EXTABLE(1b, 3b) \ - : [_err] "=r" (__err) +#define XSTATE_OP(op, st, lmask, hmask, err) \ + asm volatile("1:" op "\n\t" \ + "xor %[err], %[err]\n" \ + "2:\n\t" \ + ".pushsection .fixup,\"ax\"\n\t" \ + "3: movl $-2,%[err]\n\t" \ + "jmp 2b\n\t" \ + ".popsection\n\t" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact + * format and supervisor states in addition to modified optimization in + * XSAVEOPT. + * + * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT + * supports modified optimization which is not supported by XSAVE. + * + * We use XSAVE as a fallback. + * + * The 661 label is defined in the ALTERNATIVE* macros as the address of the + * original instruction which gets replaced. We need to use it here as the + * address of the instruction where we might get an exception at. + */ +#define XSTATE_XSAVE(st, lmask, hmask, err) \ + asm volatile(ALTERNATIVE_2(XSAVE, \ + XSAVEOPT, X86_FEATURE_XSAVEOPT, \ + XSAVES, X86_FEATURE_XSAVES) \ + "\n" \ + "xor %[err], %[err]\n" \ + "3:\n" \ + ".pushsection .fixup,\"ax\"\n" \ + "4: movl $-2, %[err]\n" \ + "jmp 3b\n" \ + ".popsection\n" \ + _ASM_EXTABLE(661b, 4b) \ + : [err] "=r" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact + * XSAVE area format. + */ +#define XSTATE_XRESTORE(st, lmask, hmask, err) \ + asm volatile(ALTERNATIVE(XRSTOR, \ + XRSTORS, X86_FEATURE_XSAVES) \ + "\n" \ + "xor %[err], %[err]\n" \ + "3:\n" \ + ".pushsection .fixup,\"ax\"\n" \ + "4: movl $-2, %[err]\n" \ + "jmp 3b\n" \ + ".popsection\n" \ + _ASM_EXTABLE(661b, 4b) \ + : [err] "=r" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") /* * This function is called only during boot time when x86 caps are not set @@ -246,22 +296,14 @@ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) u64 mask = -1; u32 lmask = mask; u32 hmask = mask >> 32; - int err = 0; + int err; WARN_ON(system_state != SYSTEM_BOOTING); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - asm volatile("1:"XSAVES"\n\t" - "2:\n\t" - xstate_fault(err) - : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) - : "memory"); + if (static_cpu_has_safe(X86_FEATURE_XSAVES)) + XSTATE_OP(XSAVES, xstate, lmask, hmask, err); else - asm volatile("1:"XSAVE"\n\t" - "2:\n\t" - xstate_fault(err) - : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) - : "memory"); + XSTATE_OP(XSAVE, xstate, lmask, hmask, err); /* We should never fault when copying to a kernel buffer: */ WARN_ON_FPU(err); @@ -276,22 +318,14 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) u64 mask = -1; u32 lmask = mask; u32 hmask = mask >> 32; - int err = 0; + int err; WARN_ON(system_state != SYSTEM_BOOTING); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - asm volatile("1:"XRSTORS"\n\t" - "2:\n\t" - xstate_fault(err) - : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) - : "memory"); + if (static_cpu_has_safe(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else - asm volatile("1:"XRSTOR"\n\t" - "2:\n\t" - xstate_fault(err) - : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) - : "memory"); + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); /* We should never fault when copying from a kernel buffer: */ WARN_ON_FPU(err); @@ -305,33 +339,11 @@ static inline void copy_xregs_to_kernel(struct xregs_state *xstate) u64 mask = -1; u32 lmask = mask; u32 hmask = mask >> 32; - int err = 0; + int err; WARN_ON(!alternatives_patched); - /* - * If xsaves is enabled, xsaves replaces xsaveopt because - * it supports compact format and supervisor states in addition to - * modified optimization in xsaveopt. - * - * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave - * because xsaveopt supports modified optimization which is not - * supported by xsave. - * - * If none of xsaves and xsaveopt is enabled, use xsave. - */ - alternative_input_2( - "1:"XSAVE, - XSAVEOPT, - X86_FEATURE_XSAVEOPT, - XSAVES, - X86_FEATURE_XSAVES, - [xstate] "D" (xstate), "a" (lmask), "d" (hmask) : - "memory"); - asm volatile("2:\n\t" - xstate_fault(err) - : "0" (err) - : "memory"); + XSTATE_XSAVE(xstate, lmask, hmask, err); /* We should never fault when copying to a kernel buffer: */ WARN_ON_FPU(err); @@ -344,23 +356,9 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; - int err = 0; + int err; - /* - * Use xrstors to restore context if it is enabled. xrstors supports - * compacted format of xsave area which is not supported by xrstor. - */ - alternative_input( - "1: " XRSTOR, - XRSTORS, - X86_FEATURE_XSAVES, - "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask) - : "memory"); - - asm volatile("2:\n" - xstate_fault(err) - : "0" (err) - : "memory"); + XSTATE_XRESTORE(xstate, lmask, hmask, err); /* We should never fault when copying from a kernel buffer: */ WARN_ON_FPU(err); @@ -388,12 +386,10 @@ static inline int copy_xregs_to_user(struct xregs_state __user *buf) if (unlikely(err)) return -EFAULT; - __asm__ __volatile__(ASM_STAC "\n" - "1:"XSAVE"\n" - "2: " ASM_CLAC "\n" - xstate_fault(err) - : "D" (buf), "a" (-1), "d" (-1), "0" (err) - : "memory"); + stac(); + XSTATE_OP(XSAVE, buf, -1, -1, err); + clac(); + return err; } @@ -405,14 +401,12 @@ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) struct xregs_state *xstate = ((__force struct xregs_state *)buf); u32 lmask = mask; u32 hmask = mask >> 32; - int err = 0; - - __asm__ __volatile__(ASM_STAC "\n" - "1:"XRSTOR"\n" - "2: " ASM_CLAC "\n" - xstate_fault(err) - : "D" (xstate), "a" (lmask), "d" (hmask), "0" (err) - : "memory"); /* memory required? */ + int err; + + stac(); + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + clac(); + return err; } diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 7358e9d61f1e..0e970d00dfcd 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -5,7 +5,7 @@ #define _ASM_X86_FPU_SIGNAL_H #ifdef CONFIG_X86_64 -# include <asm/sigcontext32.h> +# include <uapi/asm/sigcontext.h> # include <asm/user32.h> struct ksignal; int ia32_setup_rt_frame(int sig, struct ksignal *ksig, diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index c49c5173158e..1c6f6ac52ad0 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -95,63 +95,122 @@ struct swregs_state { /* * List of XSAVE features Linux knows about: */ -enum xfeature_bit { - XSTATE_BIT_FP, - XSTATE_BIT_SSE, - XSTATE_BIT_YMM, - XSTATE_BIT_BNDREGS, - XSTATE_BIT_BNDCSR, - XSTATE_BIT_OPMASK, - XSTATE_BIT_ZMM_Hi256, - XSTATE_BIT_Hi16_ZMM, - - XFEATURES_NR_MAX, +enum xfeature { + XFEATURE_FP, + XFEATURE_SSE, + /* + * Values above here are "legacy states". + * Those below are "extended states". + */ + XFEATURE_YMM, + XFEATURE_BNDREGS, + XFEATURE_BNDCSR, + XFEATURE_OPMASK, + XFEATURE_ZMM_Hi256, + XFEATURE_Hi16_ZMM, + + XFEATURE_MAX, }; -#define XSTATE_FP (1 << XSTATE_BIT_FP) -#define XSTATE_SSE (1 << XSTATE_BIT_SSE) -#define XSTATE_YMM (1 << XSTATE_BIT_YMM) -#define XSTATE_BNDREGS (1 << XSTATE_BIT_BNDREGS) -#define XSTATE_BNDCSR (1 << XSTATE_BIT_BNDCSR) -#define XSTATE_OPMASK (1 << XSTATE_BIT_OPMASK) -#define XSTATE_ZMM_Hi256 (1 << XSTATE_BIT_ZMM_Hi256) -#define XSTATE_Hi16_ZMM (1 << XSTATE_BIT_Hi16_ZMM) +#define XFEATURE_MASK_FP (1 << XFEATURE_FP) +#define XFEATURE_MASK_SSE (1 << XFEATURE_SSE) +#define XFEATURE_MASK_YMM (1 << XFEATURE_YMM) +#define XFEATURE_MASK_BNDREGS (1 << XFEATURE_BNDREGS) +#define XFEATURE_MASK_BNDCSR (1 << XFEATURE_BNDCSR) +#define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK) +#define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256) +#define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) + +#define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) +#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ + | XFEATURE_MASK_ZMM_Hi256 \ + | XFEATURE_MASK_Hi16_ZMM) + +#define FIRST_EXTENDED_XFEATURE XFEATURE_YMM -#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) -#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) +struct reg_128_bit { + u8 regbytes[128/8]; +}; +struct reg_256_bit { + u8 regbytes[256/8]; +}; +struct reg_512_bit { + u8 regbytes[512/8]; +}; /* + * State component 2: + * * There are 16x 256-bit AVX registers named YMM0-YMM15. * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15) - * and are stored in 'struct fxregs_state::xmm_space[]'. + * and are stored in 'struct fxregs_state::xmm_space[]' in the + * "legacy" area. * - * The high 128 bits are stored here: - * 16x 128 bits == 256 bytes. + * The high 128 bits are stored here. */ struct ymmh_struct { - u8 ymmh_space[256]; -}; - -/* We don't support LWP yet: */ -struct lwp_struct { - u8 reserved[128]; -}; + struct reg_128_bit hi_ymm[16]; +} __packed; /* Intel MPX support: */ -struct bndreg { + +struct mpx_bndreg { u64 lower_bound; u64 upper_bound; } __packed; +/* + * State component 3 is used for the 4 128-bit bounds registers + */ +struct mpx_bndreg_state { + struct mpx_bndreg bndreg[4]; +} __packed; -struct bndcsr { +/* + * State component 4 is used for the 64-bit user-mode MPX + * configuration register BNDCFGU and the 64-bit MPX status + * register BNDSTATUS. We call the pair "BNDCSR". + */ +struct mpx_bndcsr { u64 bndcfgu; u64 bndstatus; } __packed; -struct mpx_struct { - struct bndreg bndreg[4]; - struct bndcsr bndcsr; -}; +/* + * The BNDCSR state is padded out to be 64-bytes in size. + */ +struct mpx_bndcsr_state { + union { + struct mpx_bndcsr bndcsr; + u8 pad_to_64_bytes[64]; + }; +} __packed; + +/* AVX-512 Components: */ + +/* + * State component 5 is used for the 8 64-bit opmask registers + * k0-k7 (opmask state). + */ +struct avx_512_opmask_state { + u64 opmask_reg[8]; +} __packed; + +/* + * State component 6 is used for the upper 256 bits of the + * registers ZMM0-ZMM15. These 16 256-bit values are denoted + * ZMM0_H-ZMM15_H (ZMM_Hi256 state). + */ +struct avx_512_zmm_uppers_state { + struct reg_256_bit zmm_upper[16]; +} __packed; + +/* + * State component 7 is used for the 16 512-bit registers + * ZMM16-ZMM31 (Hi16_ZMM state). + */ +struct avx_512_hi16_state { + struct reg_512_bit hi16_zmm[16]; +} __packed; struct xstate_header { u64 xfeatures; @@ -159,22 +218,19 @@ struct xstate_header { u64 reserved[6]; } __attribute__((packed)); -/* New processor state extensions should be added here: */ -#define XSTATE_RESERVE (sizeof(struct ymmh_struct) + \ - sizeof(struct lwp_struct) + \ - sizeof(struct mpx_struct) ) /* * This is our most modern FPU state format, as saved by the XSAVE * and restored by the XRSTOR instructions. * * It consists of a legacy fxregs portion, an xstate header and - * subsequent fixed size areas as defined by the xstate header. - * Not all CPUs support all the extensions. + * subsequent areas as defined by the xstate header. Not all CPUs + * support all the extensions, so the size of the extended area + * can vary quite a bit between CPUs. */ struct xregs_state { struct fxregs_state i387; struct xstate_header header; - u8 __reserved[XSTATE_RESERVE]; + u8 extended_state_area[0]; } __attribute__ ((packed, aligned (64))); /* @@ -182,7 +238,9 @@ struct xregs_state { * put together, so that we can pick the right one runtime. * * The size of the structure is determined by the largest - * member - which is the xsave area: + * member - which is the xsave area. The padding is there + * to ensure that statically-allocated task_structs (just + * the init_task today) have enough space. */ union fpregs_state { struct fregs_state fsave; diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 4656b25bb9a7..af30fdeb140d 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -6,7 +6,7 @@ #include <linux/uaccess.h> /* Bit 63 of XCR0 is reserved for future expansion */ -#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) +#define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63))) #define XSTATE_CPUID 0x0000000d @@ -19,14 +19,19 @@ #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) /* Supported features which support lazy state saving */ -#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ - | XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) +#define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE) /* Supported features which require eager state saving */ -#define XSTATE_EAGER (XSTATE_BNDREGS | XSTATE_BNDCSR) +#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM) /* All currently supported features */ -#define XCNTXT_MASK (XSTATE_LAZY | XSTATE_EAGER) +#define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER) #ifdef CONFIG_X86_64 #define REX_PREFIX "0x48, " @@ -40,6 +45,7 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); +void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 04e9d023168f..1c0b43724ce3 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -68,7 +68,6 @@ void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); -struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 5fa9fb0f8809..cc285ec4b2c1 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -63,10 +63,10 @@ /* hpet memory map physical address */ extern unsigned long hpet_address; extern unsigned long force_hpet_address; -extern int boot_hpet_disable; +extern bool boot_hpet_disable; extern u8 hpet_blockid; -extern int hpet_force_user; -extern u8 hpet_msi_disable; +extern bool hpet_force_user; +extern bool hpet_msi_disable; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern void hpet_disable(void); diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 1e3408e88604..1815b736269d 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -130,6 +130,11 @@ struct irq_alloc_info { char *uv_name; }; #endif +#if IS_ENABLED(CONFIG_VMD) + struct { + struct msi_desc *desc; + }; +#endif }; }; diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index ccffa53750a8..39bcefc20de7 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -60,6 +60,7 @@ struct legacy_pic { void (*mask_all)(void); void (*restore_mask)(void); void (*init)(int auto_eoi); + int (*probe)(void); int (*irq_pending)(unsigned int irq); void (*make_irq)(unsigned int irq); }; diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 28019765442e..a9bdf5569ab3 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -10,7 +10,7 @@ * 32 bit structures for IA32 support. */ -#include <asm/sigcontext32.h> +#include <uapi/asm/sigcontext.h> /* signal.h */ @@ -18,7 +18,7 @@ struct ucontext_ia32 { unsigned int uc_flags; unsigned int uc_link; compat_stack_t uc_stack; - struct sigcontext_ia32 uc_mcontext; + struct sigcontext_32 uc_mcontext; compat_sigset_t uc_sigmask; /* mask last for extensibility */ }; diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h new file mode 100644 index 000000000000..e1a411786bf5 --- /dev/null +++ b/arch/x86/include/asm/intel_pt.h @@ -0,0 +1,10 @@ +#ifndef _ASM_X86_INTEL_PT_H +#define _ASM_X86_INTEL_PT_H + +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) +void cpu_emergency_stop_pt(void); +#else +static inline void cpu_emergency_stop_pt(void) {} +#endif + +#endif /* _ASM_X86_INTEL_PT_H */ diff --git a/arch/x86/include/asm/intel_punit_ipc.h b/arch/x86/include/asm/intel_punit_ipc.h new file mode 100644 index 000000000000..201eb9dce595 --- /dev/null +++ b/arch/x86/include/asm/intel_punit_ipc.h @@ -0,0 +1,101 @@ +#ifndef _ASM_X86_INTEL_PUNIT_IPC_H_ +#define _ASM_X86_INTEL_PUNIT_IPC_H_ + +/* + * Three types of 8bit P-Unit IPC commands are supported, + * bit[7:6]: [00]: BIOS; [01]: GTD; [10]: ISPD. + */ +typedef enum { + BIOS_IPC = 0, + GTDRIVER_IPC, + ISPDRIVER_IPC, + RESERVED_IPC, +} IPC_TYPE; + +#define IPC_TYPE_OFFSET 6 +#define IPC_PUNIT_BIOS_CMD_BASE (BIOS_IPC << IPC_TYPE_OFFSET) +#define IPC_PUNIT_GTD_CMD_BASE (GTDDRIVER_IPC << IPC_TYPE_OFFSET) +#define IPC_PUNIT_ISPD_CMD_BASE (ISPDRIVER_IPC << IPC_TYPE_OFFSET) +#define IPC_PUNIT_CMD_TYPE_MASK (RESERVED_IPC << IPC_TYPE_OFFSET) + +/* BIOS => Pcode commands */ +#define IPC_PUNIT_BIOS_ZERO (IPC_PUNIT_BIOS_CMD_BASE | 0x00) +#define IPC_PUNIT_BIOS_VR_INTERFACE (IPC_PUNIT_BIOS_CMD_BASE | 0x01) +#define IPC_PUNIT_BIOS_READ_PCS (IPC_PUNIT_BIOS_CMD_BASE | 0x02) +#define IPC_PUNIT_BIOS_WRITE_PCS (IPC_PUNIT_BIOS_CMD_BASE | 0x03) +#define IPC_PUNIT_BIOS_READ_PCU_CONFIG (IPC_PUNIT_BIOS_CMD_BASE | 0x04) +#define IPC_PUNIT_BIOS_WRITE_PCU_CONFIG (IPC_PUNIT_BIOS_CMD_BASE | 0x05) +#define IPC_PUNIT_BIOS_READ_PL1_SETTING (IPC_PUNIT_BIOS_CMD_BASE | 0x06) +#define IPC_PUNIT_BIOS_WRITE_PL1_SETTING (IPC_PUNIT_BIOS_CMD_BASE | 0x07) +#define IPC_PUNIT_BIOS_TRIGGER_VDD_RAM (IPC_PUNIT_BIOS_CMD_BASE | 0x08) +#define IPC_PUNIT_BIOS_READ_TELE_INFO (IPC_PUNIT_BIOS_CMD_BASE | 0x09) +#define IPC_PUNIT_BIOS_READ_TELE_TRACE_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0a) +#define IPC_PUNIT_BIOS_WRITE_TELE_TRACE_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0b) +#define IPC_PUNIT_BIOS_READ_TELE_EVENT_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0c) +#define IPC_PUNIT_BIOS_WRITE_TELE_EVENT_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0d) +#define IPC_PUNIT_BIOS_READ_TELE_TRACE (IPC_PUNIT_BIOS_CMD_BASE | 0x0e) +#define IPC_PUNIT_BIOS_WRITE_TELE_TRACE (IPC_PUNIT_BIOS_CMD_BASE | 0x0f) +#define IPC_PUNIT_BIOS_READ_TELE_EVENT (IPC_PUNIT_BIOS_CMD_BASE | 0x10) +#define IPC_PUNIT_BIOS_WRITE_TELE_EVENT (IPC_PUNIT_BIOS_CMD_BASE | 0x11) +#define IPC_PUNIT_BIOS_READ_MODULE_TEMP (IPC_PUNIT_BIOS_CMD_BASE | 0x12) +#define IPC_PUNIT_BIOS_RESERVED (IPC_PUNIT_BIOS_CMD_BASE | 0x13) +#define IPC_PUNIT_BIOS_READ_VOLTAGE_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x14) +#define IPC_PUNIT_BIOS_WRITE_VOLTAGE_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x15) +#define IPC_PUNIT_BIOS_READ_RATIO_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x16) +#define IPC_PUNIT_BIOS_WRITE_RATIO_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x17) +#define IPC_PUNIT_BIOS_READ_VF_GL_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x18) +#define IPC_PUNIT_BIOS_WRITE_VF_GL_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x19) +#define IPC_PUNIT_BIOS_READ_FM_SOC_TEMP_THRESH (IPC_PUNIT_BIOS_CMD_BASE | 0x1a) +#define IPC_PUNIT_BIOS_WRITE_FM_SOC_TEMP_THRESH (IPC_PUNIT_BIOS_CMD_BASE | 0x1b) + +/* GT Driver => Pcode commands */ +#define IPC_PUNIT_GTD_ZERO (IPC_PUNIT_GTD_CMD_BASE | 0x00) +#define IPC_PUNIT_GTD_CONFIG (IPC_PUNIT_GTD_CMD_BASE | 0x01) +#define IPC_PUNIT_GTD_READ_ICCP_LIC_CDYN_SCAL (IPC_PUNIT_GTD_CMD_BASE | 0x02) +#define IPC_PUNIT_GTD_WRITE_ICCP_LIC_CDYN_SCAL (IPC_PUNIT_GTD_CMD_BASE | 0x03) +#define IPC_PUNIT_GTD_GET_WM_VAL (IPC_PUNIT_GTD_CMD_BASE | 0x06) +#define IPC_PUNIT_GTD_WRITE_CONFIG_WISHREQ (IPC_PUNIT_GTD_CMD_BASE | 0x07) +#define IPC_PUNIT_GTD_READ_REQ_DUTY_CYCLE (IPC_PUNIT_GTD_CMD_BASE | 0x16) +#define IPC_PUNIT_GTD_DIS_VOL_FREQ_CHG_REQUEST (IPC_PUNIT_GTD_CMD_BASE | 0x17) +#define IPC_PUNIT_GTD_DYNA_DUTY_CYCLE_CTRL (IPC_PUNIT_GTD_CMD_BASE | 0x1a) +#define IPC_PUNIT_GTD_DYNA_DUTY_CYCLE_TUNING (IPC_PUNIT_GTD_CMD_BASE | 0x1c) + +/* ISP Driver => Pcode commands */ +#define IPC_PUNIT_ISPD_ZERO (IPC_PUNIT_ISPD_CMD_BASE | 0x00) +#define IPC_PUNIT_ISPD_CONFIG (IPC_PUNIT_ISPD_CMD_BASE | 0x01) +#define IPC_PUNIT_ISPD_GET_ISP_LTR_VAL (IPC_PUNIT_ISPD_CMD_BASE | 0x02) +#define IPC_PUNIT_ISPD_ACCESS_IU_FREQ_BOUNDS (IPC_PUNIT_ISPD_CMD_BASE | 0x03) +#define IPC_PUNIT_ISPD_READ_CDYN_LEVEL (IPC_PUNIT_ISPD_CMD_BASE | 0x04) +#define IPC_PUNIT_ISPD_WRITE_CDYN_LEVEL (IPC_PUNIT_ISPD_CMD_BASE | 0x05) + +/* Error codes */ +#define IPC_PUNIT_ERR_SUCCESS 0 +#define IPC_PUNIT_ERR_INVALID_CMD 1 +#define IPC_PUNIT_ERR_INVALID_PARAMETER 2 +#define IPC_PUNIT_ERR_CMD_TIMEOUT 3 +#define IPC_PUNIT_ERR_CMD_LOCKED 4 +#define IPC_PUNIT_ERR_INVALID_VR_ID 5 +#define IPC_PUNIT_ERR_VR_ERR 6 + +#if IS_ENABLED(CONFIG_INTEL_PUNIT_IPC) + +int intel_punit_ipc_simple_command(int cmd, int para1, int para2); +int intel_punit_ipc_command(u32 cmd, u32 para1, u32 para2, u32 *in, u32 *out); + +#else + +static inline int intel_punit_ipc_simple_command(int cmd, + int para1, int para2) +{ + return -ENODEV; +} + +static inline int intel_punit_ipc_command(u32 cmd, u32 para1, u32 para2, + u32 *in, u32 *out) +{ + return -ENODEV; +} + +#endif /* CONFIG_INTEL_PUNIT_IPC */ + +#endif diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h new file mode 100644 index 000000000000..ed65fe701de5 --- /dev/null +++ b/arch/x86/include/asm/intel_telemetry.h @@ -0,0 +1,147 @@ +/* + * Intel SOC Telemetry Driver Header File + * Copyright (C) 2015, Intel Corporation. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#ifndef INTEL_TELEMETRY_H +#define INTEL_TELEMETRY_H + +#define TELEM_MAX_EVENTS_SRAM 28 +#define TELEM_MAX_OS_ALLOCATED_EVENTS 20 + +enum telemetry_unit { + TELEM_PSS = 0, + TELEM_IOSS, + TELEM_UNIT_NONE +}; + +struct telemetry_evtlog { + u32 telem_evtid; + u64 telem_evtlog; +}; + +struct telemetry_evtconfig { + /* Array of Event-IDs to Enable */ + u32 *evtmap; + + /* Number of Events (<29) in evtmap */ + u8 num_evts; + + /* Sampling period */ + u8 period; +}; + +struct telemetry_evtmap { + const char *name; + u32 evt_id; +}; + +struct telemetry_unit_config { + struct telemetry_evtmap *telem_evts; + void __iomem *regmap; + u32 ssram_base_addr; + u8 ssram_evts_used; + u8 curr_period; + u8 max_period; + u8 min_period; + u32 ssram_size; + +}; + +struct telemetry_plt_config { + struct telemetry_unit_config pss_config; + struct telemetry_unit_config ioss_config; + struct mutex telem_trace_lock; + struct mutex telem_lock; + bool telem_in_use; +}; + +struct telemetry_core_ops { + int (*get_sampling_period)(u8 *pss_min_period, u8 *pss_max_period, + u8 *ioss_min_period, u8 *ioss_max_period); + + int (*get_eventconfig)(struct telemetry_evtconfig *pss_evtconfig, + struct telemetry_evtconfig *ioss_evtconfig, + int pss_len, int ioss_len); + + int (*update_events)(struct telemetry_evtconfig pss_evtconfig, + struct telemetry_evtconfig ioss_evtconfig); + + int (*set_sampling_period)(u8 pss_period, u8 ioss_period); + + int (*get_trace_verbosity)(enum telemetry_unit telem_unit, + u32 *verbosity); + + int (*set_trace_verbosity)(enum telemetry_unit telem_unit, + u32 verbosity); + + int (*raw_read_eventlog)(enum telemetry_unit telem_unit, + struct telemetry_evtlog *evtlog, + int len, int log_all_evts); + + int (*read_eventlog)(enum telemetry_unit telem_unit, + struct telemetry_evtlog *evtlog, + int len, int log_all_evts); + + int (*add_events)(u8 num_pss_evts, u8 num_ioss_evts, + u32 *pss_evtmap, u32 *ioss_evtmap); + + int (*reset_events)(void); +}; + +int telemetry_set_pltdata(struct telemetry_core_ops *ops, + struct telemetry_plt_config *pltconfig); + +int telemetry_clear_pltdata(void); + +int telemetry_pltconfig_valid(void); + +int telemetry_get_evtname(enum telemetry_unit telem_unit, + const char **name, int len); + +int telemetry_update_events(struct telemetry_evtconfig pss_evtconfig, + struct telemetry_evtconfig ioss_evtconfig); + +int telemetry_add_events(u8 num_pss_evts, u8 num_ioss_evts, + u32 *pss_evtmap, u32 *ioss_evtmap); + +int telemetry_reset_events(void); + +int telemetry_get_eventconfig(struct telemetry_evtconfig *pss_config, + struct telemetry_evtconfig *ioss_config, + int pss_len, int ioss_len); + +int telemetry_read_events(enum telemetry_unit telem_unit, + struct telemetry_evtlog *evtlog, int len); + +int telemetry_raw_read_events(enum telemetry_unit telem_unit, + struct telemetry_evtlog *evtlog, int len); + +int telemetry_read_eventlog(enum telemetry_unit telem_unit, + struct telemetry_evtlog *evtlog, int len); + +int telemetry_raw_read_eventlog(enum telemetry_unit telem_unit, + struct telemetry_evtlog *evtlog, int len); + +int telemetry_get_sampling_period(u8 *pss_min_period, u8 *pss_max_period, + u8 *ioss_min_period, u8 *ioss_max_period); + +int telemetry_set_sampling_period(u8 pss_period, u8 ioss_period); + +int telemetry_set_trace_verbosity(enum telemetry_unit telem_unit, + u32 verbosity); + +int telemetry_get_trace_verbosity(enum telemetry_unit telem_unit, + u32 *verbosity); + +#endif /* INTEL_TELEMETRY_H */ diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index b72ad0faa6c5..b41ee164930a 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -1,5 +1,5 @@ /* - * iosf_mbi.h: Intel OnChip System Fabric MailBox access support + * Intel OnChip System Fabric MailBox access support */ #ifndef IOSF_MBI_SYMS_H @@ -16,6 +16,18 @@ #define MBI_MASK_LO 0x000000FF #define MBI_ENABLE 0xF0 +/* IOSF SB read/write opcodes */ +#define MBI_MMIO_READ 0x00 +#define MBI_MMIO_WRITE 0x01 +#define MBI_CFG_READ 0x04 +#define MBI_CFG_WRITE 0x05 +#define MBI_CR_READ 0x06 +#define MBI_CR_WRITE 0x07 +#define MBI_REG_READ 0x10 +#define MBI_REG_WRITE 0x11 +#define MBI_ESRAM_READ 0x12 +#define MBI_ESRAM_WRITE 0x13 + /* Baytrail available units */ #define BT_MBI_UNIT_AUNIT 0x00 #define BT_MBI_UNIT_SMC 0x01 @@ -28,50 +40,13 @@ #define BT_MBI_UNIT_SATA 0xA3 #define BT_MBI_UNIT_PCIE 0xA6 -/* Baytrail read/write opcodes */ -#define BT_MBI_AUNIT_READ 0x10 -#define BT_MBI_AUNIT_WRITE 0x11 -#define BT_MBI_SMC_READ 0x10 -#define BT_MBI_SMC_WRITE 0x11 -#define BT_MBI_CPU_READ 0x10 -#define BT_MBI_CPU_WRITE 0x11 -#define BT_MBI_BUNIT_READ 0x10 -#define BT_MBI_BUNIT_WRITE 0x11 -#define BT_MBI_PMC_READ 0x06 -#define BT_MBI_PMC_WRITE 0x07 -#define BT_MBI_GFX_READ 0x00 -#define BT_MBI_GFX_WRITE 0x01 -#define BT_MBI_SMIO_READ 0x06 -#define BT_MBI_SMIO_WRITE 0x07 -#define BT_MBI_USB_READ 0x06 -#define BT_MBI_USB_WRITE 0x07 -#define BT_MBI_SATA_READ 0x00 -#define BT_MBI_SATA_WRITE 0x01 -#define BT_MBI_PCIE_READ 0x00 -#define BT_MBI_PCIE_WRITE 0x01 - /* Quark available units */ #define QRK_MBI_UNIT_HBA 0x00 #define QRK_MBI_UNIT_HB 0x03 #define QRK_MBI_UNIT_RMU 0x04 #define QRK_MBI_UNIT_MM 0x05 -#define QRK_MBI_UNIT_MMESRAM 0x05 #define QRK_MBI_UNIT_SOC 0x31 -/* Quark read/write opcodes */ -#define QRK_MBI_HBA_READ 0x10 -#define QRK_MBI_HBA_WRITE 0x11 -#define QRK_MBI_HB_READ 0x10 -#define QRK_MBI_HB_WRITE 0x11 -#define QRK_MBI_RMU_READ 0x10 -#define QRK_MBI_RMU_WRITE 0x11 -#define QRK_MBI_MM_READ 0x10 -#define QRK_MBI_MM_WRITE 0x11 -#define QRK_MBI_MMESRAM_READ 0x12 -#define QRK_MBI_MMESRAM_WRITE 0x13 -#define QRK_MBI_SOC_READ 0x06 -#define QRK_MBI_SOC_WRITE 0x07 - #if IS_ENABLED(CONFIG_IOSF_MBI) bool iosf_mbi_available(void); diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index 615fa9061b57..cfc9a0d2d07c 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -119,6 +119,8 @@ static inline void native_apic_mem_write(APIC_ICR, cfg); } +extern void default_send_IPI_single(int cpu, int vector); +extern void default_send_IPI_single_phys(int cpu, int vector); extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector); extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 881b4768644a..e7de5c9a4fbd 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -23,11 +23,13 @@ extern void irq_ctx_init(int cpu); #define __ARCH_HAS_DO_SOFTIRQ +struct irq_desc; + #ifdef CONFIG_HOTPLUG_CPU #include <linux/cpumask.h> extern int check_irq_vectors_for_cpu_disable(void); extern void fixup_irqs(void); -extern void irq_force_complete_move(int); +extern void irq_force_complete_move(struct irq_desc *desc); #endif #ifdef CONFIG_HAVE_KVM @@ -37,7 +39,6 @@ extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)); extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); -struct irq_desc; extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs); extern __visible unsigned int do_IRQ(struct pt_regs *regs); diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 046c7fb1ca43..a210eba2727c 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -33,6 +33,11 @@ enum irq_remap_cap { IRQ_POSTING_CAP = 0, }; +struct vcpu_data { + u64 pi_desc_addr; /* Physical address of PI Descriptor */ + u32 vector; /* Guest vector of the interrupt */ +}; + #ifdef CONFIG_IRQ_REMAP extern bool irq_remapping_cap(enum irq_remap_cap cap); @@ -58,11 +63,6 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void) return x86_vector_domain; } -struct vcpu_data { - u64 pi_desc_addr; /* Physical address of PI Descriptor */ - u32 vector; /* Guest vector of the interrupt */ -}; - #else /* CONFIG_IRQ_REMAP */ static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; } diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 5daeca3d0f9e..adc54c12cbd1 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -1,12 +1,18 @@ #ifndef _ASM_X86_JUMP_LABEL_H #define _ASM_X86_JUMP_LABEL_H -#ifndef __ASSEMBLY__ - -#include <linux/stringify.h> -#include <linux/types.h> -#include <asm/nops.h> -#include <asm/asm.h> +#ifndef HAVE_JUMP_LABEL +/* + * For better or for worse, if jump labels (the gcc extension) are missing, + * then the entire static branch patching infrastructure is compiled out. + * If that happens, the code in here will malfunction. Raise a compiler + * error instead. + * + * In theory, jump labels and the static branch patching infrastructure + * could be decoupled to fix this. + */ +#error asm/jump_label.h included on a non-jump-label kernel +#endif #define JUMP_LABEL_NOP_SIZE 5 @@ -16,6 +22,14 @@ # define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC #endif +#include <asm/asm.h> +#include <asm/nops.h> + +#ifndef __ASSEMBLY__ + +#include <linux/stringify.h> +#include <linux/types.h> + static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { asm_volatile_goto("1:" @@ -59,5 +73,40 @@ struct jump_entry { jump_label_t key; }; -#endif /* __ASSEMBLY__ */ +#else /* __ASSEMBLY__ */ + +.macro STATIC_JUMP_IF_TRUE target, key, def +.Lstatic_jump_\@: + .if \def + /* Equivalent to "jmp.d32 \target" */ + .byte 0xe9 + .long \target - .Lstatic_jump_after_\@ +.Lstatic_jump_after_\@: + .else + .byte STATIC_KEY_INIT_NOP + .endif + .pushsection __jump_table, "aw" + _ASM_ALIGN + _ASM_PTR .Lstatic_jump_\@, \target, \key + .popsection +.endm + +.macro STATIC_JUMP_IF_FALSE target, key, def +.Lstatic_jump_\@: + .if \def + .byte STATIC_KEY_INIT_NOP + .else + /* Equivalent to "jmp.d32 \target" */ + .byte 0xe9 + .long \target - .Lstatic_jump_after_\@ +.Lstatic_jump_after_\@: + .endif + .pushsection __jump_table, "aw" + _ASM_ALIGN + _ASM_PTR .Lstatic_jump_\@, \target, \key + 1 + .popsection +.endm + +#endif /* __ASSEMBLY__ */ + #endif diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index b130d59406fb..e5f5dc9787d5 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -29,11 +29,5 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs, extern void __show_regs(struct pt_regs *regs, int all); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); -#ifdef CONFIG_KEXEC_CORE -extern int in_crash_kexec; -#else -/* no crash dump is ever in progress if no crash kernel can be kexec'd */ -#define in_crash_kexec 0 -#endif #endif /* _ASM_X86_KDEBUG_H */ diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e16466ec473c..e9cd7befcb76 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -112,6 +112,16 @@ struct x86_emulate_ops { struct x86_exception *fault); /* + * read_phys: Read bytes of standard (non-emulated/special) memory. + * Used for descriptor reading. + * @addr: [IN ] Physical address from which to read. + * @val: [OUT] Value read from memory. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr, + void *val, unsigned int bytes); + + /* * write_std: Write bytes of standard (non-emulated/special) memory. * Used for descriptor writing. * @addr: [IN ] Linear address to which to write. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3a36ee704c30..44adbb819041 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -24,6 +24,8 @@ #include <linux/perf_event.h> #include <linux/pvclock_gtod.h> #include <linux/clocksource.h> +#include <linux/irqbypass.h> +#include <linux/hyperv.h> #include <asm/pvclock-abi.h> #include <asm/desc.h> @@ -44,6 +46,31 @@ #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS +/* x86-specific vcpu->requests bit members */ +#define KVM_REQ_MIGRATE_TIMER 8 +#define KVM_REQ_REPORT_TPR_ACCESS 9 +#define KVM_REQ_TRIPLE_FAULT 10 +#define KVM_REQ_MMU_SYNC 11 +#define KVM_REQ_CLOCK_UPDATE 12 +#define KVM_REQ_DEACTIVATE_FPU 13 +#define KVM_REQ_EVENT 14 +#define KVM_REQ_APF_HALT 15 +#define KVM_REQ_STEAL_UPDATE 16 +#define KVM_REQ_NMI 17 +#define KVM_REQ_PMU 18 +#define KVM_REQ_PMI 19 +#define KVM_REQ_SMI 20 +#define KVM_REQ_MASTERCLOCK_UPDATE 21 +#define KVM_REQ_MCLOCK_INPROGRESS 22 +#define KVM_REQ_SCAN_IOAPIC 23 +#define KVM_REQ_GLOBAL_CLOCK_UPDATE 24 +#define KVM_REQ_APIC_PAGE_RELOAD 25 +#define KVM_REQ_HV_CRASH 26 +#define KVM_REQ_IOAPIC_EOI_EXIT 27 +#define KVM_REQ_HV_RESET 28 +#define KVM_REQ_HV_EXIT 29 +#define KVM_REQ_HV_STIMER 30 + #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ @@ -176,6 +203,8 @@ enum { */ #define KVM_APIC_PV_EOI_PENDING 1 +struct kvm_kernel_irq_routing_entry; + /* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. @@ -210,6 +239,10 @@ union kvm_mmu_page_role { }; }; +struct kvm_rmap_head { + unsigned long val; +}; + struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; @@ -227,7 +260,7 @@ struct kvm_mmu_page { bool unsync; int root_count; /* Currently serving as active root */ unsigned int unsync_children; - unsigned long parent_ptes; /* Reverse mapping for parent_pte */ + struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ /* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */ unsigned long mmu_valid_gen; @@ -371,9 +404,38 @@ struct kvm_mtrr { struct list_head head; }; +/* Hyper-V SynIC timer */ +struct kvm_vcpu_hv_stimer { + struct hrtimer timer; + int index; + u64 config; + u64 count; + u64 exp_time; + struct hv_message msg; + bool msg_pending; +}; + +/* Hyper-V synthetic interrupt controller (SynIC)*/ +struct kvm_vcpu_hv_synic { + u64 version; + u64 control; + u64 msg_page; + u64 evt_page; + atomic64_t sint[HV_SYNIC_SINT_COUNT]; + atomic_t sint_to_gsi[HV_SYNIC_SINT_COUNT]; + DECLARE_BITMAP(auto_eoi_bitmap, 256); + DECLARE_BITMAP(vec_bitmap, 256); + bool active; +}; + /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { u64 hv_vapic; + s64 runtime_offset; + struct kvm_vcpu_hv_synic synic; + struct kvm_hyperv_exit exit; + struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT]; + DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); }; struct kvm_vcpu_arch { @@ -396,6 +458,8 @@ struct kvm_vcpu_arch { u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ + bool apicv_active; + DECLARE_BITMAP(ioapic_handled_vectors, 256); unsigned long apic_attention; int32_t apic_arb_prio; int mp_state; @@ -500,6 +564,7 @@ struct kvm_vcpu_arch { u32 virtual_tsc_mult; u32 virtual_tsc_khz; s64 ia32_tsc_adjust_msr; + u64 tsc_scaling_ratio; atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ @@ -573,6 +638,9 @@ struct kvm_vcpu_arch { struct { bool pv_unhalted; } pv; + + int pending_ioapic_eoi; + int pending_external_vector; }; struct kvm_lpage_info { @@ -580,7 +648,7 @@ struct kvm_lpage_info { }; struct kvm_arch_memory_slot { - unsigned long *rmap[KVM_NR_PAGE_SIZES]; + struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; }; @@ -683,6 +751,9 @@ struct kvm_arch { u32 bsp_vcpu_id; u64 disabled_quirks; + + bool irqchip_split; + u8 nr_reserved_ioapic_pins; }; struct kvm_vm_stat { @@ -766,7 +837,7 @@ struct kvm_x86_ops { void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); - void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); + void (*update_bp_intercept)(struct kvm_vcpu *vcpu); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); @@ -819,7 +890,8 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - int (*vm_has_apicv)(struct kvm *kvm); + bool (*get_enable_apicv)(void); + void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm *kvm, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); @@ -833,7 +905,7 @@ struct kvm_x86_ops { int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); bool (*invpcid_supported)(void); - void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); + void (*adjust_tsc_offset_guest)(struct kvm_vcpu *vcpu, s64 adjustment); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -841,11 +913,9 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale); u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc); void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); @@ -887,6 +957,20 @@ struct kvm_x86_ops { gfn_t offset, unsigned long mask); /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; + + /* + * Architecture specific hooks for vCPU blocking due to + * HLT instruction. + * Returns for .pre_block(): + * - 0 means continue to block the vCPU. + * - 1 means we cannot block the vCPU since some event + * happens during this period, such as, 'ON' bit in + * posted-interrupts descriptor is set. + */ + int (*pre_block)(struct kvm_vcpu *vcpu); + void (*post_block)(struct kvm_vcpu *vcpu); + int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); }; struct kvm_arch_async_pf { @@ -898,17 +982,6 @@ struct kvm_arch_async_pf { extern struct kvm_x86_ops *kvm_x86_ops; -static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, - s64 adjustment) -{ - kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false); -} - -static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) -{ - kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true); -} - int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); @@ -961,10 +1034,12 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); /* control of guest tsc rate supported? */ extern bool kvm_has_tsc_control; -/* minimum supported tsc_khz for guests */ -extern u32 kvm_min_guest_tsc_khz; /* maximum supported tsc_khz for guests */ extern u32 kvm_max_guest_tsc_khz; +/* number of bits of the fractional part of the TSC scaling ratio */ +extern u8 kvm_tsc_scaling_ratio_frac_bits; +/* maximum allowed value of TSC scaling ratio */ +extern u64 kvm_max_tsc_scaling_ratio; enum emulation_result { EMULATE_DONE, /* no further processing */ @@ -1071,6 +1146,8 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); +void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu); + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, @@ -1210,9 +1287,15 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); +u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); + unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); +void kvm_make_mclock_inprogress_request(struct kvm *kvm); +void kvm_make_scan_ioapic_request(struct kvm *kvm); + void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, @@ -1231,4 +1314,13 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu); + +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq); + +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 3bbc07a57a31..73d0c9b92087 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -12,7 +12,9 @@ #define GUEST_PL 1 /* Page for Switcher text itself, then two pages per cpu */ -#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids) +#define SWITCHER_TEXT_PAGES (1) +#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids) +#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES) /* Where we map the Switcher, in both Host and Guest. */ extern unsigned long switcher_addr; diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 2dbc0bf2b9f3..2ea4527e462f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -123,19 +123,27 @@ struct mca_config { }; struct mce_vendor_flags { - /* - * overflow recovery cpuid bit indicates that overflow - * conditions are not fatal - */ - __u64 overflow_recov : 1, - - /* - * SUCCOR stands for S/W UnCorrectable error COntainment - * and Recovery. It indicates support for data poisoning - * in HW and deferred error interrupts. - */ - succor : 1, - __reserved_0 : 62; + /* + * Indicates that overflow conditions are not fatal, when set. + */ + __u64 overflow_recov : 1, + + /* + * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and + * Recovery. It indicates support for data poisoning in HW and deferred + * error interrupts. + */ + succor : 1, + + /* + * (AMD) SMCA: This bit indicates support for Scalable MCA which expands + * the register space for each MCA bank and also increases number of + * banks. Also, to accommodate the new banks and registers, the MCA + * register space is moved to a new MSR range. + */ + smca : 1, + + __reserved_0 : 61; }; extern struct mce_vendor_flags mce_flags; diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 9e6278c7140e..1e1b07a5a738 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_MICROCODE_H #define _ASM_X86_MICROCODE_H +#include <asm/cpu.h> #include <linux/earlycpio.h> #define native_rdmsr(msr, val1, val2) \ @@ -27,7 +28,6 @@ struct cpu_signature { struct device; enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; -extern bool dis_ucode_ldr; struct microcode_ops { enum ucode_state (*request_microcode_user) (int cpu, @@ -55,6 +55,12 @@ struct ucode_cpu_info { }; extern struct ucode_cpu_info ucode_cpu_info[]; +#ifdef CONFIG_MICROCODE +int __init microcode_init(void); +#else +static inline int __init microcode_init(void) { return 0; }; +#endif + #ifdef CONFIG_MICROCODE_INTEL extern struct microcode_ops * __init init_intel_microcode(void); #else @@ -75,7 +81,6 @@ static inline struct microcode_ops * __init init_amd_microcode(void) static inline void __exit exit_amd_microcode(void) {} #endif -#ifdef CONFIG_MICROCODE_EARLY #define MAX_UCODE_COUNT 128 #define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) @@ -91,14 +96,14 @@ static inline void __exit exit_amd_microcode(void) {} /* * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. - * x86_vendor() gets vendor id for BSP. + * x86_cpuid_vendor() gets vendor id for BSP. * * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify - * coding, we still use x86_vendor() to get vendor id for AP. + * coding, we still use x86_cpuid_vendor() to get vendor id for AP. * - * x86_vendor() gets vendor information directly from CPUID. + * x86_cpuid_vendor() gets vendor information directly from CPUID. */ -static inline int x86_vendor(void) +static inline int x86_cpuid_vendor(void) { u32 eax = 0x00000000; u32 ebx, ecx = 0, edx; @@ -114,58 +119,28 @@ static inline int x86_vendor(void) return X86_VENDOR_UNKNOWN; } -static inline unsigned int __x86_family(unsigned int sig) -{ - unsigned int x86; - - x86 = (sig >> 8) & 0xf; - - if (x86 == 0xf) - x86 += (sig >> 20) & 0xff; - - return x86; -} - -static inline unsigned int x86_family(void) +static inline unsigned int x86_cpuid_family(void) { u32 eax = 0x00000001; u32 ebx, ecx = 0, edx; native_cpuid(&eax, &ebx, &ecx, &edx); - return __x86_family(eax); -} - -static inline unsigned int x86_model(unsigned int sig) -{ - unsigned int x86, model; - - x86 = __x86_family(sig); - - model = (sig >> 4) & 0xf; - - if (x86 == 0x6 || x86 == 0xf) - model += ((sig >> 16) & 0xf) << 4; - - return model; + return x86_family(eax); } +#ifdef CONFIG_MICROCODE extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); extern int __init save_microcode_in_initrd(void); void reload_early_microcode(void); extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); #else -static inline void __init load_ucode_bsp(void) {} -static inline void load_ucode_ap(void) {} -static inline int __init save_microcode_in_initrd(void) -{ - return 0; -} -static inline void reload_early_microcode(void) {} -static inline bool get_builtin_firmware(struct cpio_data *cd, const char *name) -{ - return false; -} +static inline void __init load_ucode_bsp(void) { } +static inline void load_ucode_ap(void) { } +static inline int __init save_microcode_in_initrd(void) { return 0; } +static inline void reload_early_microcode(void) { } +static inline bool +get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; } #endif #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index ac6d328977a6..adfc847a395e 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -64,7 +64,7 @@ extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, s #define PATCH_MAX_SIZE PAGE_SIZE extern u8 amd_ucode_patch[PATCH_MAX_SIZE]; -#ifdef CONFIG_MICROCODE_AMD_EARLY +#ifdef CONFIG_MICROCODE_AMD extern void __init load_ucode_amd_bsp(unsigned int family); extern void load_ucode_amd_ap(void); extern int __init save_microcode_in_initrd_amd(void); @@ -76,4 +76,5 @@ static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; } void reload_ucode_amd(void) {} #endif +extern bool check_current_patch_level(u32 *rev, bool early); #endif /* _ASM_X86_MICROCODE_AMD_H */ diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 7991c606125d..8559b0102ea1 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -57,7 +57,7 @@ extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev); extern int microcode_sanity_check(void *mc, int print_err); extern int find_matching_signature(void *mc, unsigned int csig, int cpf); -#ifdef CONFIG_MICROCODE_INTEL_EARLY +#ifdef CONFIG_MICROCODE_INTEL extern void __init load_ucode_intel_bsp(void); extern void load_ucode_intel_ap(void); extern void show_ucode_info_early(void); @@ -71,13 +71,9 @@ static inline int __init save_microcode_in_initrd_intel(void) { return -EINVAL; static inline void reload_ucode_intel(void) {} #endif -#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) +#ifdef CONFIG_HOTPLUG_CPU extern int save_mc_for_early(u8 *mc); #else -static inline int save_mc_for_early(u8 *mc) -{ - return 0; -} +static inline int save_mc_for_early(u8 *mc) { return 0; } #endif - #endif /* _ASM_X86_MICROCODE_INTEL_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 379cd3658799..bfd9b2a35a0b 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -116,8 +116,36 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, #endif cpumask_set_cpu(cpu, mm_cpumask(next)); - /* Re-load page tables */ + /* + * Re-load page tables. + * + * This logic has an ordering constraint: + * + * CPU 0: Write to a PTE for 'next' + * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. + * CPU 1: set bit 1 in next's mm_cpumask + * CPU 1: load from the PTE that CPU 0 writes (implicit) + * + * We need to prevent an outcome in which CPU 1 observes + * the new PTE value and CPU 0 observes bit 1 clear in + * mm_cpumask. (If that occurs, then the IPI will never + * be sent, and CPU 0's TLB will contain a stale entry.) + * + * The bad outcome can occur if either CPU's load is + * reordered before that CPU's store, so both CPUs must + * execute full barriers to prevent this from happening. + * + * Thus, switch_mm needs a full barrier between the + * store to mm_cpumask and any operation that could load + * from next->pgd. TLB fills are special and can happen + * due to instruction fetches or for no reason at all, + * and neither LOCK nor MFENCE orders them. + * Fortunately, load_cr3() is serializing and gives the + * ordering guarantee we need. + * + */ load_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); /* Stop flush ipis for the previous mm */ @@ -156,10 +184,14 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, * schedule, protecting us from simultaneous changes. */ cpumask_set_cpu(cpu, mm_cpumask(next)); + /* * We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. + * + * As above, load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask write. */ load_cr3(next->pgd); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h index 93724cc62177..eb4b09b41df5 100644 --- a/arch/x86/include/asm/msi.h +++ b/arch/x86/include/asm/msi.h @@ -1,7 +1,13 @@ #ifndef _ASM_X86_MSI_H #define _ASM_X86_MSI_H #include <asm/hw_irq.h> +#include <asm/irqdomain.h> typedef struct irq_alloc_info msi_alloc_info_t; +int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, + msi_alloc_info_t *arg); + +void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc); + #endif /* _ASM_X86_MSI_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b8c14bb7fc8f..b05402ef3b84 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -35,7 +35,7 @@ #define MSR_IA32_PERFCTR0 0x000000c1 #define MSR_IA32_PERFCTR1 0x000000c2 #define MSR_FSB_FREQ 0x000000cd -#define MSR_NHM_PLATFORM_INFO 0x000000ce +#define MSR_PLATFORM_INFO 0x000000ce #define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) @@ -44,7 +44,6 @@ #define SNB_C1_AUTO_UNDEMOTE (1UL << 27) #define SNB_C3_AUTO_UNDEMOTE (1UL << 28) -#define MSR_PLATFORM_INFO 0x000000ce #define MSR_MTRRcap 0x000000fe #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e @@ -206,6 +205,13 @@ #define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0 #define MSR_RING_PERF_LIMIT_REASONS 0x000006B1 +/* Config TDP MSRs */ +#define MSR_CONFIG_TDP_NOMINAL 0x00000648 +#define MSR_CONFIG_TDP_LEVEL1 0x00000649 +#define MSR_CONFIG_TDP_LEVEL2 0x0000064A +#define MSR_CONFIG_TDP_CONTROL 0x0000064B +#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C + /* Hardware P state interface */ #define MSR_PPERF 0x0000064e #define MSR_PERF_LIMIT_REASONS 0x0000064f @@ -315,6 +321,7 @@ #define MSR_F15H_PERF_CTR 0xc0010201 #define MSR_F15H_NB_PERF_CTL 0xc0010240 #define MSR_F15H_NB_PERF_CTR 0xc0010241 +#define MSR_F15H_IC_CFG 0xc0011021 /* Fam 10h MSRs */ #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 diff --git a/arch/x86/include/asm/msr-trace.h b/arch/x86/include/asm/msr-trace.h new file mode 100644 index 000000000000..7567225747d8 --- /dev/null +++ b/arch/x86/include/asm/msr-trace.h @@ -0,0 +1,57 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM msr + +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE msr-trace + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH asm/ + +#if !defined(_TRACE_MSR_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MSR_H + +#include <linux/tracepoint.h> + +/* + * Tracing for x86 model specific registers. Directly maps to the + * RDMSR/WRMSR instructions. + */ + +DECLARE_EVENT_CLASS(msr_trace_class, + TP_PROTO(unsigned msr, u64 val, int failed), + TP_ARGS(msr, val, failed), + TP_STRUCT__entry( + __field( unsigned, msr ) + __field( u64, val ) + __field( int, failed ) + ), + TP_fast_assign( + __entry->msr = msr; + __entry->val = val; + __entry->failed = failed; + ), + TP_printk("%x, value %llx%s", + __entry->msr, + __entry->val, + __entry->failed ? " #GP" : "") +); + +DEFINE_EVENT(msr_trace_class, read_msr, + TP_PROTO(unsigned msr, u64 val, int failed), + TP_ARGS(msr, val, failed) +); + +DEFINE_EVENT(msr_trace_class, write_msr, + TP_PROTO(unsigned msr, u64 val, int failed), + TP_ARGS(msr, val, failed) +); + +DEFINE_EVENT(msr_trace_class, rdpmc, + TP_PROTO(unsigned msr, u64 val, int failed), + TP_ARGS(msr, val, failed) +); + +#endif /* _TRACE_MSR_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 77d8b284e4a7..93fb7c1cffda 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -32,6 +32,16 @@ struct msr_regs_info { int err; }; +struct saved_msr { + bool valid; + struct msr_info info; +}; + +struct saved_msrs { + unsigned int num; + struct saved_msr *array; +}; + static inline unsigned long long native_read_tscp(unsigned int *aux) { unsigned long low, high; @@ -57,11 +67,34 @@ static inline unsigned long long native_read_tscp(unsigned int *aux) #define EAX_EDX_RET(val, low, high) "=A" (val) #endif +#ifdef CONFIG_TRACEPOINTS +/* + * Be very careful with includes. This header is prone to include loops. + */ +#include <asm/atomic.h> +#include <linux/tracepoint-defs.h> + +extern struct tracepoint __tracepoint_read_msr; +extern struct tracepoint __tracepoint_write_msr; +extern struct tracepoint __tracepoint_rdpmc; +#define msr_tracepoint_active(t) static_key_false(&(t).key) +extern void do_trace_write_msr(unsigned msr, u64 val, int failed); +extern void do_trace_read_msr(unsigned msr, u64 val, int failed); +extern void do_trace_rdpmc(unsigned msr, u64 val, int failed); +#else +#define msr_tracepoint_active(t) false +static inline void do_trace_write_msr(unsigned msr, u64 val, int failed) {} +static inline void do_trace_read_msr(unsigned msr, u64 val, int failed) {} +static inline void do_trace_rdpmc(unsigned msr, u64 val, int failed) {} +#endif + static inline unsigned long long native_read_msr(unsigned int msr) { DECLARE_ARGS(val, low, high); asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr)); + if (msr_tracepoint_active(__tracepoint_read_msr)) + do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), 0); return EAX_EDX_VAL(val, low, high); } @@ -78,6 +111,8 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, _ASM_EXTABLE(2b, 3b) : [err] "=r" (*err), EAX_EDX_RET(val, low, high) : "c" (msr), [fault] "i" (-EIO)); + if (msr_tracepoint_active(__tracepoint_read_msr)) + do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err); return EAX_EDX_VAL(val, low, high); } @@ -85,6 +120,8 @@ static inline void native_write_msr(unsigned int msr, unsigned low, unsigned high) { asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory"); + if (msr_tracepoint_active(__tracepoint_read_msr)) + do_trace_write_msr(msr, ((u64)high << 32 | low), 0); } /* Can be uninlined because referenced by paravirt */ @@ -102,6 +139,8 @@ notrace static inline int native_write_msr_safe(unsigned int msr, : "c" (msr), "0" (low), "d" (high), [fault] "i" (-EIO) : "memory"); + if (msr_tracepoint_active(__tracepoint_read_msr)) + do_trace_write_msr(msr, ((u64)high << 32 | low), err); return err; } @@ -160,6 +199,8 @@ static inline unsigned long long native_read_pmc(int counter) DECLARE_ARGS(val, low, high); asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter)); + if (msr_tracepoint_active(__tracepoint_rdpmc)) + do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0); return EAX_EDX_VAL(val, low, high); } @@ -190,7 +231,7 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high) static inline void wrmsrl(unsigned msr, u64 val) { - native_write_msr(msr, (u32)val, (u32)(val >> 32)); + native_write_msr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32)); } /* wrmsr with exception handling */ diff --git a/arch/x86/include/asm/numachip/numachip.h b/arch/x86/include/asm/numachip/numachip.h index 1c6f7f6212c1..c64373a2d731 100644 --- a/arch/x86/include/asm/numachip/numachip.h +++ b/arch/x86/include/asm/numachip/numachip.h @@ -14,6 +14,7 @@ #ifndef _ASM_X86_NUMACHIP_NUMACHIP_H #define _ASM_X86_NUMACHIP_NUMACHIP_H +extern u8 numachip_system; extern int __init pci_numachip_init(void); #endif /* _ASM_X86_NUMACHIP_NUMACHIP_H */ diff --git a/arch/x86/include/asm/numachip/numachip_csr.h b/arch/x86/include/asm/numachip/numachip_csr.h index 660f843df928..29719eecdc2e 100644 --- a/arch/x86/include/asm/numachip/numachip_csr.h +++ b/arch/x86/include/asm/numachip/numachip_csr.h @@ -14,12 +14,8 @@ #ifndef _ASM_X86_NUMACHIP_NUMACHIP_CSR_H #define _ASM_X86_NUMACHIP_NUMACHIP_CSR_H -#include <linux/numa.h> -#include <linux/percpu.h> +#include <linux/smp.h> #include <linux/io.h> -#include <linux/swab.h> -#include <asm/types.h> -#include <asm/processor.h> #define CSR_NODE_SHIFT 16 #define CSR_NODE_BITS(p) (((unsigned long)(p)) << CSR_NODE_SHIFT) @@ -27,11 +23,8 @@ /* 32K CSR space, b15 indicates geo/non-geo */ #define CSR_OFFSET_MASK 0x7fffUL - -/* Global CSR space covers all 4K possible nodes with 64K CSR space per node */ -#define NUMACHIP_GCSR_BASE 0x3fff00000000ULL -#define NUMACHIP_GCSR_LIM 0x3fff0fffffffULL -#define NUMACHIP_GCSR_SIZE (NUMACHIP_GCSR_LIM - NUMACHIP_GCSR_BASE + 1) +#define CSR_G0_NODE_IDS (0x008 + (0 << 12)) +#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12)) /* * Local CSR space starts in global CSR space with "nodeid" = 0xfff0, however @@ -41,12 +34,7 @@ #define NUMACHIP_LCSR_BASE 0x3ffffe000000ULL #define NUMACHIP_LCSR_LIM 0x3fffffffffffULL #define NUMACHIP_LCSR_SIZE (NUMACHIP_LCSR_LIM - NUMACHIP_LCSR_BASE + 1) - -static inline void *gcsr_address(int node, unsigned long offset) -{ - return __va(NUMACHIP_GCSR_BASE | (1UL << 15) | - CSR_NODE_BITS(node & CSR_NODE_MASK) | (offset & CSR_OFFSET_MASK)); -} +#define NUMACHIP_LAPIC_BITS 8 static inline void *lcsr_address(unsigned long offset) { @@ -54,114 +42,57 @@ static inline void *lcsr_address(unsigned long offset) CSR_NODE_BITS(0xfff0) | (offset & CSR_OFFSET_MASK)); } -static inline unsigned int read_gcsr(int node, unsigned long offset) +static inline unsigned int read_lcsr(unsigned long offset) { - return swab32(readl(gcsr_address(node, offset))); + return swab32(readl(lcsr_address(offset))); } -static inline void write_gcsr(int node, unsigned long offset, unsigned int val) +static inline void write_lcsr(unsigned long offset, unsigned int val) { - writel(swab32(val), gcsr_address(node, offset)); + writel(swab32(val), lcsr_address(offset)); } -static inline unsigned int read_lcsr(unsigned long offset) +/* + * On NumaChip2, local CSR space is 16MB and starts at fixed offset below 4G + */ + +#define NUMACHIP2_LCSR_BASE 0xf0000000UL +#define NUMACHIP2_LCSR_SIZE 0x1000000UL +#define NUMACHIP2_APIC_ICR 0x100000 +#define NUMACHIP2_TIMER_DEADLINE 0x200000 +#define NUMACHIP2_TIMER_INT 0x200008 +#define NUMACHIP2_TIMER_NOW 0x200018 +#define NUMACHIP2_TIMER_RESET 0x200020 + +static inline void __iomem *numachip2_lcsr_address(unsigned long offset) { - return swab32(readl(lcsr_address(offset))); + return (void __iomem *)__va(NUMACHIP2_LCSR_BASE | + (offset & (NUMACHIP2_LCSR_SIZE - 1))); } -static inline void write_lcsr(unsigned long offset, unsigned int val) +static inline u32 numachip2_read32_lcsr(unsigned long offset) { - writel(swab32(val), lcsr_address(offset)); + return readl(numachip2_lcsr_address(offset)); } -/* ========================================================================= */ -/* CSR_G0_STATE_CLEAR */ -/* ========================================================================= */ - -#define CSR_G0_STATE_CLEAR (0x000 + (0 << 12)) -union numachip_csr_g0_state_clear { - unsigned int v; - struct numachip_csr_g0_state_clear_s { - unsigned int _state:2; - unsigned int _rsvd_2_6:5; - unsigned int _lost:1; - unsigned int _rsvd_8_31:24; - } s; -}; - -/* ========================================================================= */ -/* CSR_G0_NODE_IDS */ -/* ========================================================================= */ +static inline u64 numachip2_read64_lcsr(unsigned long offset) +{ + return readq(numachip2_lcsr_address(offset)); +} -#define CSR_G0_NODE_IDS (0x008 + (0 << 12)) -union numachip_csr_g0_node_ids { - unsigned int v; - struct numachip_csr_g0_node_ids_s { - unsigned int _initialid:16; - unsigned int _nodeid:12; - unsigned int _rsvd_28_31:4; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_EXT_IRQ_GEN */ -/* ========================================================================= */ +static inline void numachip2_write32_lcsr(unsigned long offset, u32 val) +{ + writel(val, numachip2_lcsr_address(offset)); +} -#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12)) -union numachip_csr_g3_ext_irq_gen { - unsigned int v; - struct numachip_csr_g3_ext_irq_gen_s { - unsigned int _vector:8; - unsigned int _msgtype:3; - unsigned int _index:5; - unsigned int _destination_apic_id:16; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_EXT_IRQ_STATUS */ -/* ========================================================================= */ - -#define CSR_G3_EXT_IRQ_STATUS (0x034 + (3 << 12)) -union numachip_csr_g3_ext_irq_status { - unsigned int v; - struct numachip_csr_g3_ext_irq_status_s { - unsigned int _result:32; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_EXT_IRQ_DEST */ -/* ========================================================================= */ - -#define CSR_G3_EXT_IRQ_DEST (0x038 + (3 << 12)) -union numachip_csr_g3_ext_irq_dest { - unsigned int v; - struct numachip_csr_g3_ext_irq_dest_s { - unsigned int _irq:8; - unsigned int _rsvd_8_31:24; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_NC_ATT_MAP_SELECT */ -/* ========================================================================= */ - -#define CSR_G3_NC_ATT_MAP_SELECT (0x7fc + (3 << 12)) -union numachip_csr_g3_nc_att_map_select { - unsigned int v; - struct numachip_csr_g3_nc_att_map_select_s { - unsigned int _upper_address_bits:4; - unsigned int _select_ram:4; - unsigned int _rsvd_8_31:24; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_NC_ATT_MAP_SELECT_0-255 */ -/* ========================================================================= */ - -#define CSR_G3_NC_ATT_MAP_SELECT_0 (0x800 + (3 << 12)) +static inline void numachip2_write64_lcsr(unsigned long offset, u64 val) +{ + writeq(val, numachip2_lcsr_address(offset)); +} -#endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */ +static inline unsigned int numachip2_timer(void) +{ + return (smp_processor_id() % 48) << 6; +} +#endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 4edd53b79a81..4928cf0d5af0 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -26,9 +26,6 @@ #define MCE_STACK 4 #define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ -#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) -#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) - /* * Set __PAGE_OFFSET to the most negative possible address + * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index c7c712f2648b..7bd0099384ca 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -5,20 +5,25 @@ #include <linux/types.h> /* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT 12 -#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) +#define PAGE_SHIFT 12 +#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1)) + +#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) +#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) + +#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) #define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) -/* Cast PAGE_MASK to a signed type so that it is sign-extended if +/* Cast *PAGE_MASK to a signed type so that it is sign-extended if virtual addresses are 32-bits but physical addresses are larger (ie, 32-bit PAE). */ #define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK) - -#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) -#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) +#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_PAGE_MASK) & __PHYSICAL_MASK) +#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_PAGE_MASK) & __PHYSICAL_MASK) #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 10d0596433f8..f6192502149e 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -19,6 +19,12 @@ static inline int paravirt_enabled(void) return pv_info.paravirt_enabled; } +static inline int paravirt_has_feature(unsigned int feature) +{ + WARN_ON_ONCE(!pv_info.paravirt_enabled); + return (pv_info.features & feature); +} + static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { @@ -285,15 +291,6 @@ static inline void slow_down_io(void) #endif } -#ifdef CONFIG_SMP -static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, - unsigned long start_esp) -{ - PVOP_VCALL3(pv_apic_ops.startup_ipi_hook, - phys_apicid, start_eip, start_esp); -} -#endif - static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) { @@ -375,23 +372,6 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr, { PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); } -static inline void pmd_update(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp); -} - -static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); -} - -static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp); -} static inline pte_t __pte(pteval_t val) { @@ -922,23 +902,11 @@ extern void default_banner(void); call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) -#define USERGS_SYSRET32 \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \ - CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret32)) - #ifdef CONFIG_X86_32 #define GET_CR0_INTO_EAX \ push %ecx; push %edx; \ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \ pop %edx; pop %ecx - -#define ENABLE_INTERRUPTS_SYSEXIT \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ - CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) - - #else /* !CONFIG_X86_32 */ /* diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 31247b5bff7c..77db5616a473 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -70,9 +70,14 @@ struct pv_info { #endif int paravirt_enabled; + unsigned int features; /* valid only if paravirt_enabled is set */ const char *name; }; +#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x) +/* Supported features */ +#define PV_SUPPORTED_RTC (1<<0) + struct pv_init_ops { /* * Patch may replace one of the defined code sequences with @@ -157,15 +162,6 @@ struct pv_cpu_ops { u64 (*read_pmc)(int counter); -#ifdef CONFIG_X86_32 - /* - * Atomically enable interrupts and return to userspace. This - * is only used in 32-bit kernels. 64-bit kernels use - * usergs_sysret32 instead. - */ - void (*irq_enable_sysexit)(void); -#endif - /* * Switch to usermode gs and return to 64-bit usermode using * sysret. Only used in 64-bit kernels to return to 64-bit @@ -174,14 +170,6 @@ struct pv_cpu_ops { */ void (*usergs_sysret64)(void); - /* - * Switch to usermode gs and return to 32-bit usermode using - * sysret. Used to return to 32-on-64 compat processes. - * Other usermode register state, including %esp, must already - * be restored. - */ - void (*usergs_sysret32)(void); - /* Normal iret. Jump to this with the standard iret stack frame set up. */ void (*iret)(void); @@ -215,14 +203,6 @@ struct pv_irq_ops { #endif }; -struct pv_apic_ops { -#ifdef CONFIG_X86_LOCAL_APIC - void (*startup_ipi_hook)(int phys_apicid, - unsigned long start_eip, - unsigned long start_esp); -#endif -}; - struct pv_mmu_ops { unsigned long (*read_cr2)(void); void (*write_cr2)(unsigned long); @@ -274,12 +254,6 @@ struct pv_mmu_ops { pmd_t *pmdp, pmd_t pmdval); void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); - void (*pte_update_defer)(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); - void (*pmd_update)(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp); - void (*pmd_update_defer)(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp); pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); @@ -354,7 +328,6 @@ struct paravirt_patch_template { struct pv_time_ops pv_time_ops; struct pv_cpu_ops pv_cpu_ops; struct pv_irq_ops pv_irq_ops; - struct pv_apic_ops pv_apic_ops; struct pv_mmu_ops pv_mmu_ops; struct pv_lock_ops pv_lock_ops; }; @@ -364,7 +337,6 @@ extern struct pv_init_ops pv_init_ops; extern struct pv_time_ops pv_time_ops; extern struct pv_cpu_ops pv_cpu_ops; extern struct pv_irq_ops pv_irq_ops; -extern struct pv_apic_ops pv_apic_ops; extern struct pv_mmu_ops pv_mmu_ops; extern struct pv_lock_ops pv_lock_ops; @@ -402,10 +374,8 @@ extern struct pv_lock_ops pv_lock_ops; __visible extern const char start_##ops##_##name[], end_##ops##_##name[]; \ asm(NATIVE_LABEL("start_", ops, name) code NATIVE_LABEL("end_", ops, name)) -unsigned paravirt_patch_nop(void); unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); -unsigned paravirt_patch_ignore(unsigned len); unsigned paravirt_patch_call(void *insnbuf, const void *target, u16 tgt_clobbers, unsigned long addr, u16 site_clobbers, diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index fa1195dae425..46873fbd44e1 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -151,11 +151,11 @@ extern struct list_head pci_mmcfg_list; #define PCI_MMCFG_BUS_OFFSET(bus) ((bus) << 20) /* - * AMD Fam10h CPUs are buggy, and cannot access MMIO config space - * on their northbrige except through the * %eax register. As such, you MUST - * NOT use normal IOMEM accesses, you need to only use the magic mmio-config - * accessor functions. - * In fact just use pci_config_*, nothing else please. + * On AMD Fam10h CPUs, all PCI MMIO configuration space accesses must use + * %eax. No other source or target registers may be used. The following + * mmio_config_* accessors enforce this. See "BIOS and Kernel Developer's + * Guide (BKDG) For AMD Family 10h Processors", rev. 3.48, sec 2.11.1, + * "MMIO Configuration Coding Requirements". */ static inline unsigned char mmio_config_readb(void __iomem *pos) { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 867da5bbb4a3..0687c4748b8f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -19,6 +19,13 @@ #include <asm/x86_init.h> void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); +void ptdump_walk_pgd_level_checkwx(void); + +#ifdef CONFIG_DEBUG_WX +#define debug_checkwx() ptdump_walk_pgd_level_checkwx() +#else +#define debug_checkwx() do { } while (0) +#endif /* * ZERO_PAGE is a global shared page that is always zero: used @@ -62,9 +69,6 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #define pmd_clear(pmd) native_pmd_clear(pmd) #define pte_update(mm, addr, ptep) do { } while (0) -#define pte_update_defer(mm, addr, ptep) do { } while (0) -#define pmd_update(mm, addr, ptep) do { } while (0) -#define pmd_update_defer(mm, addr, ptep) do { } while (0) #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) @@ -142,12 +146,12 @@ static inline unsigned long pte_pfn(pte_t pte) static inline unsigned long pmd_pfn(pmd_t pmd) { - return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; + return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } static inline unsigned long pud_pfn(pud_t pud) { - return (pud_val(pud) & PTE_PFN_MASK) >> PAGE_SHIFT; + return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT; } #define pte_page(pte) pfn_to_page(pte_pfn(pte)) @@ -158,20 +162,22 @@ static inline int pmd_large(pmd_t pte) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int pmd_trans_splitting(pmd_t pmd) -{ - return pmd_val(pmd) & _PAGE_SPLITTING; -} - static inline int pmd_trans_huge(pmd_t pmd) { - return pmd_val(pmd) & _PAGE_PSE; + return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; } static inline int has_transparent_hugepage(void) { return cpu_has_pse; } + +#ifdef __HAVE_ARCH_PTE_DEVMAP +static inline int pmd_devmap(pmd_t pmd) +{ + return !!(pmd_val(pmd) & _PAGE_DEVMAP); +} +#endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline pte_t pte_set_flags(pte_t pte, pteval_t set) @@ -248,6 +254,11 @@ static inline pte_t pte_mkspecial(pte_t pte) return pte_set_flags(pte, _PAGE_SPECIAL); } +static inline pte_t pte_mkdevmap(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP); +} + static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) { pmdval_t v = native_pmd_val(pmd); @@ -267,6 +278,11 @@ static inline pmd_t pmd_mkold(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_ACCESSED); } +static inline pmd_t pmd_mkclean(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_DIRTY); +} + static inline pmd_t pmd_wrprotect(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_RW); @@ -277,6 +293,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); } +static inline pmd_t pmd_mkdevmap(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_DEVMAP); +} + static inline pmd_t pmd_mkhuge(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_PSE); @@ -318,6 +339,16 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pte_t pte_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); +} + #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ /* @@ -379,7 +410,9 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) return __pgprot(preservebits | addbits); } -#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK) +#define pte_pgprot(x) __pgprot(pte_flags(x)) +#define pmd_pgprot(x) __pgprot(pmd_flags(x)) +#define pud_pgprot(x) __pgprot(pud_flags(x)) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) @@ -446,6 +479,13 @@ static inline int pte_present(pte_t a) return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } +#ifdef __HAVE_ARCH_PTE_DEVMAP +static inline int pte_devmap(pte_t a) +{ + return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP; +} +#endif + #define pte_accessible pte_accessible static inline bool pte_accessible(struct mm_struct *mm, pte_t a) { @@ -502,14 +542,15 @@ static inline int pmd_none(pmd_t pmd) static inline unsigned long pmd_page_vaddr(pmd_t pmd) { - return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK); + return (unsigned long)__va(pmd_val(pmd) & pmd_pfn_mask(pmd)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT) +#define pmd_page(pmd) \ + pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT) /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] @@ -570,14 +611,15 @@ static inline int pud_present(pud_t pud) static inline unsigned long pud_page_vaddr(pud_t pud) { - return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK); + return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT) +#define pud_page(pud) \ + pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT) /* Find an entry in the second-level page table.. */ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) @@ -710,14 +752,9 @@ static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr, * updates should either be sets, clears, or set_pte_atomic for P->P * transitions, which means this hook should only be called for user PTEs. * This hook implies a P->P protection or access change has taken place, which - * requires a subsequent TLB flush. The notification can optionally be delayed - * until the TLB flush event by using the pte_update_defer form of the - * interface, but care must be taken to assure that the flush happens while - * still holding the same page table lock so that the shadow and primary pages - * do not become out of sync on SMP. + * requires a subsequent TLB flush. */ #define pte_update(mm, addr, ptep) do { } while (0) -#define pte_update_defer(mm, addr, ptep) do { } while (0) #endif /* @@ -795,10 +832,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); -#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH -extern void pmdp_splitting_flush(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp); - #define __HAVE_ARCH_PMD_WRITE static inline int pmd_write(pmd_t pmd) { @@ -809,9 +842,7 @@ static inline int pmd_write(pmd_t pmd) static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = native_pmdp_get_and_clear(pmdp); - pmd_update(mm, addr, pmdp); - return pmd; + return native_pmdp_get_and_clear(pmdp); } #define __HAVE_ARCH_PMDP_SET_WRPROTECT @@ -819,7 +850,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); - pmd_update(mm, addr, pmdp); } /* diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 13f310bfc09a..4432ab7f407c 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -22,10 +22,11 @@ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 -#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */ #define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ -#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ +#define _PAGE_BIT_SOFTW4 58 /* available for programmer */ +#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ @@ -46,7 +47,6 @@ #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) -#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING) #define __HAVE_ARCH_PTE_SPECIAL #ifdef CONFIG_KMEMCHECK @@ -85,8 +85,11 @@ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) +#define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP) +#define __HAVE_ARCH_PTE_DEVMAP #else #define _PAGE_NX (_AT(pteval_t, 0)) +#define _PAGE_DEVMAP (_AT(pteval_t, 0)) #endif #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) @@ -209,10 +212,10 @@ enum page_cache_mode { #include <linux/types.h> -/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ +/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */ #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) -/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ +/* Extracts the flags from a (pte|pmd|pud|pgd)val_t of a 4KB page */ #define PTE_FLAGS_MASK (~PTE_PFN_MASK) typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; @@ -276,14 +279,40 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) } #endif +static inline pudval_t pud_pfn_mask(pud_t pud) +{ + if (native_pud_val(pud) & _PAGE_PSE) + return PHYSICAL_PUD_PAGE_MASK; + else + return PTE_PFN_MASK; +} + +static inline pudval_t pud_flags_mask(pud_t pud) +{ + return ~pud_pfn_mask(pud); +} + static inline pudval_t pud_flags(pud_t pud) { - return native_pud_val(pud) & PTE_FLAGS_MASK; + return native_pud_val(pud) & pud_flags_mask(pud); +} + +static inline pmdval_t pmd_pfn_mask(pmd_t pmd) +{ + if (native_pmd_val(pmd) & _PAGE_PSE) + return PHYSICAL_PMD_PAGE_MASK; + else + return PTE_PFN_MASK; +} + +static inline pmdval_t pmd_flags_mask(pmd_t pmd) +{ + return ~pmd_pfn_mask(pmd); } static inline pmdval_t pmd_flags(pmd_t pmd) { - return native_pmd_val(pmd) & PTE_FLAGS_MASK; + return native_pmd_val(pmd) & pmd_flags_mask(pmd); } static inline pte_t native_make_pte(pteval_t val) @@ -337,20 +366,18 @@ static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) } static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) { + pgprotval_t val = pgprot_val(pgprot); pgprot_t new; - unsigned long val; - val = pgprot_val(pgprot); pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); return new; } static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) { + pgprotval_t val = pgprot_val(pgprot); pgprot_t new; - unsigned long val; - val = pgprot_val(pgprot); pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | ((val & _PAGE_PAT_LARGE) >> (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h index 7249e6d0902d..5973a2f3db3d 100644 --- a/arch/x86/include/asm/platform_sst_audio.h +++ b/arch/x86/include/asm/platform_sst_audio.h @@ -55,6 +55,7 @@ enum sst_audio_device_id_mrfld { PIPE_MEDIA0_IN = 0x8F, PIPE_MEDIA1_IN = 0x90, PIPE_MEDIA2_IN = 0x91, + PIPE_MEDIA3_IN = 0x9C, PIPE_RSVD = 0xFF, }; diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index d8ce3ec816ab..c57fd1ea9689 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -67,18 +67,19 @@ static inline void arch_wmb_pmem(void) } /** - * __arch_wb_cache_pmem - write back a cache range with CLWB + * arch_wb_cache_pmem - write back a cache range with CLWB * @vaddr: virtual start address * @size: number of bytes to write back * * Write back a cache range using the CLWB (cache line write back) * instruction. This function requires explicit ordering with an - * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation. + * arch_wmb_pmem() call. */ -static inline void __arch_wb_cache_pmem(void *vaddr, size_t size) +static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size) { u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; unsigned long clflush_mask = x86_clflush_size - 1; + void *vaddr = (void __force *)addr; void *vend = vaddr + size; void *p; @@ -115,7 +116,7 @@ static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes, len = copy_from_iter_nocache(vaddr, bytes, i); if (__iter_needs_pmem_wb(i)) - __arch_wb_cache_pmem(vaddr, bytes); + arch_wb_cache_pmem(addr, bytes); return len; } @@ -132,13 +133,8 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size) { void *vaddr = (void __force *)addr; - /* TODO: implement the zeroing via non-temporal writes */ - if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0) - clear_page(vaddr); - else - memset(vaddr, 0, size); - - __arch_wb_cache_pmem(vaddr, size); + memset(vaddr, 0, size); + arch_wb_cache_pmem(addr, size); } static inline bool __arch_has_wmb_pmem(void) diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index b12f81022a6b..01bcde84d3e4 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -30,12 +30,9 @@ static __always_inline void preempt_count_set(int pc) /* * must be macros to avoid header recursion hell */ -#define init_task_preempt_count(p) do { \ - task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \ -} while (0) +#define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ - task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \ per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \ } while (0) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 19577dd325fa..20c11d1aa4cc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -11,7 +11,7 @@ struct vm86; #include <asm/math_emu.h> #include <asm/segment.h> #include <asm/types.h> -#include <asm/sigcontext.h> +#include <uapi/asm/sigcontext.h> #include <asm/current.h> #include <asm/cpufeature.h> #include <asm/page.h> @@ -472,6 +472,7 @@ static inline unsigned long current_top_of_stack(void) #else #define __cpuid native_cpuid #define paravirt_enabled() 0 +#define paravirt_has(x) 0 static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) @@ -556,12 +557,12 @@ static inline unsigned int cpuid_edx(unsigned int op) } /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static inline void rep_nop(void) +static __always_inline void rep_nop(void) { asm volatile("rep; nop" ::: "memory"); } -static inline void cpu_relax(void) +static __always_inline void cpu_relax(void) { rep_nop(); } @@ -765,7 +766,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); * Return saved PC of a blocked thread. * What is this good for? it will be always the scheduler or ret_from_fork. */ -#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8)) +#define thread_saved_pc(t) READ_ONCE_NOCHECK(*(unsigned long *)((t)->thread.sp - 8)) #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) extern unsigned long KSTK_ESP(struct task_struct *task); diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 7a6bed5c08bc..fdcc04020636 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -4,6 +4,15 @@ #include <linux/clocksource.h> #include <asm/pvclock-abi.h> +#ifdef CONFIG_KVM_GUEST +extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void); +#else +static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) +{ + return NULL; +} +#endif + /* some helper functions for xen and kvm pv clock sources */ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); @@ -91,10 +100,5 @@ struct pvclock_vsyscall_time_info { } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) -#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS-1)/(PAGE_SIZE/PVTI_SIZE))+1) - -int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, - int size); -struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu); #endif /* _ASM_X86_PVCLOCK_H */ diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index b002e711ba88..9f92c180ed2f 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -1,6 +1,65 @@ #ifndef __ASM_QSPINLOCK_PARAVIRT_H #define __ASM_QSPINLOCK_PARAVIRT_H +/* + * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit + * registers. For i386, however, only 1 32-bit register needs to be saved + * and restored. So an optimized version of __pv_queued_spin_unlock() is + * hand-coded for 64-bit, but it isn't worthwhile to do it for 32-bit. + */ +#ifdef CONFIG_64BIT + +PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath); +#define __pv_queued_spin_unlock __pv_queued_spin_unlock +#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock" +#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath" + +/* + * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock + * which combines the registers saving trunk and the body of the following + * C code: + * + * void __pv_queued_spin_unlock(struct qspinlock *lock) + * { + * struct __qspinlock *l = (void *)lock; + * u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + * + * if (likely(lockval == _Q_LOCKED_VAL)) + * return; + * pv_queued_spin_unlock_slowpath(lock, lockval); + * } + * + * For x86-64, + * rdi = lock (first argument) + * rsi = lockval (second argument) + * rdx = internal variable (set to 0) + */ +asm (".pushsection .text;" + ".globl " PV_UNLOCK ";" + ".align 4,0x90;" + PV_UNLOCK ": " + "push %rdx;" + "mov $0x1,%eax;" + "xor %edx,%edx;" + "lock cmpxchg %dl,(%rdi);" + "cmp $0x1,%al;" + "jne .slowpath;" + "pop %rdx;" + "ret;" + ".slowpath: " + "push %rsi;" + "movzbl %al,%esi;" + "call " PV_UNLOCK_SLOWPATH ";" + "pop %rsi;" + "pop %rdx;" + "ret;" + ".size " PV_UNLOCK ", .-" PV_UNLOCK ";" + ".popsection"); + +#else /* CONFIG_64BIT */ + +extern void __pv_queued_spin_unlock(struct qspinlock *lock); PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock); +#endif /* CONFIG_64BIT */ #endif diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index a82c4f1b4d83..2cb1cc253d51 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,5 +25,6 @@ void __noreturn machine_real_restart(unsigned int type); typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); +void run_crash_ipi_callback(struct pt_regs *regs); #endif /* _ASM_X86_REBOOT_H */ diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 9dfce4e0417d..e6cd2c489dbb 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -1,79 +1,8 @@ #ifndef _ASM_X86_SIGCONTEXT_H #define _ASM_X86_SIGCONTEXT_H -#include <uapi/asm/sigcontext.h> - -#ifdef __i386__ -struct sigcontext { - unsigned short gs, __gsh; - unsigned short fs, __fsh; - unsigned short es, __esh; - unsigned short ds, __dsh; - unsigned long di; - unsigned long si; - unsigned long bp; - unsigned long sp; - unsigned long bx; - unsigned long dx; - unsigned long cx; - unsigned long ax; - unsigned long trapno; - unsigned long err; - unsigned long ip; - unsigned short cs, __csh; - unsigned long flags; - unsigned long sp_at_signal; - unsigned short ss, __ssh; +/* This is a legacy header - all kernel code includes <uapi/asm/sigcontext.h> directly. */ - /* - * fpstate is really (struct _fpstate *) or (struct _xstate *) - * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved - * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end - * of extended memory layout. See comments at the definition of - * (struct _fpx_sw_bytes) - */ - void __user *fpstate; /* zero when no FPU/extended context */ - unsigned long oldmask; - unsigned long cr2; -}; -#else /* __i386__ */ -struct sigcontext { - unsigned long r8; - unsigned long r9; - unsigned long r10; - unsigned long r11; - unsigned long r12; - unsigned long r13; - unsigned long r14; - unsigned long r15; - unsigned long di; - unsigned long si; - unsigned long bp; - unsigned long bx; - unsigned long dx; - unsigned long ax; - unsigned long cx; - unsigned long sp; - unsigned long ip; - unsigned long flags; - unsigned short cs; - unsigned short gs; - unsigned short fs; - unsigned short __pad0; - unsigned long err; - unsigned long trapno; - unsigned long oldmask; - unsigned long cr2; +#include <uapi/asm/sigcontext.h> - /* - * fpstate is really (struct _fpstate *) or (struct _xstate *) - * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved - * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end - * of extended memory layout. See comments at the definition of - * (struct _fpx_sw_bytes) - */ - void __user *fpstate; /* zero when no FPU/extended context */ - unsigned long reserved1[8]; -}; -#endif /* !__i386__ */ #endif /* _ASM_X86_SIGCONTEXT_H */ diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 1f3175bb994e..34edd1650bae 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_SIGFRAME_H #define _ASM_X86_SIGFRAME_H -#include <asm/sigcontext.h> +#include <uapi/asm/sigcontext.h> #include <asm/siginfo.h> #include <asm/ucontext.h> #include <linux/compat.h> @@ -9,8 +9,6 @@ #ifdef CONFIG_X86_32 #define sigframe_ia32 sigframe #define rt_sigframe_ia32 rt_sigframe -#define sigcontext_ia32 sigcontext -#define _fpstate_ia32 _fpstate #define ucontext_ia32 ucontext #else /* !CONFIG_X86_32 */ @@ -24,7 +22,7 @@ struct sigframe_ia32 { u32 pretcode; int sig; - struct sigcontext_ia32 sc; + struct sigcontext_32 sc; /* * fpstate is unused. fpstate is moved/allocated after * retcode[] below. This movement allows to have the FP state and the @@ -33,7 +31,7 @@ struct sigframe_ia32 { * the offset of extramask[] in the sigframe and thus prevent any * legacy application accessing/modifying it. */ - struct _fpstate_ia32 fpstate_unused; + struct _fpstate_32 fpstate_unused; #ifdef CONFIG_IA32_EMULATION unsigned int extramask[_COMPAT_NSIG_WORDS-1]; #else /* !CONFIG_IA32_EMULATION */ diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index c481be78fcf1..2138c9ae19ee 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -34,7 +34,7 @@ extern void do_signal(struct pt_regs *regs); #define __ARCH_HAS_SA_RESTORER -#include <asm/sigcontext.h> +#include <uapi/asm/sigcontext.h> #ifdef __i386__ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 222a6a3ca2b5..dfcf0727623b 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -21,15 +21,6 @@ extern int smp_num_siblings; extern unsigned int num_processors; -static inline bool cpu_has_ht_siblings(void) -{ - bool has_siblings = false; -#ifdef CONFIG_SMP - has_siblings = cpu_has_ht && smp_num_siblings > 1; -#endif - return has_siblings; -} - DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); /* cpus sharing the last level cache: */ @@ -74,9 +65,6 @@ struct smp_ops { extern void set_cpu_sibling_map(int cpu); #ifdef CONFIG_SMP -#ifndef CONFIG_PARAVIRT -#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0) -#endif extern struct smp_ops smp_ops; static inline void smp_send_stop(void) diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index d1793f06854d..8e9dbe7b73a1 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -15,6 +15,7 @@ struct saved_context { unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; bool misc_enable_saved; + struct saved_msrs saved_msrs; struct desc_ptr gdt_desc; struct desc_ptr idt; u16 ldt; diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 7ebf0ebe4e68..6136a18152af 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -24,6 +24,7 @@ struct saved_context { unsigned long cr0, cr2, cr3, cr4, cr8; u64 misc_enable; bool misc_enable_saved; + struct saved_msrs saved_msrs; unsigned long efer; u16 gdt_pad; /* Unused */ struct desc_ptr gdt_desc; diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index d7f3b3b78ac3..751bf4b7bf11 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -79,12 +79,12 @@ do { \ #else /* CONFIG_X86_32 */ /* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" +#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t" #define __EXTRA_CLOBBER \ , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ - "r12", "r13", "r14", "r15" + "r12", "r13", "r14", "r15", "flags" #ifdef CONFIG_CC_STACKPROTECTOR #define __switch_canary \ @@ -100,7 +100,11 @@ do { \ #define __switch_canary_iparam #endif /* CC_STACKPROTECTOR */ -/* Save restore flags to clear handle leaking NT */ +/* + * There is no need to save or restore flags, because flags are always + * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL + * has no effect. + */ #define switch_to(prev, next, last) \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index d6a756ae04c8..999b7cd2e78c 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -20,9 +20,21 @@ #include <asm/thread_info.h> /* for TS_COMPAT */ #include <asm/unistd.h> -typedef void (*sys_call_ptr_t)(void); +typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); extern const sys_call_ptr_t sys_call_table[]; +#if defined(CONFIG_X86_32) +#define ia32_sys_call_table sys_call_table +#define __NR_syscall_compat_max __NR_syscall_max +#define IA32_NR_syscalls NR_syscalls +#endif + +#if defined(CONFIG_IA32_EMULATION) +extern const sys_call_ptr_t ia32_sys_call_table[]; +#endif + /* * Only the low 32 bits of orig_ax are meaningful, so we return int. * This importantly ignores the high bits on 64-bit, so comparisons diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 8afdc3e44247..c7b551028740 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -57,9 +57,7 @@ struct thread_info { __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ - int saved_preempt_count; mm_segment_t addr_limit; - void __user *sysenter_return; unsigned int sig_on_uaccess_error:1; unsigned int uaccess_err:1; /* uaccess failed */ }; @@ -69,7 +67,6 @@ struct thread_info { .task = &tsk, \ .flags = 0, \ .cpu = 0, \ - .saved_preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ } diff --git a/arch/x86/include/asm/trace/mpx.h b/arch/x86/include/asm/trace/mpx.h index 173dd3ba108c..0f492fc50bce 100644 --- a/arch/x86/include/asm/trace/mpx.h +++ b/arch/x86/include/asm/trace/mpx.h @@ -11,7 +11,7 @@ TRACE_EVENT(mpx_bounds_register_exception, TP_PROTO(void *addr_referenced, - const struct bndreg *bndreg), + const struct mpx_bndreg *bndreg), TP_ARGS(addr_referenced, bndreg), TP_STRUCT__entry( @@ -44,7 +44,7 @@ TRACE_EVENT(mpx_bounds_register_exception, TRACE_EVENT(bounds_exception_mpx, - TP_PROTO(const struct bndcsr *bndcsr), + TP_PROTO(const struct mpx_bndcsr *bndcsr), TP_ARGS(bndcsr), TP_STRUCT__entry( @@ -116,7 +116,8 @@ TRACE_EVENT(mpx_new_bounds_table, /* * This gets used outside of MPX-specific code, so we need a stub. */ -static inline void trace_bounds_exception_mpx(const struct bndcsr *bndcsr) +static inline +void trace_bounds_exception_mpx(const struct mpx_bndcsr *bndcsr) { } diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a8df874f3e88..a4a30e4b2d34 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -51,13 +51,13 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un * limit, not add it to the address). */ if (__builtin_constant_p(size)) - return addr > limit - size; + return unlikely(addr > limit - size); /* Arbitrary sizes? Be careful about overflow */ addr += size; - if (addr < size) + if (unlikely(addr < size)) return true; - return addr > limit; + return unlikely(addr > limit); } #define __range_not_ok(addr, size, limit) \ @@ -134,6 +134,9 @@ extern int __get_user_4(void); extern int __get_user_8(void); extern int __get_user_bad(void); +#define __uaccess_begin() stac() +#define __uaccess_end() clac() + /* * This is a type: either unsigned long, if the argument fits into * that type, or otherwise unsigned long long. @@ -182,7 +185,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) : "=a" (__ret_gu), "=r" (__val_gu) \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ - __ret_gu; \ + __builtin_expect(__ret_gu, 0); \ }) #define __put_user_x(size, x, ptr, __ret_pu) \ @@ -193,10 +196,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) #ifdef CONFIG_X86_32 #define __put_user_asm_u64(x, addr, err, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: movl %%eax,0(%2)\n" \ "2: movl %%edx,4(%2)\n" \ - "3: " ASM_CLAC "\n" \ + "3:" \ ".section .fixup,\"ax\"\n" \ "4: movl %3,%0\n" \ " jmp 3b\n" \ @@ -207,10 +210,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) : "A" (x), "r" (addr), "i" (errret), "0" (err)) #define __put_user_asm_ex_u64(x, addr) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ - "3: " ASM_CLAC "\n" \ + "3:" \ _ASM_EXTABLE_EX(1b, 2b) \ _ASM_EXTABLE_EX(2b, 3b) \ : : "A" (x), "r" (addr)) @@ -278,7 +281,7 @@ extern void __put_user_8(void); __put_user_x(X, __pu_val, ptr, __ret_pu); \ break; \ } \ - __ret_pu; \ + __builtin_expect(__ret_pu, 0); \ }) #define __put_user_size(x, ptr, size, retval, errret) \ @@ -304,6 +307,10 @@ do { \ } \ } while (0) +/* + * This doesn't do __uaccess_begin/end - the exception handling + * around it must do that. + */ #define __put_user_size_ex(x, ptr, size) \ do { \ __chk_user_ptr(ptr); \ @@ -358,9 +365,9 @@ do { \ } while (0) #define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: mov"itype" %2,%"rtype"1\n" \ - "2: " ASM_CLAC "\n" \ + "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " xor"itype" %"rtype"1,%"rtype"1\n" \ @@ -370,6 +377,10 @@ do { \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) +/* + * This doesn't do __uaccess_begin/end - the exception handling + * around it must do that. + */ #define __get_user_size_ex(x, ptr, size) \ do { \ __chk_user_ptr(ptr); \ @@ -400,17 +411,21 @@ do { \ #define __put_user_nocheck(x, ptr, size) \ ({ \ int __pu_err; \ + __uaccess_begin(); \ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ - __pu_err; \ + __uaccess_end(); \ + __builtin_expect(__pu_err, 0); \ }) #define __get_user_nocheck(x, ptr, size) \ ({ \ int __gu_err; \ unsigned long __gu_val; \ + __uaccess_begin(); \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ + __uaccess_end(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - __gu_err; \ + __builtin_expect(__gu_err, 0); \ }) /* FIXME: this hack is definitely wrong -AK */ @@ -423,9 +438,9 @@ struct __large_struct { unsigned long buf[100]; }; * aliasing issues. */ #define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: mov"itype" %"rtype"1,%2\n" \ - "2: " ASM_CLAC "\n" \ + "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " jmp 2b\n" \ @@ -445,11 +460,11 @@ struct __large_struct { unsigned long buf[100]; }; */ #define uaccess_try do { \ current_thread_info()->uaccess_err = 0; \ - stac(); \ + __uaccess_begin(); \ barrier(); #define uaccess_catch(err) \ - clac(); \ + __uaccess_end(); \ (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \ } while (0) @@ -547,12 +562,13 @@ extern void __cmpxchg_wrong_size(void) __typeof__(ptr) __uval = (uval); \ __typeof__(*(ptr)) __old = (old); \ __typeof__(*(ptr)) __new = (new); \ + __uaccess_begin(); \ switch (size) { \ case 1: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -566,9 +582,9 @@ extern void __cmpxchg_wrong_size(void) } \ case 2: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -582,9 +598,9 @@ extern void __cmpxchg_wrong_size(void) } \ case 4: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -601,9 +617,9 @@ extern void __cmpxchg_wrong_size(void) if (!IS_ENABLED(CONFIG_X86_64)) \ __cmpxchg_wrong_size(); \ \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -618,6 +634,7 @@ extern void __cmpxchg_wrong_size(void) default: \ __cmpxchg_wrong_size(); \ } \ + __uaccess_end(); \ *__uval = __old; \ __ret; \ }) @@ -745,5 +762,39 @@ copy_to_user(void __user *to, const void *from, unsigned long n) #undef __copy_from_user_overflow #undef __copy_to_user_overflow +/* + * We rely on the nested NMI work to allow atomic faults from the NMI path; the + * nested NMI paths are careful to preserve CR2. + * + * Caller must use pagefault_enable/disable, or run in interrupt context, + * and also do a uaccess_ok() check + */ +#define __copy_from_user_nmi __copy_from_user_inatomic + +/* + * The "unsafe" user accesses aren't really "unsafe", but the naming + * is a big fat warning: you have to not only do the access_ok() + * checking before using them, but you have to surround them with the + * user_access_begin/end() pair. + */ +#define user_access_begin() __uaccess_begin() +#define user_access_end() __uaccess_end() + +#define unsafe_put_user(x, ptr) \ +({ \ + int __pu_err; \ + __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ + __builtin_expect(__pu_err, 0); \ +}) + +#define unsafe_get_user(x, ptr) \ +({ \ + int __gu_err; \ + unsigned long __gu_val; \ + __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ + __builtin_expect(__gu_err, 0); \ +}) + #endif /* _ASM_X86_UACCESS_H */ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index f2f9b39b274a..b89c34c4019b 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -56,35 +56,49 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { - case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src, + case 1: + __uaccess_begin(); + __get_user_asm(*(u8 *)dst, (u8 __user *)src, ret, "b", "b", "=q", 1); + __uaccess_end(); return ret; - case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src, + case 2: + __uaccess_begin(); + __get_user_asm(*(u16 *)dst, (u16 __user *)src, ret, "w", "w", "=r", 2); + __uaccess_end(); return ret; - case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src, + case 4: + __uaccess_begin(); + __get_user_asm(*(u32 *)dst, (u32 __user *)src, ret, "l", "k", "=r", 4); + __uaccess_end(); return ret; - case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src, + case 8: + __uaccess_begin(); + __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 8); + __uaccess_end(); return ret; case 10: + __uaccess_begin(); __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 10); - if (unlikely(ret)) - return ret; - __get_user_asm(*(u16 *)(8 + (char *)dst), - (u16 __user *)(8 + (char __user *)src), - ret, "w", "w", "=r", 2); + if (likely(!ret)) + __get_user_asm(*(u16 *)(8 + (char *)dst), + (u16 __user *)(8 + (char __user *)src), + ret, "w", "w", "=r", 2); + __uaccess_end(); return ret; case 16: + __uaccess_begin(); __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 16); - if (unlikely(ret)) - return ret; - __get_user_asm(*(u64 *)(8 + (char *)dst), - (u64 __user *)(8 + (char __user *)src), - ret, "q", "", "=r", 8); + if (likely(!ret)) + __get_user_asm(*(u64 *)(8 + (char *)dst), + (u64 __user *)(8 + (char __user *)src), + ret, "q", "", "=r", 8); + __uaccess_end(); return ret; default: return copy_user_generic(dst, (__force void *)src, size); @@ -106,35 +120,51 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { - case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst, + case 1: + __uaccess_begin(); + __put_user_asm(*(u8 *)src, (u8 __user *)dst, ret, "b", "b", "iq", 1); + __uaccess_end(); return ret; - case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst, + case 2: + __uaccess_begin(); + __put_user_asm(*(u16 *)src, (u16 __user *)dst, ret, "w", "w", "ir", 2); + __uaccess_end(); return ret; - case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst, + case 4: + __uaccess_begin(); + __put_user_asm(*(u32 *)src, (u32 __user *)dst, ret, "l", "k", "ir", 4); + __uaccess_end(); return ret; - case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, + case 8: + __uaccess_begin(); + __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 8); + __uaccess_end(); return ret; case 10: + __uaccess_begin(); __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 10); - if (unlikely(ret)) - return ret; - asm("":::"memory"); - __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, - ret, "w", "w", "ir", 2); + if (likely(!ret)) { + asm("":::"memory"); + __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, + ret, "w", "w", "ir", 2); + } + __uaccess_end(); return ret; case 16: + __uaccess_begin(); __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 16); - if (unlikely(ret)) - return ret; - asm("":::"memory"); - __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, - ret, "q", "", "er", 8); + if (likely(!ret)) { + asm("":::"memory"); + __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, + ret, "q", "", "er", 8); + } + __uaccess_end(); return ret; default: return copy_user_generic((__force void *)dst, src, size); @@ -160,39 +190,47 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) switch (size) { case 1: { u8 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u8 __user *)src, ret, "b", "b", "=q", 1); if (likely(!ret)) __put_user_asm(tmp, (u8 __user *)dst, ret, "b", "b", "iq", 1); + __uaccess_end(); return ret; } case 2: { u16 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u16 __user *)src, ret, "w", "w", "=r", 2); if (likely(!ret)) __put_user_asm(tmp, (u16 __user *)dst, ret, "w", "w", "ir", 2); + __uaccess_end(); return ret; } case 4: { u32 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u32 __user *)src, ret, "l", "k", "=r", 4); if (likely(!ret)) __put_user_asm(tmp, (u32 __user *)dst, ret, "l", "k", "ir", 4); + __uaccess_end(); return ret; } case 8: { u64 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u64 __user *)src, ret, "q", "", "=r", 8); if (likely(!ret)) __put_user_asm(tmp, (u64 __user *)dst, ret, "q", "", "er", 8); + __uaccess_end(); return ret; } default: diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index a00ad8f2a657..ea7074784cc4 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -609,7 +609,7 @@ struct uv_cpu_nmi_s { DECLARE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi); -#define uv_hub_nmi (uv_cpu_nmi.hub) +#define uv_hub_nmi this_cpu_read(uv_cpu_nmi.hub) #define uv_cpu_nmi_per(cpu) (per_cpu(uv_cpu_nmi, cpu)) #define uv_hub_nmi_per(cpu) (uv_cpu_nmi_per(cpu).hub) diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 8021bd28c0f1..deabaf9759b6 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -22,11 +22,12 @@ struct vdso_image { long sym_vvar_page; long sym_hpet_page; + long sym_pvclock_page; long sym_VDSO32_NOTE_MASK; long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; long sym___kernel_vsyscall; - long sym_VDSO32_SYSENTER_RETURN; + long sym_int80_landing_pad; }; #ifdef CONFIG_X86_64 @@ -38,13 +39,7 @@ extern const struct vdso_image vdso_image_x32; #endif #if defined CONFIG_X86_32 || defined CONFIG_COMPAT -extern const struct vdso_image vdso_image_32_int80; -#ifdef CONFIG_COMPAT -extern const struct vdso_image vdso_image_32_syscall; -#endif -extern const struct vdso_image vdso_image_32_sysenter; - -extern const struct vdso_image *selected_vdso32; +extern const struct vdso_image vdso_image_32; #endif extern void __init init_vdso_image(const struct vdso_image *image); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 448b7ca61aee..14c63c7e8337 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -72,7 +72,8 @@ #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 - +#define SECONDARY_EXEC_PCOMMIT 0x00200000 +#define SECONDARY_EXEC_TSC_SCALING 0x02000000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 @@ -167,6 +168,8 @@ enum vmcs_field { VMWRITE_BITMAP = 0x00002028, XSS_EXIT_BITMAP = 0x0000202C, XSS_EXIT_BITMAP_HIGH = 0x0000202D, + TSC_MULTIPLIER = 0x00002032, + TSC_MULTIPLIER_HIGH = 0x00002033, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, @@ -416,6 +419,7 @@ enum vmcs_field { #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) +#define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */ #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 48d34d28f5a6..1ae89a2721d6 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H -#include <asm/pgtable_types.h> #include <asm/bootparam.h> struct mpc_bus; @@ -83,13 +82,11 @@ struct x86_init_paging { * struct x86_init_timers - platform specific timer setup * @setup_perpcu_clockev: set up the per cpu clock event device for the * boot cpu - * @tsc_pre_init: platform function called before TSC init * @timer_init: initialize the platform timer (default PIT/HPET) * @wallclock_init: init the wallclock device */ struct x86_init_timers { void (*setup_percpu_clockev)(void); - void (*tsc_pre_init)(void); void (*timer_init)(void); void (*wallclock_init)(void); }; diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 4c20dd333412..3bcdcc84259d 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -310,10 +310,10 @@ HYPERVISOR_mca(struct xen_mc *mc_op) } static inline int -HYPERVISOR_dom0_op(struct xen_platform_op *platform_op) +HYPERVISOR_platform_op(struct xen_platform_op *op) { - platform_op->interface_version = XENPF_INTERFACE_VERSION; - return _hypercall1(int, dom0_op, platform_op); + op->interface_version = XENPF_INTERFACE_VERSION; + return _hypercall1(int, platform_op, op); } static inline int diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index d866959e5685..8b2d4bea9962 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -57,4 +57,9 @@ static inline bool xen_x2apic_para_available(void) } #endif +#ifdef CONFIG_HOTPLUG_CPU +void xen_arch_register_cpu(int num); +void xen_arch_unregister_cpu(int num); +#endif + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 0679e11d2cf7..f5fb840b43e8 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -12,7 +12,7 @@ #include <asm/pgtable.h> #include <xen/interface/xen.h> -#include <xen/grant_table.h> +#include <xen/interface/grant_table.h> #include <xen/features.h> /* Xen machine address */ @@ -43,6 +43,8 @@ extern unsigned long *xen_p2m_addr; extern unsigned long xen_p2m_size; extern unsigned long xen_max_p2m_pfn; +extern int xen_alloc_p2m_entry(unsigned long pfn); + extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); @@ -296,8 +298,8 @@ void make_lowmem_page_readwrite(void *vaddr); #define xen_unmap(cookie) iounmap((cookie)) static inline bool xen_arch_need_swiotlb(struct device *dev, - unsigned long pfn, - unsigned long bfn) + phys_addr_t phys, + dma_addr_t dev_addr) { return false; } diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index 5a08bc8bff33..c54beb44c4c1 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -553,7 +553,7 @@ do { \ if (cpu_has_xmm) { \ xor_speed(&xor_block_pIII_sse); \ xor_speed(&xor_block_sse_pf64); \ - } else if (cpu_has_mmx) { \ + } else if (boot_cpu_has(X86_FEATURE_MMX)) { \ xor_speed(&xor_block_pII_mmx); \ xor_speed(&xor_block_p5_mmx); \ } else { \ diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index f0412c50c47b..7956412d09bd 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -153,6 +153,12 @@ /* MSR used to provide vcpu index */ #define HV_X64_MSR_VP_INDEX 0x40000002 +/* MSR used to reset the guest OS. */ +#define HV_X64_MSR_RESET 0x40000003 + +/* MSR used to provide vcpu runtime in 100ns units */ +#define HV_X64_MSR_VP_RUNTIME 0x40000010 + /* MSR used to read the per-partition time reference counter */ #define HV_X64_MSR_TIME_REF_COUNT 0x40000020 @@ -251,4 +257,108 @@ typedef struct _HV_REFERENCE_TSC_PAGE { __s64 tsc_offset; } HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; +/* Define the number of synthetic interrupt sources. */ +#define HV_SYNIC_SINT_COUNT (16) +/* Define the expected SynIC version. */ +#define HV_SYNIC_VERSION_1 (0x1) + +#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) +#define HV_SYNIC_SIMP_ENABLE (1ULL << 0) +#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0) +#define HV_SYNIC_SINT_MASKED (1ULL << 16) +#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) +#define HV_SYNIC_SINT_VECTOR_MASK (0xFF) + +#define HV_SYNIC_STIMER_COUNT (4) + +/* Define synthetic interrupt controller message constants. */ +#define HV_MESSAGE_SIZE (256) +#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) +#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30) + +/* Define hypervisor message types. */ +enum hv_message_type { + HVMSG_NONE = 0x00000000, + + /* Memory access messages. */ + HVMSG_UNMAPPED_GPA = 0x80000000, + HVMSG_GPA_INTERCEPT = 0x80000001, + + /* Timer notification messages. */ + HVMSG_TIMER_EXPIRED = 0x80000010, + + /* Error messages. */ + HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020, + HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021, + HVMSG_UNSUPPORTED_FEATURE = 0x80000022, + + /* Trace buffer complete messages. */ + HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040, + + /* Platform-specific processor intercept messages. */ + HVMSG_X64_IOPORT_INTERCEPT = 0x80010000, + HVMSG_X64_MSR_INTERCEPT = 0x80010001, + HVMSG_X64_CPUID_INTERCEPT = 0x80010002, + HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003, + HVMSG_X64_APIC_EOI = 0x80010004, + HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 +}; + +/* Define synthetic interrupt controller message flags. */ +union hv_message_flags { + __u8 asu8; + struct { + __u8 msg_pending:1; + __u8 reserved:7; + }; +}; + +/* Define port identifier type. */ +union hv_port_id { + __u32 asu32; + struct { + __u32 id:24; + __u32 reserved:8; + } u; +}; + +/* Define synthetic interrupt controller message header. */ +struct hv_message_header { + __u32 message_type; + __u8 payload_size; + union hv_message_flags message_flags; + __u8 reserved[2]; + union { + __u64 sender; + union hv_port_id port; + }; +}; + +/* Define synthetic interrupt controller message format. */ +struct hv_message { + struct hv_message_header header; + union { + __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; + } u; +}; + +/* Define the synthetic interrupt message page layout. */ +struct hv_message_page { + struct hv_message sint_message[HV_SYNIC_SINT_COUNT]; +}; + +/* Define timer message payload structure. */ +struct hv_timer_message_payload { + __u32 timer_index; + __u32 reserved; + __u64 expiration_time; /* When the timer expired */ + __u64 delivery_time; /* When the message was delivered */ +}; + +#define HV_STIMER_ENABLE (1ULL << 0) +#define HV_STIMER_PERIODIC (1ULL << 1) +#define HV_STIMER_LAZY (1ULL << 2) +#define HV_STIMER_AUTOENABLE (1ULL << 3) +#define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F) + #endif diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index 76880ede9a35..2184943341bf 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -2,7 +2,7 @@ #define _UAPI_ASM_X86_MCE_H #include <linux/types.h> -#include <asm/ioctls.h> +#include <linux/ioctl.h> /* Fields are zero when not available */ struct mce { @@ -16,7 +16,7 @@ struct mce { __u8 cpuvendor; /* cpu vendor as encoded in system.h */ __u8 inject_flags; /* software inject flags */ __u8 severity; - __u8 usable_addr; + __u8 pad; __u32 cpuid; /* CPUID 1 EAX */ __u8 cs; /* code segment */ __u8 bank; /* machine check bank */ diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 40836a9a7250..d485232f1e9f 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -1,221 +1,360 @@ #ifndef _UAPI_ASM_X86_SIGCONTEXT_H #define _UAPI_ASM_X86_SIGCONTEXT_H +/* + * Linux signal context definitions. The sigcontext includes a complex + * hierarchy of CPU and FPU state, available to user-space (on the stack) when + * a signal handler is executed. + * + * As over the years this ABI grew from its very simple roots towards + * supporting more and more CPU state organically, some of the details (which + * were rather clever hacks back in the days) became a bit quirky by today. + * + * The current ABI includes flexible provisions for future extensions, so we + * won't have to grow new quirks for quite some time. Promise! + */ + #include <linux/compiler.h> #include <linux/types.h> -#define FP_XSTATE_MAGIC1 0x46505853U -#define FP_XSTATE_MAGIC2 0x46505845U -#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2) +#define FP_XSTATE_MAGIC1 0x46505853U +#define FP_XSTATE_MAGIC2 0x46505845U +#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2) /* - * bytes 464..511 in the current 512byte layout of fxsave/fxrstor frame - * are reserved for SW usage. On cpu's supporting xsave/xrstor, these bytes - * are used to extended the fpstate pointer in the sigcontext, which now - * includes the extended state information along with fpstate information. + * Bytes 464..511 in the current 512-byte layout of the FXSAVE/FXRSTOR frame + * are reserved for SW usage. On CPUs supporting XSAVE/XRSTOR, these bytes are + * used to extend the fpstate pointer in the sigcontext, which now includes the + * extended state information along with fpstate information. + * + * If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then there's a + * sw_reserved.extended_size bytes large extended context area present. (The + * last 32-bit word of this extended area (at the + * fpstate+extended_size-FP_XSTATE_MAGIC2_SIZE address) is set to + * FP_XSTATE_MAGIC2 so that you can sanity check your size calculations.) * - * Presence of FP_XSTATE_MAGIC1 at the beginning of this SW reserved - * area and FP_XSTATE_MAGIC2 at the end of memory layout - * (extended_size - FP_XSTATE_MAGIC2_SIZE) indicates the presence of the - * extended state information in the memory layout pointed by the fpstate - * pointer in sigcontext. + * This extended area typically grows with newer CPUs that have larger and + * larger XSAVE areas. */ struct _fpx_sw_bytes { - __u32 magic1; /* FP_XSTATE_MAGIC1 */ - __u32 extended_size; /* total size of the layout referred by - * fpstate pointer in the sigcontext. - */ - __u64 xfeatures; - /* feature bit mask (including fp/sse/extended - * state) that is present in the memory - * layout. - */ - __u32 xstate_size; /* actual xsave state size, based on the - * features saved in the layout. - * 'extended_size' will be greater than - * 'xstate_size'. - */ - __u32 padding[7]; /* for future use. */ + /* + * If set to FP_XSTATE_MAGIC1 then this is an xstate context. + * 0 if a legacy frame. + */ + __u32 magic1; + + /* + * Total size of the fpstate area: + * + * - if magic1 == 0 then it's sizeof(struct _fpstate) + * - if magic1 == FP_XSTATE_MAGIC1 then it's sizeof(struct _xstate) + * plus extensions (if any) + */ + __u32 extended_size; + + /* + * Feature bit mask (including FP/SSE/extended state) that is present + * in the memory layout: + */ + __u64 xfeatures; + + /* + * Actual XSAVE state size, based on the xfeatures saved in the layout. + * 'extended_size' is greater than 'xstate_size': + */ + __u32 xstate_size; + + /* For future use: */ + __u32 padding[7]; }; -#ifdef __i386__ /* - * As documented in the iBCS2 standard.. - * - * The first part of "struct _fpstate" is just the normal i387 - * hardware setup, the extra "status" word is used to save the - * coprocessor status word before entering the handler. + * As documented in the iBCS2 standard: * - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@valinux.com>, May 2000 + * The first part of "struct _fpstate" is just the normal i387 hardware setup, + * the extra "status" word is used to save the coprocessor status word before + * entering the handler. * - * The FPU state data structure has had to grow to accommodate the - * extended FPU state required by the Streaming SIMD Extensions. - * There is no documented standard to accomplish this at the moment. + * The FPU state data structure has had to grow to accommodate the extended FPU + * state required by the Streaming SIMD Extensions. There is no documented + * standard to accomplish this at the moment. */ + +/* 10-byte legacy floating point register: */ struct _fpreg { - unsigned short significand[4]; - unsigned short exponent; + __u16 significand[4]; + __u16 exponent; }; +/* 16-byte floating point register: */ struct _fpxreg { - unsigned short significand[4]; - unsigned short exponent; - unsigned short padding[3]; + __u16 significand[4]; + __u16 exponent; + __u16 padding[3]; }; +/* 16-byte XMM register: */ struct _xmmreg { - unsigned long element[4]; + __u32 element[4]; }; -struct _fpstate { - /* Regular FPU environment */ - unsigned long cw; - unsigned long sw; - unsigned long tag; - unsigned long ipoff; - unsigned long cssel; - unsigned long dataoff; - unsigned long datasel; - struct _fpreg _st[8]; - unsigned short status; - unsigned short magic; /* 0xffff = regular FPU data only */ +#define X86_FXSR_MAGIC 0x0000 + +/* + * The 32-bit FPU frame: + */ +struct _fpstate_32 { + /* Legacy FPU environment: */ + __u32 cw; + __u32 sw; + __u32 tag; + __u32 ipoff; + __u32 cssel; + __u32 dataoff; + __u32 datasel; + struct _fpreg _st[8]; + __u16 status; + __u16 magic; /* 0xffff: regular FPU data only */ + /* 0x0000: FXSR FPU data */ /* FXSR FPU environment */ - unsigned long _fxsr_env[6]; /* FXSR FPU env is ignored */ - unsigned long mxcsr; - unsigned long reserved; - struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */ - struct _xmmreg _xmm[8]; - unsigned long padding1[44]; + __u32 _fxsr_env[6]; /* FXSR FPU env is ignored */ + __u32 mxcsr; + __u32 reserved; + struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */ + struct _xmmreg _xmm[8]; /* First 8 XMM registers */ + union { + __u32 padding1[44]; /* Second 8 XMM registers plus padding */ + __u32 padding[44]; /* Alias name for old user-space */ + }; union { - unsigned long padding2[12]; - struct _fpx_sw_bytes sw_reserved; /* represents the extended - * state info */ + __u32 padding2[12]; + struct _fpx_sw_bytes sw_reserved; /* Potential extended state is encoded here */ }; }; -#define X86_FXSR_MAGIC 0x0000 - -#ifndef __KERNEL__ /* - * User-space might still rely on the old definition: + * The 64-bit FPU frame. (FXSAVE format and later) + * + * Note1: If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then the structure is + * larger: 'struct _xstate'. Note that 'struct _xstate' embedds + * 'struct _fpstate' so that you can always assume the _fpstate portion + * exists so that you can check the magic value. + * + * Note2: Reserved fields may someday contain valuable data. Always + * save/restore them when you change signal frames. */ -struct sigcontext { - unsigned short gs, __gsh; - unsigned short fs, __fsh; - unsigned short es, __esh; - unsigned short ds, __dsh; - unsigned long edi; - unsigned long esi; - unsigned long ebp; - unsigned long esp; - unsigned long ebx; - unsigned long edx; - unsigned long ecx; - unsigned long eax; - unsigned long trapno; - unsigned long err; - unsigned long eip; - unsigned short cs, __csh; - unsigned long eflags; - unsigned long esp_at_signal; - unsigned short ss, __ssh; - struct _fpstate __user *fpstate; - unsigned long oldmask; - unsigned long cr2; -}; -#endif /* !__KERNEL__ */ - -#else /* __i386__ */ - -/* FXSAVE frame */ -/* Note: reserved1/2 may someday contain valuable data. Always save/restore - them when you change signal frames. */ -struct _fpstate { - __u16 cwd; - __u16 swd; - __u16 twd; /* Note this is not the same as the - 32bit/x87/FSAVE twd */ - __u16 fop; - __u64 rip; - __u64 rdp; - __u32 mxcsr; - __u32 mxcsr_mask; - __u32 st_space[32]; /* 8*16 bytes for each FP-reg */ - __u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg */ - __u32 reserved2[12]; +struct _fpstate_64 { + __u16 cwd; + __u16 swd; + /* Note this is not the same as the 32-bit/x87/FSAVE twd: */ + __u16 twd; + __u16 fop; + __u64 rip; + __u64 rdp; + __u32 mxcsr; + __u32 mxcsr_mask; + __u32 st_space[32]; /* 8x FP registers, 16 bytes each */ + __u32 xmm_space[64]; /* 16x XMM registers, 16 bytes each */ + __u32 reserved2[12]; union { - __u32 reserved3[12]; - struct _fpx_sw_bytes sw_reserved; /* represents the extended - * state information */ + __u32 reserved3[12]; + struct _fpx_sw_bytes sw_reserved; /* Potential extended state is encoded here */ }; }; -#ifndef __KERNEL__ -/* - * User-space might still rely on the old definition: - */ -struct sigcontext { - __u64 r8; - __u64 r9; - __u64 r10; - __u64 r11; - __u64 r12; - __u64 r13; - __u64 r14; - __u64 r15; - __u64 rdi; - __u64 rsi; - __u64 rbp; - __u64 rbx; - __u64 rdx; - __u64 rax; - __u64 rcx; - __u64 rsp; - __u64 rip; - __u64 eflags; /* RFLAGS */ - __u16 cs; - __u16 gs; - __u16 fs; - __u16 __pad0; - __u64 err; - __u64 trapno; - __u64 oldmask; - __u64 cr2; - struct _fpstate __user *fpstate; /* zero when no FPU context */ -#ifdef __ILP32__ - __u32 __fpstate_pad; +#ifdef __i386__ +# define _fpstate _fpstate_32 +#else +# define _fpstate _fpstate_64 #endif - __u64 reserved1[8]; -}; -#endif /* !__KERNEL__ */ - -#endif /* !__i386__ */ struct _header { - __u64 xfeatures; - __u64 reserved1[2]; - __u64 reserved2[5]; + __u64 xfeatures; + __u64 reserved1[2]; + __u64 reserved2[5]; }; struct _ymmh_state { - /* 16 * 16 bytes for each YMMH-reg */ - __u32 ymmh_space[64]; + /* 16x YMM registers, 16 bytes each: */ + __u32 ymmh_space[64]; }; /* - * Extended state pointed by the fpstate pointer in the sigcontext. - * In addition to the fpstate, information encoded in the xstate_hdr - * indicates the presence of other extended state information - * supported by the processor and OS. + * Extended state pointed to by sigcontext::fpstate. + * + * In addition to the fpstate, information encoded in _xstate::xstate_hdr + * indicates the presence of other extended state information supported + * by the CPU and kernel: */ struct _xstate { - struct _fpstate fpstate; - struct _header xstate_hdr; - struct _ymmh_state ymmh; - /* new processor state extensions go here */ + struct _fpstate fpstate; + struct _header xstate_hdr; + struct _ymmh_state ymmh; + /* New processor state extensions go here: */ +}; + +/* + * The 32-bit signal frame: + */ +struct sigcontext_32 { + __u16 gs, __gsh; + __u16 fs, __fsh; + __u16 es, __esh; + __u16 ds, __dsh; + __u32 di; + __u32 si; + __u32 bp; + __u32 sp; + __u32 bx; + __u32 dx; + __u32 cx; + __u32 ax; + __u32 trapno; + __u32 err; + __u32 ip; + __u16 cs, __csh; + __u32 flags; + __u32 sp_at_signal; + __u16 ss, __ssh; + + /* + * fpstate is really (struct _fpstate *) or (struct _xstate *) + * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved + * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end + * of extended memory layout. See comments at the definition of + * (struct _fpx_sw_bytes) + */ + __u32 fpstate; /* Zero when no FPU/extended context */ + __u32 oldmask; + __u32 cr2; +}; + +/* + * The 64-bit signal frame: + */ +struct sigcontext_64 { + __u64 r8; + __u64 r9; + __u64 r10; + __u64 r11; + __u64 r12; + __u64 r13; + __u64 r14; + __u64 r15; + __u64 di; + __u64 si; + __u64 bp; + __u64 bx; + __u64 dx; + __u64 ax; + __u64 cx; + __u64 sp; + __u64 ip; + __u64 flags; + __u16 cs; + __u16 gs; + __u16 fs; + __u16 __pad0; + __u64 err; + __u64 trapno; + __u64 oldmask; + __u64 cr2; + + /* + * fpstate is really (struct _fpstate *) or (struct _xstate *) + * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved + * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end + * of extended memory layout. See comments at the definition of + * (struct _fpx_sw_bytes) + */ + __u64 fpstate; /* Zero when no FPU/extended context */ + __u64 reserved1[8]; +}; + +/* + * Create the real 'struct sigcontext' type: + */ +#ifdef __KERNEL__ +# ifdef __i386__ +# define sigcontext sigcontext_32 +# else +# define sigcontext sigcontext_64 +# endif +#endif + +/* + * The old user-space sigcontext definition, just in case user-space still + * relies on it. The kernel definition (in asm/sigcontext.h) has unified + * field names but otherwise the same layout. + */ +#ifndef __KERNEL__ + +#define _fpstate_ia32 _fpstate_32 +#define sigcontext_ia32 sigcontext_32 + + +# ifdef __i386__ +struct sigcontext { + __u16 gs, __gsh; + __u16 fs, __fsh; + __u16 es, __esh; + __u16 ds, __dsh; + __u32 edi; + __u32 esi; + __u32 ebp; + __u32 esp; + __u32 ebx; + __u32 edx; + __u32 ecx; + __u32 eax; + __u32 trapno; + __u32 err; + __u32 eip; + __u16 cs, __csh; + __u32 eflags; + __u32 esp_at_signal; + __u16 ss, __ssh; + struct _fpstate __user *fpstate; + __u32 oldmask; + __u32 cr2; }; +# else /* __x86_64__: */ +struct sigcontext { + __u64 r8; + __u64 r9; + __u64 r10; + __u64 r11; + __u64 r12; + __u64 r13; + __u64 r14; + __u64 r15; + __u64 rdi; + __u64 rsi; + __u64 rbp; + __u64 rbx; + __u64 rdx; + __u64 rax; + __u64 rcx; + __u64 rsp; + __u64 rip; + __u64 eflags; /* RFLAGS */ + __u16 cs; + __u16 gs; + __u16 fs; + __u16 __pad0; + __u64 err; + __u64 trapno; + __u64 oldmask; + __u64 cr2; + struct _fpstate __user *fpstate; /* Zero when no FPU context */ +# ifdef __ILP32__ + __u32 __fpstate_pad; +# endif + __u64 reserved1[8]; +}; +# endif /* __x86_64__ */ +#endif /* !__KERNEL__ */ #endif /* _UAPI_ASM_X86_SIGCONTEXT_H */ diff --git a/arch/x86/include/uapi/asm/sigcontext32.h b/arch/x86/include/uapi/asm/sigcontext32.h index ad1478c4ae12..a92b0f0dc09e 100644 --- a/arch/x86/include/uapi/asm/sigcontext32.h +++ b/arch/x86/include/uapi/asm/sigcontext32.h @@ -1,77 +1,8 @@ #ifndef _ASM_X86_SIGCONTEXT32_H #define _ASM_X86_SIGCONTEXT32_H -#include <linux/types.h> +/* This is a legacy file - all the type definitions are in sigcontext.h: */ -/* signal context for 32bit programs. */ - -#define X86_FXSR_MAGIC 0x0000 - -struct _fpreg { - unsigned short significand[4]; - unsigned short exponent; -}; - -struct _fpxreg { - unsigned short significand[4]; - unsigned short exponent; - unsigned short padding[3]; -}; - -struct _xmmreg { - __u32 element[4]; -}; - -/* FSAVE frame with extensions */ -struct _fpstate_ia32 { - /* Regular FPU environment */ - __u32 cw; - __u32 sw; - __u32 tag; /* not compatible to 64bit twd */ - __u32 ipoff; - __u32 cssel; - __u32 dataoff; - __u32 datasel; - struct _fpreg _st[8]; - unsigned short status; - unsigned short magic; /* 0xffff = regular FPU data only */ - - /* FXSR FPU environment */ - __u32 _fxsr_env[6]; - __u32 mxcsr; - __u32 reserved; - struct _fpxreg _fxsr_st[8]; - struct _xmmreg _xmm[8]; /* It's actually 16 */ - __u32 padding[44]; - union { - __u32 padding2[12]; - struct _fpx_sw_bytes sw_reserved; - }; -}; - -struct sigcontext_ia32 { - unsigned short gs, __gsh; - unsigned short fs, __fsh; - unsigned short es, __esh; - unsigned short ds, __dsh; - unsigned int di; - unsigned int si; - unsigned int bp; - unsigned int sp; - unsigned int bx; - unsigned int dx; - unsigned int cx; - unsigned int ax; - unsigned int trapno; - unsigned int err; - unsigned int ip; - unsigned short cs, __csh; - unsigned int flags; - unsigned int sp_at_signal; - unsigned short ss, __ssh; - unsigned int fpstate; /* really (struct _fpstate_ia32 *) */ - unsigned int oldmask; - unsigned int cr2; -}; +#include <asm/sigcontext.h> #endif /* _ASM_X86_SIGCONTEXT32_H */ diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index b5d7640abc5d..8a4add8e4639 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -100,6 +100,7 @@ { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ + { SVM_EXIT_EXCP_BASE + AC_VECTOR, "AC excp" }, \ { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ { SVM_EXIT_INTR, "interrupt" }, \ { SVM_EXIT_NMI, "nmi" }, \ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 37fee272618f..5b15d94a33f8 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -78,6 +78,7 @@ #define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 +#define EXIT_REASON_PCOMMIT 65 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -126,7 +127,8 @@ { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ - { EXIT_REASON_XRSTORS, "XRSTORS" } + { EXIT_REASON_XRSTORS, "XRSTORS" }, \ + { EXIT_REASON_PCOMMIT, "PCOMMIT" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index ded848c20e05..e75907601a41 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -976,6 +976,8 @@ static int __init acpi_parse_madt_lapic_entries(void) { int count; int x2count = 0; + int ret; + struct acpi_subtable_proc madt_proc[2]; if (!cpu_has_apic) return -ENODEV; @@ -999,10 +1001,22 @@ static int __init acpi_parse_madt_lapic_entries(void) acpi_parse_sapic, MAX_LOCAL_APIC); if (!count) { - x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, - acpi_parse_x2apic, MAX_LOCAL_APIC); - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, - acpi_parse_lapic, MAX_LOCAL_APIC); + memset(madt_proc, 0, sizeof(madt_proc)); + madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC; + madt_proc[0].handler = acpi_parse_lapic; + madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC; + madt_proc[1].handler = acpi_parse_x2apic; + ret = acpi_table_parse_entries_array(ACPI_SIG_MADT, + sizeof(struct acpi_table_madt), + madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC); + if (ret < 0) { + printk(KERN_ERR PREFIX + "Error parsing LAPIC/X2APIC entries\n"); + return ret; + } + + x2count = madt_proc[0].count; + count = madt_proc[1].count; } if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 24e94ce454e2..8a5cddac7d44 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -82,6 +82,12 @@ physid_mask_t phys_cpu_present_map; static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID; /* + * This variable controls which CPUs receive external NMIs. By default, + * external NMIs are delivered only to the BSP. + */ +static int apic_extnmi = APIC_EXTNMI_BSP; + +/* * Map cpu index to physical APIC ID */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); @@ -1161,6 +1167,8 @@ void __init init_bsp_APIC(void) value = APIC_DM_NMI; if (!lapic_is_integrated()) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; + if (apic_extnmi == APIC_EXTNMI_NONE) + value |= APIC_LVT_MASKED; apic_write(APIC_LVT1, value); } @@ -1378,9 +1386,11 @@ void setup_local_APIC(void) apic_write(APIC_LVT0, value); /* - * only the BP should see the LINT1 NMI signal, obviously. + * Only the BSP sees the LINT1 NMI signal by default. This can be + * modified by apic_extnmi= boot option. */ - if (!cpu) + if ((!cpu && apic_extnmi != APIC_EXTNMI_NONE) || + apic_extnmi == APIC_EXTNMI_ALL) value = APIC_DM_NMI; else value = APIC_DM_NMI | APIC_LVT_MASKED; @@ -1431,7 +1441,7 @@ enum { }; static int x2apic_state; -static inline void __x2apic_disable(void) +static void __x2apic_disable(void) { u64 msr; @@ -1447,7 +1457,7 @@ static inline void __x2apic_disable(void) printk_once(KERN_INFO "x2apic disabled\n"); } -static inline void __x2apic_enable(void) +static void __x2apic_enable(void) { u64 msr; @@ -1807,7 +1817,7 @@ int apic_version[MAX_LOCAL_APIC]; /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -static inline void __smp_spurious_interrupt(u8 vector) +static void __smp_spurious_interrupt(u8 vector) { u32 v; @@ -1848,7 +1858,7 @@ __visible void smp_trace_spurious_interrupt(struct pt_regs *regs) /* * This interrupt should never happen with our APIC/SMP architecture */ -static inline void __smp_error_interrupt(struct pt_regs *regs) +static void __smp_error_interrupt(struct pt_regs *regs) { u32 v; u32 i = 0; @@ -2270,6 +2280,7 @@ static struct { unsigned int apic_tmict; unsigned int apic_tdcr; unsigned int apic_thmr; + unsigned int apic_cmci; } apic_pm_state; static int lapic_suspend(void) @@ -2299,6 +2310,10 @@ static int lapic_suspend(void) if (maxlvt >= 5) apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); #endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) + apic_pm_state.apic_cmci = apic_read(APIC_LVTCMCI); +#endif local_irq_save(flags); disable_local_APIC(); @@ -2355,10 +2370,14 @@ static void lapic_resume(void) apic_write(APIC_SPIV, apic_pm_state.apic_spiv); apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#if defined(CONFIG_X86_MCE_INTEL) +#ifdef CONFIG_X86_THERMAL_VECTOR if (maxlvt >= 5) apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); #endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) + apic_write(APIC_LVTCMCI, apic_pm_state.apic_cmci); +#endif if (maxlvt >= 4) apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); @@ -2548,3 +2567,23 @@ static int __init apic_set_disabled_cpu_apicid(char *arg) return 0; } early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid); + +static int __init apic_set_extnmi(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strncmp("all", arg, 3)) + apic_extnmi = APIC_EXTNMI_ALL; + else if (!strncmp("none", arg, 4)) + apic_extnmi = APIC_EXTNMI_NONE; + else if (!strncmp("bsp", arg, 3)) + apic_extnmi = APIC_EXTNMI_BSP; + else { + pr_warn("Unknown external NMI delivery mode `%s' ignored\n", arg); + return -EINVAL; + } + + return 0; +} +early_param("apic_extnmi", apic_set_extnmi); diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index f92ab36979a2..9968f30cca3e 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -185,6 +185,7 @@ static struct apic apic_flat = { .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .send_IPI = default_send_IPI_single, .send_IPI_mask = flat_send_IPI_mask, .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, .send_IPI_allbutself = flat_send_IPI_allbutself, @@ -230,17 +231,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) -{ - default_send_IPI_mask_sequence_phys(cpumask, vector); -} - -static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask, - int vector) -{ - default_send_IPI_mask_allbutself_phys(cpumask, vector); -} - static void physflat_send_IPI_allbutself(int vector) { default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); @@ -248,7 +238,7 @@ static void physflat_send_IPI_allbutself(int vector) static void physflat_send_IPI_all(int vector) { - physflat_send_IPI_mask(cpu_online_mask, vector); + default_send_IPI_mask_sequence_phys(cpu_online_mask, vector); } static int physflat_probe(void) @@ -292,8 +282,9 @@ static struct apic apic_physflat = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, - .send_IPI_mask = physflat_send_IPI_mask, - .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, + .send_IPI = default_send_IPI_single_phys, + .send_IPI_mask = default_send_IPI_mask_sequence_phys, + .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys, .send_IPI_allbutself = physflat_send_IPI_allbutself, .send_IPI_all = physflat_send_IPI_all, .send_IPI_self = apic_send_IPI_self, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 0d96749cfcac..331a7a07c48f 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -30,6 +30,7 @@ #include <asm/e820.h> static void noop_init_apic_ldr(void) { } +static void noop_send_IPI(int cpu, int vector) { } static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { } static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { } static void noop_send_IPI_allbutself(int vector) { } @@ -144,6 +145,7 @@ struct apic apic_noop = { .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .send_IPI = noop_send_IPI, .send_IPI_mask = noop_send_IPI_mask, .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, .send_IPI_allbutself = noop_send_IPI_allbutself, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index b548fd3b764b..c80c02c6ec49 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -11,30 +11,21 @@ * */ -#include <linux/errno.h> -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/ctype.h> #include <linux/init.h> -#include <linux/hardirq.h> -#include <linux/delay.h> #include <asm/numachip/numachip.h> #include <asm/numachip/numachip_csr.h> -#include <asm/smp.h> -#include <asm/apic.h> #include <asm/ipi.h> #include <asm/apic_flat_64.h> #include <asm/pgtable.h> +#include <asm/pci_x86.h> -static int numachip_system __read_mostly; +u8 numachip_system __read_mostly; +static const struct apic apic_numachip1; +static const struct apic apic_numachip2; +static void (*numachip_apic_icr_write)(int apicid, unsigned int val) __read_mostly; -static const struct apic apic_numachip; - -static unsigned int get_apic_id(unsigned long x) +static unsigned int numachip1_get_apic_id(unsigned long x) { unsigned long value; unsigned int id = (x >> 24) & 0xff; @@ -47,7 +38,7 @@ static unsigned int get_apic_id(unsigned long x) return id; } -static unsigned long set_apic_id(unsigned int id) +static unsigned long numachip1_set_apic_id(unsigned int id) { unsigned long x; @@ -55,9 +46,17 @@ static unsigned long set_apic_id(unsigned int id) return x; } -static unsigned int read_xapic_id(void) +static unsigned int numachip2_get_apic_id(unsigned long x) +{ + u64 mcfg; + + rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, mcfg); + return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24); +} + +static unsigned long numachip2_set_apic_id(unsigned int id) { - return get_apic_id(apic_read(APIC_ID)); + return id << 24; } static int numachip_apic_id_valid(int apicid) @@ -68,7 +67,7 @@ static int numachip_apic_id_valid(int apicid) static int numachip_apic_id_registered(void) { - return physid_isset(read_xapic_id(), phys_cpu_present_map); + return 1; } static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) @@ -76,36 +75,48 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } -static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static void numachip1_apic_icr_write(int apicid, unsigned int val) { - union numachip_csr_g3_ext_irq_gen int_gen; - - int_gen.s._destination_apic_id = phys_apicid; - int_gen.s._vector = 0; - int_gen.s._msgtype = APIC_DM_INIT >> 8; - int_gen.s._index = 0; - - write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); + write_lcsr(CSR_G3_EXT_IRQ_GEN, (apicid << 16) | val); +} - int_gen.s._msgtype = APIC_DM_STARTUP >> 8; - int_gen.s._vector = start_rip >> 12; +static void numachip2_apic_icr_write(int apicid, unsigned int val) +{ + numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val); +} - write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); +static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +{ + numachip_apic_icr_write(phys_apicid, APIC_DM_INIT); + numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP | + (start_rip >> 12)); return 0; } static void numachip_send_IPI_one(int cpu, int vector) { - union numachip_csr_g3_ext_irq_gen int_gen; - int apicid = per_cpu(x86_cpu_to_apicid, cpu); - - int_gen.s._destination_apic_id = apicid; - int_gen.s._vector = vector; - int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8; - int_gen.s._index = 0; + int local_apicid, apicid = per_cpu(x86_cpu_to_apicid, cpu); + unsigned int dmode; + + preempt_disable(); + local_apicid = __this_cpu_read(x86_cpu_to_apicid); + + /* Send via local APIC where non-local part matches */ + if (!((apicid ^ local_apicid) >> NUMACHIP_LAPIC_BITS)) { + unsigned long flags; + + local_irq_save(flags); + __default_send_IPI_dest_field(apicid, vector, + APIC_DEST_PHYSICAL); + local_irq_restore(flags); + preempt_enable(); + return; + } + preempt_enable(); - write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); + dmode = (vector == NMI_VECTOR) ? APIC_DM_NMI : APIC_DM_FIXED; + numachip_apic_icr_write(apicid, dmode | vector); } static void numachip_send_IPI_mask(const struct cpumask *mask, int vector) @@ -149,9 +160,14 @@ static void numachip_send_IPI_self(int vector) apic_write(APIC_SELF_IPI, vector); } -static int __init numachip_probe(void) +static int __init numachip1_probe(void) { - return apic == &apic_numachip; + return apic == &apic_numachip1; +} + +static int __init numachip2_probe(void) +{ + return apic == &apic_numachip2; } static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) @@ -172,11 +188,19 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) static int __init numachip_system_init(void) { - if (!numachip_system) + /* Map the LCSR area and set up the apic_icr_write function */ + switch (numachip_system) { + case 1: + init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); + numachip_apic_icr_write = numachip1_apic_icr_write; + break; + case 2: + init_extra_mapping_uc(NUMACHIP2_LCSR_BASE, NUMACHIP2_LCSR_SIZE); + numachip_apic_icr_write = numachip2_apic_icr_write; + break; + default: return 0; - - init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); - init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE); + } x86_cpuinit.fixup_cpu_id = fixup_cpu_id; x86_init.pci.arch_init = pci_numachip_init; @@ -185,21 +209,95 @@ static int __init numachip_system_init(void) } early_initcall(numachip_system_init); -static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int numachip1_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - if (!strncmp(oem_id, "NUMASC", 6)) { - numachip_system = 1; - return 1; - } + if ((strncmp(oem_id, "NUMASC", 6) != 0) || + (strncmp(oem_table_id, "NCONNECT", 8) != 0)) + return 0; - return 0; + numachip_system = 1; + + return 1; +} + +static int numachip2_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if ((strncmp(oem_id, "NUMASC", 6) != 0) || + (strncmp(oem_table_id, "NCONECT2", 8) != 0)) + return 0; + + numachip_system = 2; + + return 1; +} + +/* APIC IPIs are queued */ +static void numachip_apic_wait_icr_idle(void) +{ } -static const struct apic apic_numachip __refconst = { +/* APIC NMI IPIs are queued */ +static u32 numachip_safe_apic_wait_icr_idle(void) +{ + return 0; +} +static const struct apic apic_numachip1 __refconst = { .name = "NumaConnect system", - .probe = numachip_probe, - .acpi_madt_oem_check = numachip_acpi_madt_oem_check, + .probe = numachip1_probe, + .acpi_madt_oem_check = numachip1_acpi_madt_oem_check, + .apic_id_valid = numachip_apic_id_valid, + .apic_id_registered = numachip_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + .irq_dest_mode = 0, /* physical */ + + .target_cpus = online_target_cpus, + .disable_esr = 0, + .dest_logical = 0, + .check_apicid_used = NULL, + + .vector_allocation_domain = default_vector_allocation_domain, + .init_apic_ldr = flat_init_apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .phys_pkg_id = numachip_phys_pkg_id, + + .get_apic_id = numachip1_get_apic_id, + .set_apic_id = numachip1_set_apic_id, + .apic_id_mask = 0xffU << 24, + + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + + .send_IPI = numachip_send_IPI_one, + .send_IPI_mask = numachip_send_IPI_mask, + .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, + .send_IPI_allbutself = numachip_send_IPI_allbutself, + .send_IPI_all = numachip_send_IPI_all, + .send_IPI_self = numachip_send_IPI_self, + + .wakeup_secondary_cpu = numachip_wakeup_secondary, + .inquire_remote_apic = NULL, /* REMRD not supported */ + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = numachip_apic_wait_icr_idle, + .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle, +}; + +apic_driver(apic_numachip1); + +static const struct apic apic_numachip2 __refconst = { + .name = "NumaConnect2 system", + .probe = numachip2_probe, + .acpi_madt_oem_check = numachip2_acpi_madt_oem_check, .apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, @@ -221,12 +319,13 @@ static const struct apic apic_numachip __refconst = { .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = numachip_phys_pkg_id, - .get_apic_id = get_apic_id, - .set_apic_id = set_apic_id, + .get_apic_id = numachip2_get_apic_id, + .set_apic_id = numachip2_set_apic_id, .apic_id_mask = 0xffU << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .send_IPI = numachip_send_IPI_one, .send_IPI_mask = numachip_send_IPI_mask, .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, .send_IPI_allbutself = numachip_send_IPI_allbutself, @@ -241,8 +340,8 @@ static const struct apic apic_numachip __refconst = { .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + .wait_icr_idle = numachip_apic_wait_icr_idle, + .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle, }; -apic_driver(apic_numachip); +apic_driver(apic_numachip2); diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 971cf8875939..cf9bd896c12d 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -96,11 +96,6 @@ static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) return cpuid_apic >> index_msb; } -static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector) -{ - default_send_IPI_mask_sequence_phys(mask, vector); -} - static void bigsmp_send_IPI_allbutself(int vector) { default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); @@ -108,7 +103,7 @@ static void bigsmp_send_IPI_allbutself(int vector) static void bigsmp_send_IPI_all(int vector) { - bigsmp_send_IPI_mask(cpu_online_mask, vector); + default_send_IPI_mask_sequence_phys(cpu_online_mask, vector); } static int dmi_bigsmp; /* can be set by dmi scanners */ @@ -180,7 +175,8 @@ static struct apic apic_bigsmp = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, - .send_IPI_mask = bigsmp_send_IPI_mask, + .send_IPI = default_send_IPI_single_phys, + .send_IPI_mask = default_send_IPI_mask_sequence_phys, .send_IPI_mask_allbutself = NULL, .send_IPI_allbutself = bigsmp_send_IPI_allbutself, .send_IPI_all = bigsmp_send_IPI_all, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4f2821527014..fdb0fbfb1197 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -529,7 +529,7 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) } } -void eoi_ioapic_pin(int vector, struct mp_chip_data *data) +static void eoi_ioapic_pin(int vector, struct mp_chip_data *data) { unsigned long flags; struct irq_pin_list *entry; @@ -2521,6 +2521,7 @@ void __init setup_ioapic_dest(void) { int pin, ioapic, irq, irq_entry; const struct cpumask *mask; + struct irq_desc *desc; struct irq_data *idata; struct irq_chip *chip; @@ -2536,7 +2537,9 @@ void __init setup_ioapic_dest(void) if (irq < 0 || !mp_init_irq_at_boot(ioapic, irq)) continue; - idata = irq_get_irq_data(irq); + desc = irq_to_desc(irq); + raw_spin_lock_irq(&desc->lock); + idata = irq_desc_get_irq_data(desc); /* * Honour affinities which have been set in early boot @@ -2550,6 +2553,7 @@ void __init setup_ioapic_dest(void) /* Might be lapic_chip for irq 0 */ if (chip->irq_set_affinity) chip->irq_set_affinity(idata, mask, false); + raw_spin_unlock_irq(&desc->lock); } } #endif diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 62071569bd50..eb45fc9b6124 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -18,6 +18,16 @@ #include <asm/proto.h> #include <asm/ipi.h> +void default_send_IPI_single_phys(int cpu, int vector) +{ + unsigned long flags; + + local_irq_save(flags); + __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, cpu), + vector, APIC_DEST_PHYSICAL); + local_irq_restore(flags); +} + void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) { unsigned long query_cpu; @@ -55,6 +65,14 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, local_irq_restore(flags); } +/* + * Helper function for APICs which insist on cpumasks + */ +void default_send_IPI_single(int cpu, int vector) +{ + apic->send_IPI_mask(cpumask_of(cpu), vector); +} + #ifdef CONFIG_X86_32 void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 5f1feb6854af..ade25320df96 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -96,8 +96,8 @@ static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info, return arg->msi_hwirq; } -static int pci_msi_prepare(struct irq_domain *domain, struct device *dev, - int nvec, msi_alloc_info_t *arg) +int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, + msi_alloc_info_t *arg) { struct pci_dev *pdev = to_pci_dev(dev); struct msi_desc *desc = first_pci_msi_entry(pdev); @@ -113,11 +113,13 @@ static int pci_msi_prepare(struct irq_domain *domain, struct device *dev, return 0; } +EXPORT_SYMBOL_GPL(pci_msi_prepare); -static void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) +void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) { arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc); } +EXPORT_SYMBOL_GPL(pci_msi_set_desc); static struct msi_domain_ops pci_msi_domain_ops = { .get_hwirq = pci_msi_get_hwirq, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 7694ae6c1199..f316e34abb42 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -105,6 +105,7 @@ static struct apic apic_default = { .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .send_IPI = default_send_IPI_single, .send_IPI_mask = default_send_IPI_mask_logical, .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical, .send_IPI_allbutself = default_send_IPI_allbutself, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 836d11b92811..3b670df4ba7b 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -29,8 +29,9 @@ struct apic_chip_data { }; struct irq_domain *x86_vector_domain; +EXPORT_SYMBOL_GPL(x86_vector_domain); static DEFINE_RAW_SPINLOCK(vector_lock); -static cpumask_var_t vector_cpumask; +static cpumask_var_t vector_cpumask, vector_searchmask, searched_cpumask; static struct irq_chip lapic_controller; #ifdef CONFIG_X86_IO_APIC static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY]; @@ -66,6 +67,7 @@ struct irq_cfg *irqd_cfg(struct irq_data *irq_data) return data ? &data->cfg : NULL; } +EXPORT_SYMBOL_GPL(irqd_cfg); struct irq_cfg *irq_cfg(unsigned int irq) { @@ -116,35 +118,47 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d, */ static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; static int current_offset = VECTOR_OFFSET_START % 16; - int cpu, err; + int cpu, vector; - if (d->move_in_progress) + /* + * If there is still a move in progress or the previous move has not + * been cleaned up completely, tell the caller to come back later. + */ + if (d->move_in_progress || + cpumask_intersects(d->old_domain, cpu_online_mask)) return -EBUSY; /* Only try and allocate irqs on cpus that are present */ - err = -ENOSPC; cpumask_clear(d->old_domain); + cpumask_clear(searched_cpumask); cpu = cpumask_first_and(mask, cpu_online_mask); while (cpu < nr_cpu_ids) { - int new_cpu, vector, offset; + int new_cpu, offset; + /* Get the possible target cpus for @mask/@cpu from the apic */ apic->vector_allocation_domain(cpu, vector_cpumask, mask); + /* + * Clear the offline cpus from @vector_cpumask for searching + * and verify whether the result overlaps with @mask. If true, + * then the call to apic->cpu_mask_to_apicid_and() will + * succeed as well. If not, no point in trying to find a + * vector in this mask. + */ + cpumask_and(vector_searchmask, vector_cpumask, cpu_online_mask); + if (!cpumask_intersects(vector_searchmask, mask)) + goto next_cpu; + if (cpumask_subset(vector_cpumask, d->domain)) { - err = 0; if (cpumask_equal(vector_cpumask, d->domain)) - break; + goto success; /* - * New cpumask using the vector is a proper subset of - * the current in use mask. So cleanup the vector - * allocation for the members that are not used anymore. + * Mark the cpus which are not longer in the mask for + * cleanup. */ - cpumask_andnot(d->old_domain, d->domain, - vector_cpumask); - d->move_in_progress = - cpumask_intersects(d->old_domain, cpu_online_mask); - cpumask_and(d->domain, d->domain, vector_cpumask); - break; + cpumask_andnot(d->old_domain, d->domain, vector_cpumask); + vector = d->cfg.vector; + goto update; } vector = current_vector; @@ -156,45 +170,60 @@ next: vector = FIRST_EXTERNAL_VECTOR + offset; } - if (unlikely(current_vector == vector)) { - cpumask_or(d->old_domain, d->old_domain, - vector_cpumask); - cpumask_andnot(vector_cpumask, mask, d->old_domain); - cpu = cpumask_first_and(vector_cpumask, - cpu_online_mask); - continue; - } + /* If the search wrapped around, try the next cpu */ + if (unlikely(current_vector == vector)) + goto next_cpu; if (test_bit(vector, used_vectors)) goto next; - for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) { + for_each_cpu(new_cpu, vector_searchmask) { if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector])) goto next; } /* Found one! */ current_vector = vector; current_offset = offset; - if (d->cfg.vector) { + /* Schedule the old vector for cleanup on all cpus */ + if (d->cfg.vector) cpumask_copy(d->old_domain, d->domain); - d->move_in_progress = - cpumask_intersects(d->old_domain, cpu_online_mask); - } - for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) + for_each_cpu(new_cpu, vector_searchmask) per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq); - d->cfg.vector = vector; - cpumask_copy(d->domain, vector_cpumask); - err = 0; - break; - } + goto update; - if (!err) { - /* cache destination APIC IDs into cfg->dest_apicid */ - err = apic->cpu_mask_to_apicid_and(mask, d->domain, - &d->cfg.dest_apicid); +next_cpu: + /* + * We exclude the current @vector_cpumask from the requested + * @mask and try again with the next online cpu in the + * result. We cannot modify @mask, so we use @vector_cpumask + * as a temporary buffer here as it will be reassigned when + * calling apic->vector_allocation_domain() above. + */ + cpumask_or(searched_cpumask, searched_cpumask, vector_cpumask); + cpumask_andnot(vector_cpumask, mask, searched_cpumask); + cpu = cpumask_first_and(vector_cpumask, cpu_online_mask); + continue; } + return -ENOSPC; - return err; +update: + /* + * Exclude offline cpus from the cleanup mask and set the + * move_in_progress flag when the result is not empty. + */ + cpumask_and(d->old_domain, d->old_domain, cpu_online_mask); + d->move_in_progress = !cpumask_empty(d->old_domain); + d->cfg.vector = vector; + cpumask_copy(d->domain, vector_cpumask); +success: + /* + * Cache destination APIC IDs into cfg->dest_apicid. This cannot fail + * as we already established, that mask & d->domain & cpu_online_mask + * is not empty. + */ + BUG_ON(apic->cpu_mask_to_apicid_and(mask, d->domain, + &d->cfg.dest_apicid)); + return 0; } static int assign_irq_vector(int irq, struct apic_chip_data *data, @@ -224,10 +253,8 @@ static int assign_irq_vector_policy(int irq, int node, static void clear_irq_vector(int irq, struct apic_chip_data *data) { struct irq_desc *desc; - unsigned long flags; int cpu, vector; - raw_spin_lock_irqsave(&vector_lock, flags); BUG_ON(!data->cfg.vector); vector = data->cfg.vector; @@ -237,10 +264,13 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data) data->cfg.vector = 0; cpumask_clear(data->domain); - if (likely(!data->move_in_progress)) { - raw_spin_unlock_irqrestore(&vector_lock, flags); + /* + * If move is in progress or the old_domain mask is not empty, + * i.e. the cleanup IPI has not been processed yet, we need to remove + * the old references to desc from all cpus vector tables. + */ + if (!data->move_in_progress && cpumask_empty(data->old_domain)) return; - } desc = irq_to_desc(irq); for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) { @@ -253,7 +283,6 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data) } } data->move_in_progress = 0; - raw_spin_unlock_irqrestore(&vector_lock, flags); } void init_irq_alloc_info(struct irq_alloc_info *info, @@ -274,19 +303,24 @@ void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) static void x86_vector_free_irqs(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { + struct apic_chip_data *apic_data; struct irq_data *irq_data; + unsigned long flags; int i; for (i = 0; i < nr_irqs; i++) { irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i); if (irq_data && irq_data->chip_data) { + raw_spin_lock_irqsave(&vector_lock, flags); clear_irq_vector(virq + i, irq_data->chip_data); - free_apic_chip_data(irq_data->chip_data); + apic_data = irq_data->chip_data; + irq_domain_reset_irq_data(irq_data); + raw_spin_unlock_irqrestore(&vector_lock, flags); + free_apic_chip_data(apic_data); #ifdef CONFIG_X86_IO_APIC if (virq + i < nr_legacy_irqs()) legacy_irq_data[virq + i] = NULL; #endif - irq_domain_reset_irq_data(irq_data); } } } @@ -361,7 +395,11 @@ int __init arch_probe_nr_irqs(void) if (nr < nr_irqs) nr_irqs = nr; - return nr_legacy_irqs(); + /* + * We don't know if PIC is present at this point so we need to do + * probe() to get the right number of legacy IRQs. + */ + return legacy_pic->probe(); } #ifdef CONFIG_X86_IO_APIC @@ -400,6 +438,8 @@ int __init arch_early_irq_init(void) arch_init_htirq_domain(x86_vector_domain); BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&searched_cpumask, GFP_KERNEL)); return arch_early_ioapic_init(); } @@ -488,14 +528,7 @@ static int apic_set_affinity(struct irq_data *irq_data, return -EINVAL; err = assign_irq_vector(irq, data, dest); - if (err) { - if (assign_irq_vector(irq, data, - irq_data_get_affinity_mask(irq_data))) - pr_err("Failed to recover vector for irq %d\n", irq); - return err; - } - - return IRQ_SET_MASK_OK; + return err ? err : IRQ_SET_MASK_OK; } static struct irq_chip lapic_controller = { @@ -507,20 +540,12 @@ static struct irq_chip lapic_controller = { #ifdef CONFIG_SMP static void __send_cleanup_vector(struct apic_chip_data *data) { - cpumask_var_t cleanup_mask; - - if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { - unsigned int i; - - for_each_cpu_and(i, data->old_domain, cpu_online_mask) - apic->send_IPI_mask(cpumask_of(i), - IRQ_MOVE_CLEANUP_VECTOR); - } else { - cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask); - apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - free_cpumask_var(cleanup_mask); - } + raw_spin_lock(&vector_lock); + cpumask_and(data->old_domain, data->old_domain, cpu_online_mask); data->move_in_progress = 0; + if (!cpumask_empty(data->old_domain)) + apic->send_IPI_mask(data->old_domain, IRQ_MOVE_CLEANUP_VECTOR); + raw_spin_unlock(&vector_lock); } void send_cleanup_vector(struct irq_cfg *cfg) @@ -564,12 +589,25 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) goto unlock; /* - * Check if the irq migration is in progress. If so, we - * haven't received the cleanup request yet for this irq. + * Nothing to cleanup if irq migration is in progress + * or this cpu is not set in the cleanup mask. */ - if (data->move_in_progress) + if (data->move_in_progress || + !cpumask_test_cpu(me, data->old_domain)) goto unlock; + /* + * We have two cases to handle here: + * 1) vector is unchanged but the target mask got reduced + * 2) vector and the target mask has changed + * + * #1 is obvious, but in #2 we have two vectors with the same + * irq descriptor: the old and the new vector. So we need to + * make sure that we only cleanup the old vector. The new + * vector has the current @vector number in the config and + * this cpu is part of the target mask. We better leave that + * one alone. + */ if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain)) goto unlock; @@ -587,6 +625,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) goto unlock; } __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); + cpumask_clear_cpu(me, data->old_domain); unlock: raw_spin_unlock(&desc->lock); } @@ -615,12 +654,48 @@ void irq_complete_move(struct irq_cfg *cfg) __irq_complete_move(cfg, ~get_irq_regs()->orig_ax); } -void irq_force_complete_move(int irq) +/* + * Called with @desc->lock held and interrupts disabled. + */ +void irq_force_complete_move(struct irq_desc *desc) { - struct irq_cfg *cfg = irq_cfg(irq); + struct irq_data *irqdata = irq_desc_get_irq_data(desc); + struct apic_chip_data *data = apic_chip_data(irqdata); + struct irq_cfg *cfg = data ? &data->cfg : NULL; - if (cfg) - __irq_complete_move(cfg, cfg->vector); + if (!cfg) + return; + + __irq_complete_move(cfg, cfg->vector); + + /* + * This is tricky. If the cleanup of @data->old_domain has not been + * done yet, then the following setaffinity call will fail with + * -EBUSY. This can leave the interrupt in a stale state. + * + * The cleanup cannot make progress because we hold @desc->lock. So in + * case @data->old_domain is not yet cleaned up, we need to drop the + * lock and acquire it again. @desc cannot go away, because the + * hotplug code holds the sparse irq lock. + */ + raw_spin_lock(&vector_lock); + /* Clean out all offline cpus (including ourself) first. */ + cpumask_and(data->old_domain, data->old_domain, cpu_online_mask); + while (!cpumask_empty(data->old_domain)) { + raw_spin_unlock(&vector_lock); + raw_spin_unlock(&desc->lock); + cpu_relax(); + raw_spin_lock(&desc->lock); + /* + * Reevaluate apic_chip_data. It might have been cleared after + * we dropped @desc->lock. + */ + data = apic_chip_data(irqdata); + if (!data) + return; + raw_spin_lock(&vector_lock); + } + raw_spin_unlock(&vector_lock); } #endif diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index cc8311c4d298..aca8b75c1552 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -23,6 +23,14 @@ static inline u32 x2apic_cluster(int cpu) return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16; } +static void x2apic_send_IPI(int cpu, int vector) +{ + u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu); + + x2apic_wrmsr_fence(); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); +} + static void __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) { @@ -266,6 +274,7 @@ static struct apic apic_x2apic_cluster = { .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, + .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, .send_IPI_allbutself = x2apic_send_IPI_allbutself, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 662e9150ea6f..a1242e2c12e6 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -36,6 +36,14 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys()); } +static void x2apic_send_IPI(int cpu, int vector) +{ + u32 dest = per_cpu(x86_cpu_to_apicid, cpu); + + x2apic_wrmsr_fence(); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL); +} + static void __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) { @@ -122,6 +130,7 @@ static struct apic apic_x2apic_phys = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, .send_IPI_allbutself = x2apic_send_IPI_allbutself, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 4a139465f1d4..624db00583f4 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -406,6 +406,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, + .send_IPI = uv_send_IPI_one, .send_IPI_mask = uv_send_IPI_mask, .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, .send_IPI_allbutself = uv_send_IPI_allbutself, @@ -888,7 +889,10 @@ void __init uv_system_init(void) return; } pr_info("UV: Found %s hub\n", hub); - map_low_mmrs(); + + /* We now only need to map the MMRs on UV1 */ + if (is_uv1_hub()) + map_low_mmrs(); m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); m_val = m_n_config.s.m_skt; diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 8e3d22a1af94..84a7524b202c 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -43,18 +43,15 @@ void common(void) { #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) BLANK(); - OFFSET(IA32_SIGCONTEXT_ax, sigcontext_ia32, ax); - OFFSET(IA32_SIGCONTEXT_bx, sigcontext_ia32, bx); - OFFSET(IA32_SIGCONTEXT_cx, sigcontext_ia32, cx); - OFFSET(IA32_SIGCONTEXT_dx, sigcontext_ia32, dx); - OFFSET(IA32_SIGCONTEXT_si, sigcontext_ia32, si); - OFFSET(IA32_SIGCONTEXT_di, sigcontext_ia32, di); - OFFSET(IA32_SIGCONTEXT_bp, sigcontext_ia32, bp); - OFFSET(IA32_SIGCONTEXT_sp, sigcontext_ia32, sp); - OFFSET(IA32_SIGCONTEXT_ip, sigcontext_ia32, ip); - - BLANK(); - OFFSET(TI_sysenter_return, thread_info, sysenter_return); + OFFSET(IA32_SIGCONTEXT_ax, sigcontext_32, ax); + OFFSET(IA32_SIGCONTEXT_bx, sigcontext_32, bx); + OFFSET(IA32_SIGCONTEXT_cx, sigcontext_32, cx); + OFFSET(IA32_SIGCONTEXT_dx, sigcontext_32, dx); + OFFSET(IA32_SIGCONTEXT_si, sigcontext_32, si); + OFFSET(IA32_SIGCONTEXT_di, sigcontext_32, di); + OFFSET(IA32_SIGCONTEXT_bp, sigcontext_32, bp); + OFFSET(IA32_SIGCONTEXT_sp, sigcontext_32, sp); + OFFSET(IA32_SIGCONTEXT_ip, sigcontext_32, ip); BLANK(); OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); @@ -68,9 +65,6 @@ void common(void) { OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); -#ifdef CONFIG_X86_32 - OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); -#endif OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); #endif diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index d8f42f902a0f..f2edafb5f24e 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -23,7 +23,6 @@ int main(void) { #ifdef CONFIG_PARAVIRT OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); - OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); BLANK(); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4eb065c6bed2..58031303e304 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_snb.o \ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 4a70fc6d400a..a07956a08936 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -304,7 +304,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c) int cpu = smp_processor_id(); /* get information required for multi-node processors */ - if (cpu_has_topoext) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); @@ -352,6 +352,7 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) #ifdef CONFIG_SMP unsigned bits; int cpu = smp_processor_id(); + unsigned int socket_id, core_complex_id; bits = c->x86_coreid_bits; /* Low order bits define the core id (index of core in socket) */ @@ -361,6 +362,18 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; amd_get_topology(c); + + /* + * Fix percpu cpu_llc_id here as LLC topology is different + * for Fam17h systems. + */ + if (c->x86 != 0x17 || !cpuid_edx(0x80000006)) + return; + + socket_id = (c->apicid >> bits) - 1; + core_complex_id = (c->apicid & ((1 << bits) - 1)) >> 3; + + per_cpu(cpu_llc_id, cpu) = (socket_id << 3) | core_complex_id; #endif } @@ -421,8 +434,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c) */ int ht_nodeid = c->initial_apicid; - if (ht_nodeid >= 0 && - __apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE) node = __apicid_to_node[ht_nodeid]; /* Pick a nearby node */ if (!node_online(node)) @@ -665,9 +677,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c) * Disable it on the affected CPUs. */ if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) { - if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) { + if (!rdmsrl_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) { value |= 0x1E; - wrmsrl_safe(0xc0011021, value); + wrmsrl_safe(MSR_F15H_IC_CFG, value); } } } @@ -909,7 +921,7 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) void set_dr_addr_mask(unsigned long mask, int dr) { - if (!cpu_has_bpext) + if (!boot_cpu_has(X86_FEATURE_BPEXT)) return; switch (dr) { diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index d8fba5c15fbd..ae20be6e483c 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -43,7 +43,7 @@ static void init_c3(struct cpuinfo_x86 *c) /* store Centaur Extended Feature Flags as * word 5 of the CPU capability bit array */ - c->x86_capability[5] = cpuid_edx(0xC0000001); + c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001); } #ifdef CONFIG_X86_32 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index de22ea7ff82f..37830de8f60a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -273,10 +273,9 @@ __setup("nosmap", setup_disable_smap); static __always_inline void setup_smap(struct cpuinfo_x86 *c) { - unsigned long eflags; + unsigned long eflags = native_save_fl(); /* This should have been cleared long ago */ - raw_local_save_flags(eflags); BUG_ON(eflags & X86_EFLAGS_AC); if (cpu_has(c, X86_FEATURE_SMAP)) { @@ -582,14 +581,9 @@ void cpu_detect(struct cpuinfo_x86 *c) u32 junk, tfms, cap0, misc; cpuid(0x00000001, &tfms, &misc, &junk, &cap0); - c->x86 = (tfms >> 8) & 0xf; - c->x86_model = (tfms >> 4) & 0xf; - c->x86_mask = tfms & 0xf; - - if (c->x86 == 0xf) - c->x86 += (tfms >> 20) & 0xff; - if (c->x86 >= 0x6) - c->x86_model += ((tfms >> 16) & 0xf) << 4; + c->x86 = x86_family(tfms); + c->x86_model = x86_model(tfms); + c->x86_mask = x86_stepping(tfms); if (cap0 & (1<<19)) { c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; @@ -600,50 +594,47 @@ void cpu_detect(struct cpuinfo_x86 *c) void get_cpu_cap(struct cpuinfo_x86 *c) { - u32 tfms, xlvl; - u32 ebx; + u32 eax, ebx, ecx, edx; /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { - u32 capability, excap; + cpuid(0x00000001, &eax, &ebx, &ecx, &edx); - cpuid(0x00000001, &tfms, &ebx, &excap, &capability); - c->x86_capability[0] = capability; - c->x86_capability[4] = excap; + c->x86_capability[CPUID_1_ECX] = ecx; + c->x86_capability[CPUID_1_EDX] = edx; } /* Additional Intel-defined flags: level 0x00000007 */ if (c->cpuid_level >= 0x00000007) { - u32 eax, ebx, ecx, edx; - cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); - c->x86_capability[9] = ebx; + c->x86_capability[CPUID_7_0_EBX] = ebx; + + c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006); } /* Extended state features: level 0x0000000d */ if (c->cpuid_level >= 0x0000000d) { - u32 eax, ebx, ecx, edx; - cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx); - c->x86_capability[10] = eax; + c->x86_capability[CPUID_D_1_EAX] = eax; } /* Additional Intel-defined flags: level 0x0000000F */ if (c->cpuid_level >= 0x0000000F) { - u32 eax, ebx, ecx, edx; /* QoS sub-leaf, EAX=0Fh, ECX=0 */ cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx); - c->x86_capability[11] = edx; + c->x86_capability[CPUID_F_0_EDX] = edx; + if (cpu_has(c, X86_FEATURE_CQM_LLC)) { /* will be overridden if occupancy monitoring exists */ c->x86_cache_max_rmid = ebx; /* QoS sub-leaf, EAX=0Fh, ECX=1 */ cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); - c->x86_capability[12] = edx; + c->x86_capability[CPUID_F_1_EDX] = edx; + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) { c->x86_cache_max_rmid = ecx; c->x86_cache_occ_scale = ebx; @@ -655,21 +646,24 @@ void get_cpu_cap(struct cpuinfo_x86 *c) } /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - c->extended_cpuid_level = xlvl; + eax = cpuid_eax(0x80000000); + c->extended_cpuid_level = eax; - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); + if ((eax & 0xffff0000) == 0x80000000) { + if (eax >= 0x80000001) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + + c->x86_capability[CPUID_8000_0001_ECX] = ecx; + c->x86_capability[CPUID_8000_0001_EDX] = edx; } } if (c->extended_cpuid_level >= 0x80000008) { - u32 eax = cpuid_eax(0x80000008); + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; + c->x86_capability[CPUID_8000_0008_EBX] = ebx; } #ifdef CONFIG_X86_32 else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) @@ -679,6 +673,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); + if (c->extended_cpuid_level >= 0x8000000a) + c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + init_scattered_cpuid_features(c); } @@ -1185,7 +1182,7 @@ void syscall_init(void) * They both write to the same internal register. STAR allows to * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. */ - wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION @@ -1443,7 +1440,9 @@ void cpu_init(void) printk(KERN_INFO "Initializing CPU#%d\n", cpu); - if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de) + if (cpu_feature_enabled(X86_FEATURE_VME) || + cpu_has_tsc || + boot_cpu_has(X86_FEATURE_DE)) cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); load_current_idt(); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 98a13db5f4be..565648bc1a0a 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -97,6 +97,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) switch (c->x86_model) { case 0x27: /* Penwell */ case 0x35: /* Cloverview */ + case 0x4a: /* Merrifield */ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); break; default: @@ -444,7 +445,8 @@ static void init_intel(struct cpuinfo_x86 *c) if (cpu_has_xmm2) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - if (cpu_has_ds) { + + if (boot_cpu_has(X86_FEATURE_DS)) { unsigned int l1; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); if (!(l1 & (1<<11))) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index be4febc58b94..0b6c52388cf4 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -157,7 +157,7 @@ struct _cpuid4_info_regs { struct amd_northbridge *nb; }; -unsigned short num_cache_leaves; +static unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same information to the user. This makes some assumptions about the machine: @@ -326,7 +326,7 @@ static void amd_calc_l3_indices(struct amd_northbridge *nb) * * @returns: the disabled index if used or negative value if slot free. */ -int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) +static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) { unsigned int reg = 0; @@ -403,8 +403,8 @@ static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, * * @return: 0 on success, error status on failure */ -int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot, - unsigned long index) +static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, + unsigned slot, unsigned long index) { int ret = 0; @@ -591,7 +591,7 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) unsigned edx; if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (cpu_has_topoext) + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) cpuid_count(0x8000001d, index, &eax.full, &ebx.full, &ecx.full, &edx); else @@ -637,7 +637,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) void init_amd_cacheinfo(struct cpuinfo_x86 *c) { - if (cpu_has_topoext) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { num_cache_leaves = find_num_cache_leaves(c); } else if (c->extended_cpuid_level >= 0x80000006) { if (cpuid_edx(0x80000006) & 0xf000) @@ -809,7 +809,7 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, struct cacheinfo *this_leaf; int i, sibling; - if (cpu_has_topoext) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { unsigned int apicid, nshared, first, last; this_leaf = this_cpu_ci->info_list + index; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 9d014b82a124..a006f4cd792b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -114,7 +114,6 @@ static struct work_struct mce_work; static struct irq_work mce_irq_work; static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); -static int mce_usable_address(struct mce *m); /* * CPU/chipset specific EDAC code can register a notifier call here to print @@ -475,6 +474,28 @@ static void mce_report_event(struct pt_regs *regs) irq_work_queue(&mce_irq_work); } +/* + * Check if the address reported by the CPU is in a format we can parse. + * It would be possible to add code for most other cases, but all would + * be somewhat complicated (e.g. segment offset would require an instruction + * parser). So only support physical addresses up to page granuality for now. + */ +static int mce_usable_address(struct mce *m) +{ + if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) + return 0; + + /* Checks after this one are Intel-specific: */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 1; + + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) + return 0; + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) + return 0; + return 1; +} + static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -484,7 +505,7 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, if (!mce) return NOTIFY_DONE; - if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) { + if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) { pfn = mce->addr >> PAGE_SHIFT; memory_failure(pfn, MCE_VECTOR, 0); } @@ -522,10 +543,10 @@ static bool memory_error(struct mce *m) struct cpuinfo_x86 *c = &boot_cpu_data; if (c->x86_vendor == X86_VENDOR_AMD) { - /* - * coming soon - */ - return false; + /* ErrCodeExt[20:16] */ + u8 xec = (m->status >> 16) & 0x1f; + + return (xec == 0x0 || xec == 0x8); } else if (c->x86_vendor == X86_VENDOR_INTEL) { /* * Intel SDM Volume 3B - 15.9.2 Compound Error Codes @@ -567,7 +588,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); */ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { - bool error_logged = false; + bool error_seen = false; struct mce m; int severity; int i; @@ -601,6 +622,8 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC))) continue; + error_seen = true; + mce_read_aux(&m, i); if (!(flags & MCP_TIMESTAMP)) @@ -608,27 +631,24 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); - /* - * In the cases where we don't have a valid address after all, - * do not add it into the ring buffer. - */ - if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) { - if (m.status & MCI_STATUS_ADDRV) { + if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) + if (m.status & MCI_STATUS_ADDRV) m.severity = severity; - m.usable_addr = mce_usable_address(&m); - - if (!mce_gen_pool_add(&m)) - mce_schedule_work(); - } - } /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) { - error_logged = true; + if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) mce_log(&m); + else if (mce_usable_address(&m)) { + /* + * Although we skipped logging this, we still want + * to take action. Add to the pool so the registered + * notifiers will see it. + */ + if (!mce_gen_pool_add(&m)) + mce_schedule_work(); } /* @@ -644,7 +664,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) sync_core(); - return error_logged; + return error_seen; } EXPORT_SYMBOL_GPL(machine_check_poll); @@ -931,23 +951,6 @@ reset: return ret; } -/* - * Check if the address reported by the CPU is in a format we can parse. - * It would be possible to add code for most other cases, but all would - * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses up to page granuality for now. - */ -static int mce_usable_address(struct mce *m) -{ - if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) - return 0; - if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) - return 0; - if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) - return 0; - return 1; -} - static void mce_clear_state(unsigned long *toclear) { int i; @@ -999,6 +1002,17 @@ void do_machine_check(struct pt_regs *regs, long error_code) int flags = MF_ACTION_REQUIRED; int lmce = 0; + /* If this CPU is offline, just bail out. */ + if (cpu_is_offline(smp_processor_id())) { + u64 mcgstatus; + + mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (mcgstatus & MCG_STATUS_RIPV) { + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + return; + } + } + ist_enter(regs); this_cpu_inc(mce_exception_count); @@ -1089,7 +1103,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* assuming valid severity level != 0 */ m.severity = severity; - m.usable_addr = mce_usable_address(&m); mce_log(&m); @@ -1586,6 +1599,8 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) winchip_mcheck_init(c); return 1; break; + default: + return 0; } return 0; @@ -1605,6 +1620,8 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_amd_feature_init(c); mce_flags.overflow_recov = !!(ebx & BIT(0)); mce_flags.succor = !!(ebx & BIT(1)); + mce_flags.smca = !!(ebx & BIT(3)); + break; } @@ -2042,7 +2059,7 @@ int __init mcheck_init(void) * Disable machine checks on suspend and shutdown. We can't really handle * them later. */ -static int mce_disable_error_reporting(void) +static void mce_disable_error_reporting(void) { int i; @@ -2052,17 +2069,32 @@ static int mce_disable_error_reporting(void) if (b->init) wrmsrl(MSR_IA32_MCx_CTL(i), 0); } - return 0; + return; +} + +static void vendor_disable_error_reporting(void) +{ + /* + * Don't clear on Intel CPUs. Some of these MSRs are socket-wide. + * Disabling them for just a single offlined CPU is bad, since it will + * inhibit reporting for all shared resources on the socket like the + * last level cache (LLC), the integrated memory controller (iMC), etc. + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return; + + mce_disable_error_reporting(); } static int mce_syscore_suspend(void) { - return mce_disable_error_reporting(); + vendor_disable_error_reporting(); + return 0; } static void mce_syscore_shutdown(void) { - mce_disable_error_reporting(); + vendor_disable_error_reporting(); } /* @@ -2342,19 +2374,14 @@ static void mce_device_remove(unsigned int cpu) static void mce_disable_cpu(void *h) { unsigned long action = *(unsigned long *)h; - int i; if (!mce_available(raw_cpu_ptr(&cpu_info))) return; if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); - for (i = 0; i < mca_cfg.banks; i++) { - struct mce_bank *b = &mce_banks[i]; - if (b->init) - wrmsrl(MSR_IA32_MCx_CTL(i), 0); - } + vendor_disable_error_reporting(); } static void mce_reenable_cpu(void *h) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 1af51b1586d7..2c5aaf8c2e2f 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -503,14 +503,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } - /* Check whether a vector already exists */ - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already installed\n", - cpu, (h & APIC_VECTOR_MASK)); - return; - } - /* early Pentium M models use different method for enabling TM2 */ if (cpu_has(c, X86_FEATURE_TM2)) { if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile index 285c85427c32..220b1a508513 100644 --- a/arch/x86/kernel/cpu/microcode/Makefile +++ b/arch/x86/kernel/cpu/microcode/Makefile @@ -2,6 +2,3 @@ microcode-y := core.o obj-$(CONFIG_MICROCODE) += microcode.o microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o microcode-$(CONFIG_MICROCODE_AMD) += amd.o -obj-$(CONFIG_MICROCODE_EARLY) += core_early.o -obj-$(CONFIG_MICROCODE_INTEL_EARLY) += intel_early.o -obj-$(CONFIG_MICROCODE_AMD_EARLY) += amd_early.o diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 12829c3ced3c..2233f8a76615 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -1,5 +1,9 @@ /* * AMD CPU Microcode Update Driver for Linux + * + * This driver allows to upgrade microcode on F10h AMD + * CPUs and later. + * * Copyright (C) 2008-2011 Advanced Micro Devices Inc. * * Author: Peter Oruba <peter.oruba@amd.com> @@ -7,34 +11,31 @@ * Based on work by: * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> * - * Maintainers: - * Andreas Herrmann <herrmann.der.user@googlemail.com> - * Borislav Petkov <bp@alien8.de> + * early loader: + * Copyright (C) 2013 Advanced Micro Devices, Inc. * - * This driver allows to upgrade microcode on F10h AMD - * CPUs and later. + * Author: Jacob Shin <jacob.shin@amd.com> + * Fixes: Borislav Petkov <bp@suse.de> * * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ +#define pr_fmt(fmt) "microcode: " fmt -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - +#include <linux/earlycpio.h> #include <linux/firmware.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> +#include <linux/initrd.h> #include <linux/kernel.h> -#include <linux/module.h> #include <linux/pci.h> +#include <asm/microcode_amd.h> #include <asm/microcode.h> #include <asm/processor.h> +#include <asm/setup.h> +#include <asm/cpu.h> #include <asm/msr.h> -#include <asm/microcode_amd.h> - -MODULE_DESCRIPTION("AMD Microcode Update Driver"); -MODULE_AUTHOR("Peter Oruba"); -MODULE_LICENSE("GPL v2"); static struct equiv_cpu_entry *equiv_cpu_table; @@ -47,6 +48,432 @@ struct ucode_patch { static LIST_HEAD(pcache); +/* + * This points to the current valid container of microcode patches which we will + * save from the initrd before jettisoning its contents. + */ +static u8 *container; +static size_t container_size; + +static u32 ucode_new_rev; +u8 amd_ucode_patch[PATCH_MAX_SIZE]; +static u16 this_equiv_id; + +static struct cpio_data ucode_cpio; + +/* + * Microcode patch container file is prepended to the initrd in cpio format. + * See Documentation/x86/early-microcode.txt + */ +static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; + +static struct cpio_data __init find_ucode_in_initrd(void) +{ + long offset = 0; + char *path; + void *start; + size_t size; + +#ifdef CONFIG_X86_32 + struct boot_params *p; + + /* + * On 32-bit, early load occurs before paging is turned on so we need + * to use physical addresses. + */ + p = (struct boot_params *)__pa_nodebug(&boot_params); + path = (char *)__pa_nodebug(ucode_path); + start = (void *)p->hdr.ramdisk_image; + size = p->hdr.ramdisk_size; +#else + path = ucode_path; + start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); + size = boot_params.hdr.ramdisk_size; +#endif + + return find_cpio_data(path, start, size, &offset); +} + +static size_t compute_container_size(u8 *data, u32 total_size) +{ + size_t size = 0; + u32 *header = (u32 *)data; + + if (header[0] != UCODE_MAGIC || + header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ + header[2] == 0) /* size */ + return size; + + size = header[2] + CONTAINER_HDR_SZ; + total_size -= size; + data += size; + + while (total_size) { + u16 patch_size; + + header = (u32 *)data; + + if (header[0] != UCODE_UCODE_TYPE) + break; + + /* + * Sanity-check patch size. + */ + patch_size = header[1]; + if (patch_size > PATCH_MAX_SIZE) + break; + + size += patch_size + SECTION_HDR_SIZE; + data += patch_size + SECTION_HDR_SIZE; + total_size -= patch_size + SECTION_HDR_SIZE; + } + + return size; +} + +/* + * Early load occurs before we can vmalloc(). So we look for the microcode + * patch container file in initrd, traverse equivalent cpu table, look for a + * matching microcode patch, and update, all in initrd memory in place. + * When vmalloc() is available for use later -- on 64-bit during first AP load, + * and on 32-bit during save_microcode_in_initrd_amd() -- we can call + * load_microcode_amd() to save equivalent cpu table and microcode patches in + * kernel heap memory. + */ +static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) +{ + struct equiv_cpu_entry *eq; + size_t *cont_sz; + u32 *header; + u8 *data, **cont; + u8 (*patch)[PATCH_MAX_SIZE]; + u16 eq_id = 0; + int offset, left; + u32 rev, eax, ebx, ecx, edx; + u32 *new_rev; + +#ifdef CONFIG_X86_32 + new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); + cont_sz = (size_t *)__pa_nodebug(&container_size); + cont = (u8 **)__pa_nodebug(&container); + patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); +#else + new_rev = &ucode_new_rev; + cont_sz = &container_size; + cont = &container; + patch = &amd_ucode_patch; +#endif + + data = ucode; + left = size; + header = (u32 *)data; + + /* find equiv cpu table */ + if (header[0] != UCODE_MAGIC || + header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ + header[2] == 0) /* size */ + return; + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + + while (left > 0) { + eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); + + *cont = data; + + /* Advance past the container header */ + offset = header[2] + CONTAINER_HDR_SZ; + data += offset; + left -= offset; + + eq_id = find_equiv_id(eq, eax); + if (eq_id) { + this_equiv_id = eq_id; + *cont_sz = compute_container_size(*cont, left + offset); + + /* + * truncate how much we need to iterate over in the + * ucode update loop below + */ + left = *cont_sz - offset; + break; + } + + /* + * support multiple container files appended together. if this + * one does not have a matching equivalent cpu entry, we fast + * forward to the next container file. + */ + while (left > 0) { + header = (u32 *)data; + if (header[0] == UCODE_MAGIC && + header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) + break; + + offset = header[1] + SECTION_HDR_SIZE; + data += offset; + left -= offset; + } + + /* mark where the next microcode container file starts */ + offset = data - (u8 *)ucode; + ucode = data; + } + + if (!eq_id) { + *cont = NULL; + *cont_sz = 0; + return; + } + + if (check_current_patch_level(&rev, true)) + return; + + while (left > 0) { + struct microcode_amd *mc; + + header = (u32 *)data; + if (header[0] != UCODE_UCODE_TYPE || /* type */ + header[1] == 0) /* size */ + break; + + mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); + + if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { + + if (!__apply_microcode_amd(mc)) { + rev = mc->hdr.patch_id; + *new_rev = rev; + + if (save_patch) + memcpy(patch, mc, + min_t(u32, header[1], PATCH_MAX_SIZE)); + } + } + + offset = header[1] + SECTION_HDR_SIZE; + data += offset; + left -= offset; + } +} + +static bool __init load_builtin_amd_microcode(struct cpio_data *cp, + unsigned int family) +{ +#ifdef CONFIG_X86_64 + char fw_name[36] = "amd-ucode/microcode_amd.bin"; + + if (family >= 0x15) + snprintf(fw_name, sizeof(fw_name), + "amd-ucode/microcode_amd_fam%.2xh.bin", family); + + return get_builtin_firmware(cp, fw_name); +#else + return false; +#endif +} + +void __init load_ucode_amd_bsp(unsigned int family) +{ + struct cpio_data cp; + void **data; + size_t *size; + +#ifdef CONFIG_X86_32 + data = (void **)__pa_nodebug(&ucode_cpio.data); + size = (size_t *)__pa_nodebug(&ucode_cpio.size); +#else + data = &ucode_cpio.data; + size = &ucode_cpio.size; +#endif + + cp = find_ucode_in_initrd(); + if (!cp.data) { + if (!load_builtin_amd_microcode(&cp, family)) + return; + } + + *data = cp.data; + *size = cp.size; + + apply_ucode_in_initrd(cp.data, cp.size, true); +} + +#ifdef CONFIG_X86_32 +/* + * On 32-bit, since AP's early load occurs before paging is turned on, we + * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during + * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During + * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, + * which is used upon resume from suspend. + */ +void load_ucode_amd_ap(void) +{ + struct microcode_amd *mc; + size_t *usize; + void **ucode; + + mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); + if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { + __apply_microcode_amd(mc); + return; + } + + ucode = (void *)__pa_nodebug(&container); + usize = (size_t *)__pa_nodebug(&container_size); + + if (!*ucode || !*usize) + return; + + apply_ucode_in_initrd(*ucode, *usize, false); +} + +static void __init collect_cpu_sig_on_bsp(void *arg) +{ + unsigned int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + uci->cpu_sig.sig = cpuid_eax(0x00000001); +} + +static void __init get_bsp_sig(void) +{ + unsigned int bsp = boot_cpu_data.cpu_index; + struct ucode_cpu_info *uci = ucode_cpu_info + bsp; + + if (!uci->cpu_sig.sig) + smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); +} +#else +void load_ucode_amd_ap(void) +{ + unsigned int cpu = smp_processor_id(); + struct equiv_cpu_entry *eq; + struct microcode_amd *mc; + u32 rev, eax; + u16 eq_id; + + /* Exit if called on the BSP. */ + if (!cpu) + return; + + if (!container) + return; + + /* + * 64-bit runs with paging enabled, thus early==false. + */ + if (check_current_patch_level(&rev, false)) + return; + + eax = cpuid_eax(0x00000001); + eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); + + eq_id = find_equiv_id(eq, eax); + if (!eq_id) + return; + + if (eq_id == this_equiv_id) { + mc = (struct microcode_amd *)amd_ucode_patch; + + if (mc && rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) + ucode_new_rev = mc->hdr.patch_id; + } + + } else { + if (!ucode_cpio.data) + return; + + /* + * AP has a different equivalence ID than BSP, looks like + * mixed-steppings silicon so go through the ucode blob anew. + */ + apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false); + } +} +#endif + +int __init save_microcode_in_initrd_amd(void) +{ + unsigned long cont; + int retval = 0; + enum ucode_state ret; + u8 *cont_va; + u32 eax; + + if (!container) + return -EINVAL; + +#ifdef CONFIG_X86_32 + get_bsp_sig(); + cont = (unsigned long)container; + cont_va = __va(container); +#else + /* + * We need the physical address of the container for both bitness since + * boot_params.hdr.ramdisk_image is a physical address. + */ + cont = __pa(container); + cont_va = container; +#endif + + /* + * Take into account the fact that the ramdisk might get relocated and + * therefore we need to recompute the container's position in virtual + * memory space. + */ + if (relocated_ramdisk) + container = (u8 *)(__va(relocated_ramdisk) + + (cont - boot_params.hdr.ramdisk_image)); + else + container = cont_va; + + if (ucode_new_rev) + pr_info("microcode: updated early to new patch_level=0x%08x\n", + ucode_new_rev); + + eax = cpuid_eax(0x00000001); + eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + + ret = load_microcode_amd(smp_processor_id(), eax, container, container_size); + if (ret != UCODE_OK) + retval = -EINVAL; + + /* + * This will be freed any msec now, stash patches for the current + * family and switch to patch cache for cpu hotplug, etc later. + */ + container = NULL; + container_size = 0; + + return retval; +} + +void reload_ucode_amd(void) +{ + struct microcode_amd *mc; + u32 rev; + + /* + * early==false because this is a syscore ->resume path and by + * that time paging is long enabled. + */ + if (check_current_patch_level(&rev, false)) + return; + + mc = (struct microcode_amd *)amd_ucode_patch; + + if (mc && rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) { + ucode_new_rev = mc->hdr.patch_id; + pr_info("microcode: reload patch_level=0x%08x\n", + ucode_new_rev); + } + } +} static u16 __find_equiv_id(unsigned int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -177,6 +604,53 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, return patch_size; } +/* + * Those patch levels cannot be updated to newer ones and thus should be final. + */ +static u32 final_levels[] = { + 0x01000098, + 0x0100009f, + 0x010000af, + 0, /* T-101 terminator */ +}; + +/* + * Check the current patch level on this CPU. + * + * @rev: Use it to return the patch level. It is set to 0 in the case of + * error. + * + * Returns: + * - true: if update should stop + * - false: otherwise + */ +bool check_current_patch_level(u32 *rev, bool early) +{ + u32 lvl, dummy, i; + bool ret = false; + u32 *levels; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); + + if (IS_ENABLED(CONFIG_X86_32) && early) + levels = (u32 *)__pa_nodebug(&final_levels); + else + levels = final_levels; + + for (i = 0; levels[i]; i++) { + if (lvl == levels[i]) { + lvl = 0; + ret = true; + break; + } + } + + if (rev) + *rev = lvl; + + return ret; +} + int __apply_microcode_amd(struct microcode_amd *mc_amd) { u32 rev, dummy; @@ -197,7 +671,7 @@ int apply_microcode_amd(int cpu) struct microcode_amd *mc_amd; struct ucode_cpu_info *uci; struct ucode_patch *p; - u32 rev, dummy; + u32 rev; BUG_ON(raw_smp_processor_id() != cpu); @@ -210,7 +684,8 @@ int apply_microcode_amd(int cpu) mc_amd = p->data; uci->mc = p->data; - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (check_current_patch_level(&rev, false)) + return -1; /* need to apply patch? */ if (rev >= mc_amd->hdr.patch_id) { @@ -387,7 +862,7 @@ enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t s if (ret != UCODE_OK) cleanup(); -#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32) +#ifdef CONFIG_X86_32 /* save BSP's matching patch for early load */ if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) { struct ucode_patch *p = find_patch(cpu); @@ -475,7 +950,7 @@ static struct microcode_ops microcode_amd_ops = { struct microcode_ops * __init init_amd_microcode(void) { - struct cpuinfo_x86 *c = &cpu_data(0); + struct cpuinfo_x86 *c = &boot_cpu_data; if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { pr_warning("AMD CPU family 0x%x not supported\n", c->x86); diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c deleted file mode 100644 index e8a215a9a345..000000000000 --- a/arch/x86/kernel/cpu/microcode/amd_early.c +++ /dev/null @@ -1,440 +0,0 @@ -/* - * Copyright (C) 2013 Advanced Micro Devices, Inc. - * - * Author: Jacob Shin <jacob.shin@amd.com> - * Fixes: Borislav Petkov <bp@suse.de> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/earlycpio.h> -#include <linux/initrd.h> - -#include <asm/cpu.h> -#include <asm/setup.h> -#include <asm/microcode_amd.h> - -/* - * This points to the current valid container of microcode patches which we will - * save from the initrd before jettisoning its contents. - */ -static u8 *container; -static size_t container_size; - -static u32 ucode_new_rev; -u8 amd_ucode_patch[PATCH_MAX_SIZE]; -static u16 this_equiv_id; - -static struct cpio_data ucode_cpio; - -/* - * Microcode patch container file is prepended to the initrd in cpio format. - * See Documentation/x86/early-microcode.txt - */ -static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; - -static struct cpio_data __init find_ucode_in_initrd(void) -{ - long offset = 0; - char *path; - void *start; - size_t size; - -#ifdef CONFIG_X86_32 - struct boot_params *p; - - /* - * On 32-bit, early load occurs before paging is turned on so we need - * to use physical addresses. - */ - p = (struct boot_params *)__pa_nodebug(&boot_params); - path = (char *)__pa_nodebug(ucode_path); - start = (void *)p->hdr.ramdisk_image; - size = p->hdr.ramdisk_size; -#else - path = ucode_path; - start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); - size = boot_params.hdr.ramdisk_size; -#endif - - return find_cpio_data(path, start, size, &offset); -} - -static size_t compute_container_size(u8 *data, u32 total_size) -{ - size_t size = 0; - u32 *header = (u32 *)data; - - if (header[0] != UCODE_MAGIC || - header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ - header[2] == 0) /* size */ - return size; - - size = header[2] + CONTAINER_HDR_SZ; - total_size -= size; - data += size; - - while (total_size) { - u16 patch_size; - - header = (u32 *)data; - - if (header[0] != UCODE_UCODE_TYPE) - break; - - /* - * Sanity-check patch size. - */ - patch_size = header[1]; - if (patch_size > PATCH_MAX_SIZE) - break; - - size += patch_size + SECTION_HDR_SIZE; - data += patch_size + SECTION_HDR_SIZE; - total_size -= patch_size + SECTION_HDR_SIZE; - } - - return size; -} - -/* - * Early load occurs before we can vmalloc(). So we look for the microcode - * patch container file in initrd, traverse equivalent cpu table, look for a - * matching microcode patch, and update, all in initrd memory in place. - * When vmalloc() is available for use later -- on 64-bit during first AP load, - * and on 32-bit during save_microcode_in_initrd_amd() -- we can call - * load_microcode_amd() to save equivalent cpu table and microcode patches in - * kernel heap memory. - */ -static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) -{ - struct equiv_cpu_entry *eq; - size_t *cont_sz; - u32 *header; - u8 *data, **cont; - u8 (*patch)[PATCH_MAX_SIZE]; - u16 eq_id = 0; - int offset, left; - u32 rev, eax, ebx, ecx, edx; - u32 *new_rev; - -#ifdef CONFIG_X86_32 - new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); - cont_sz = (size_t *)__pa_nodebug(&container_size); - cont = (u8 **)__pa_nodebug(&container); - patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); -#else - new_rev = &ucode_new_rev; - cont_sz = &container_size; - cont = &container; - patch = &amd_ucode_patch; -#endif - - data = ucode; - left = size; - header = (u32 *)data; - - /* find equiv cpu table */ - if (header[0] != UCODE_MAGIC || - header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ - header[2] == 0) /* size */ - return; - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - - while (left > 0) { - eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); - - *cont = data; - - /* Advance past the container header */ - offset = header[2] + CONTAINER_HDR_SZ; - data += offset; - left -= offset; - - eq_id = find_equiv_id(eq, eax); - if (eq_id) { - this_equiv_id = eq_id; - *cont_sz = compute_container_size(*cont, left + offset); - - /* - * truncate how much we need to iterate over in the - * ucode update loop below - */ - left = *cont_sz - offset; - break; - } - - /* - * support multiple container files appended together. if this - * one does not have a matching equivalent cpu entry, we fast - * forward to the next container file. - */ - while (left > 0) { - header = (u32 *)data; - if (header[0] == UCODE_MAGIC && - header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) - break; - - offset = header[1] + SECTION_HDR_SIZE; - data += offset; - left -= offset; - } - - /* mark where the next microcode container file starts */ - offset = data - (u8 *)ucode; - ucode = data; - } - - if (!eq_id) { - *cont = NULL; - *cont_sz = 0; - return; - } - - /* find ucode and update if needed */ - - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); - - while (left > 0) { - struct microcode_amd *mc; - - header = (u32 *)data; - if (header[0] != UCODE_UCODE_TYPE || /* type */ - header[1] == 0) /* size */ - break; - - mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); - - if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { - - if (!__apply_microcode_amd(mc)) { - rev = mc->hdr.patch_id; - *new_rev = rev; - - if (save_patch) - memcpy(patch, mc, - min_t(u32, header[1], PATCH_MAX_SIZE)); - } - } - - offset = header[1] + SECTION_HDR_SIZE; - data += offset; - left -= offset; - } -} - -static bool __init load_builtin_amd_microcode(struct cpio_data *cp, - unsigned int family) -{ -#ifdef CONFIG_X86_64 - char fw_name[36] = "amd-ucode/microcode_amd.bin"; - - if (family >= 0x15) - snprintf(fw_name, sizeof(fw_name), - "amd-ucode/microcode_amd_fam%.2xh.bin", family); - - return get_builtin_firmware(cp, fw_name); -#else - return false; -#endif -} - -void __init load_ucode_amd_bsp(unsigned int family) -{ - struct cpio_data cp; - void **data; - size_t *size; - -#ifdef CONFIG_X86_32 - data = (void **)__pa_nodebug(&ucode_cpio.data); - size = (size_t *)__pa_nodebug(&ucode_cpio.size); -#else - data = &ucode_cpio.data; - size = &ucode_cpio.size; -#endif - - cp = find_ucode_in_initrd(); - if (!cp.data) { - if (!load_builtin_amd_microcode(&cp, family)) - return; - } - - *data = cp.data; - *size = cp.size; - - apply_ucode_in_initrd(cp.data, cp.size, true); -} - -#ifdef CONFIG_X86_32 -/* - * On 32-bit, since AP's early load occurs before paging is turned on, we - * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during - * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During - * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, - * which is used upon resume from suspend. - */ -void load_ucode_amd_ap(void) -{ - struct microcode_amd *mc; - size_t *usize; - void **ucode; - - mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); - if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { - __apply_microcode_amd(mc); - return; - } - - ucode = (void *)__pa_nodebug(&container); - usize = (size_t *)__pa_nodebug(&container_size); - - if (!*ucode || !*usize) - return; - - apply_ucode_in_initrd(*ucode, *usize, false); -} - -static void __init collect_cpu_sig_on_bsp(void *arg) -{ - unsigned int cpu = smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - uci->cpu_sig.sig = cpuid_eax(0x00000001); -} - -static void __init get_bsp_sig(void) -{ - unsigned int bsp = boot_cpu_data.cpu_index; - struct ucode_cpu_info *uci = ucode_cpu_info + bsp; - - if (!uci->cpu_sig.sig) - smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); -} -#else -void load_ucode_amd_ap(void) -{ - unsigned int cpu = smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct equiv_cpu_entry *eq; - struct microcode_amd *mc; - u32 rev, eax; - u16 eq_id; - - /* Exit if called on the BSP. */ - if (!cpu) - return; - - if (!container) - return; - - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); - - uci->cpu_sig.rev = rev; - uci->cpu_sig.sig = eax; - - eax = cpuid_eax(0x00000001); - eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); - - eq_id = find_equiv_id(eq, eax); - if (!eq_id) - return; - - if (eq_id == this_equiv_id) { - mc = (struct microcode_amd *)amd_ucode_patch; - - if (mc && rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) - ucode_new_rev = mc->hdr.patch_id; - } - - } else { - if (!ucode_cpio.data) - return; - - /* - * AP has a different equivalence ID than BSP, looks like - * mixed-steppings silicon so go through the ucode blob anew. - */ - apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false); - } -} -#endif - -int __init save_microcode_in_initrd_amd(void) -{ - unsigned long cont; - int retval = 0; - enum ucode_state ret; - u8 *cont_va; - u32 eax; - - if (!container) - return -EINVAL; - -#ifdef CONFIG_X86_32 - get_bsp_sig(); - cont = (unsigned long)container; - cont_va = __va(container); -#else - /* - * We need the physical address of the container for both bitness since - * boot_params.hdr.ramdisk_image is a physical address. - */ - cont = __pa(container); - cont_va = container; -#endif - - /* - * Take into account the fact that the ramdisk might get relocated and - * therefore we need to recompute the container's position in virtual - * memory space. - */ - if (relocated_ramdisk) - container = (u8 *)(__va(relocated_ramdisk) + - (cont - boot_params.hdr.ramdisk_image)); - else - container = cont_va; - - if (ucode_new_rev) - pr_info("microcode: updated early to new patch_level=0x%08x\n", - ucode_new_rev); - - eax = cpuid_eax(0x00000001); - eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); - - ret = load_microcode_amd(smp_processor_id(), eax, container, container_size); - if (ret != UCODE_OK) - retval = -EINVAL; - - /* - * This will be freed any msec now, stash patches for the current - * family and switch to patch cache for cpu hotplug, etc later. - */ - container = NULL; - container_size = 0; - - return retval; -} - -void reload_ucode_amd(void) -{ - struct microcode_amd *mc; - u32 rev, eax; - - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); - - mc = (struct microcode_amd *)amd_ucode_patch; - - if (mc && rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) { - ucode_new_rev = mc->hdr.patch_id; - pr_info("microcode: reload patch_level=0x%08x\n", - ucode_new_rev); - } - } -} diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 9e3f3c7dd5d7..faec7120c508 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -5,6 +5,12 @@ * 2006 Shaohua Li <shaohua.li@intel.com> * 2013-2015 Borislav Petkov <bp@alien8.de> * + * X86 CPU microcode early update for Linux: + * + * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + * H Peter Anvin" <hpa@zytor.com> + * (C) 2015 Borislav Petkov <bp@alien8.de> + * * This driver allows to upgrade microcode on x86 processors. * * This program is free software; you can redistribute it and/or @@ -13,34 +19,39 @@ * 2 of the License, or (at your option) any later version. */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define pr_fmt(fmt) "microcode: " fmt #include <linux/platform_device.h> +#include <linux/syscore_ops.h> #include <linux/miscdevice.h> #include <linux/capability.h> +#include <linux/firmware.h> #include <linux/kernel.h> -#include <linux/module.h> #include <linux/mutex.h> #include <linux/cpu.h> #include <linux/fs.h> #include <linux/mm.h> -#include <linux/syscore_ops.h> -#include <asm/microcode.h> -#include <asm/processor.h> +#include <asm/microcode_intel.h> #include <asm/cpu_device_id.h> +#include <asm/microcode_amd.h> #include <asm/perf_event.h> +#include <asm/microcode.h> +#include <asm/processor.h> +#include <asm/cmdline.h> -MODULE_DESCRIPTION("Microcode Update Driver"); -MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); -MODULE_LICENSE("GPL"); - -#define MICROCODE_VERSION "2.00" +#define MICROCODE_VERSION "2.01" static struct microcode_ops *microcode_ops; -bool dis_ucode_ldr; -module_param(dis_ucode_ldr, bool, 0); +static bool dis_ucode_ldr; + +static int __init disable_loader(char *str) +{ + dis_ucode_ldr = true; + return 1; +} +__setup("dis_ucode_ldr", disable_loader); /* * Synchronization. @@ -68,6 +79,150 @@ struct cpu_info_ctx { int err; }; +static bool __init check_loader_disabled_bsp(void) +{ +#ifdef CONFIG_X86_32 + const char *cmdline = (const char *)__pa_nodebug(boot_command_line); + const char *opt = "dis_ucode_ldr"; + const char *option = (const char *)__pa_nodebug(opt); + bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); + +#else /* CONFIG_X86_64 */ + const char *cmdline = boot_command_line; + const char *option = "dis_ucode_ldr"; + bool *res = &dis_ucode_ldr; +#endif + + if (cmdline_find_option_bool(cmdline, option)) + *res = true; + + return *res; +} + +extern struct builtin_fw __start_builtin_fw[]; +extern struct builtin_fw __end_builtin_fw[]; + +bool get_builtin_firmware(struct cpio_data *cd, const char *name) +{ +#ifdef CONFIG_FW_LOADER + struct builtin_fw *b_fw; + + for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) { + if (!strcmp(name, b_fw->name)) { + cd->size = b_fw->size; + cd->data = b_fw->data; + return true; + } + } +#endif + return false; +} + +void __init load_ucode_bsp(void) +{ + int vendor; + unsigned int family; + + if (check_loader_disabled_bsp()) + return; + + if (!have_cpuid_p()) + return; + + vendor = x86_cpuid_vendor(); + family = x86_cpuid_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (family >= 6) + load_ucode_intel_bsp(); + break; + case X86_VENDOR_AMD: + if (family >= 0x10) + load_ucode_amd_bsp(family); + break; + default: + break; + } +} + +static bool check_loader_disabled_ap(void) +{ +#ifdef CONFIG_X86_32 + return *((bool *)__pa_nodebug(&dis_ucode_ldr)); +#else + return dis_ucode_ldr; +#endif +} + +void load_ucode_ap(void) +{ + int vendor, family; + + if (check_loader_disabled_ap()) + return; + + if (!have_cpuid_p()) + return; + + vendor = x86_cpuid_vendor(); + family = x86_cpuid_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (family >= 6) + load_ucode_intel_ap(); + break; + case X86_VENDOR_AMD: + if (family >= 0x10) + load_ucode_amd_ap(); + break; + default: + break; + } +} + +int __init save_microcode_in_initrd(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + if (c->x86 >= 6) + save_microcode_in_initrd_intel(); + break; + case X86_VENDOR_AMD: + if (c->x86 >= 0x10) + save_microcode_in_initrd_amd(); + break; + default: + break; + } + + return 0; +} + +void reload_early_microcode(void) +{ + int vendor, family; + + vendor = x86_cpuid_vendor(); + family = x86_cpuid_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (family >= 6) + reload_ucode_intel(); + break; + case X86_VENDOR_AMD: + if (family >= 0x10) + reload_ucode_amd(); + break; + default: + break; + } +} + static void collect_cpu_info_local(void *arg) { struct cpu_info_ctx *ctx = arg; @@ -210,9 +365,6 @@ static void __exit microcode_dev_exit(void) { misc_deregister(µcode_dev); } - -MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); -MODULE_ALIAS("devname:cpu/microcode"); #else #define microcode_dev_init() 0 #define microcode_dev_exit() do { } while (0) @@ -463,20 +615,6 @@ static struct notifier_block mc_cpu_notifier = { .notifier_call = mc_cpu_callback, }; -#ifdef MODULE -/* Autoload on Intel and AMD systems */ -static const struct x86_cpu_id __initconst microcode_id[] = { -#ifdef CONFIG_MICROCODE_INTEL - { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, -#endif -#ifdef CONFIG_MICROCODE_AMD - { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, }, -#endif - {} -}; -MODULE_DEVICE_TABLE(x86cpu, microcode_id); -#endif - static struct attribute *cpu_root_microcode_attrs[] = { &dev_attr_reload.attr, NULL @@ -487,9 +625,9 @@ static struct attribute_group cpu_root_microcode_group = { .attrs = cpu_root_microcode_attrs, }; -static int __init microcode_init(void) +int __init microcode_init(void) { - struct cpuinfo_x86 *c = &cpu_data(0); + struct cpuinfo_x86 *c = &boot_cpu_data; int error; if (paravirt_enabled() || dis_ucode_ldr) @@ -560,35 +698,4 @@ static int __init microcode_init(void) return error; } -module_init(microcode_init); - -static void __exit microcode_exit(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - microcode_dev_exit(); - - unregister_hotcpu_notifier(&mc_cpu_notifier); - unregister_syscore_ops(&mc_syscore_ops); - - sysfs_remove_group(&cpu_subsys.dev_root->kobj, - &cpu_root_microcode_group); - - get_online_cpus(); - mutex_lock(µcode_mutex); - - subsys_interface_unregister(&mc_cpu_interface); - - mutex_unlock(µcode_mutex); - put_online_cpus(); - - platform_device_unregister(microcode_pdev); - - microcode_ops = NULL; - - if (c->x86_vendor == X86_VENDOR_AMD) - exit_amd_microcode(); - - pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); -} -module_exit(microcode_exit); +late_initcall(microcode_init); diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c deleted file mode 100644 index 8ebc421d6299..000000000000 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ /dev/null @@ -1,170 +0,0 @@ -/* - * X86 CPU microcode early update for Linux - * - * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> - * H Peter Anvin" <hpa@zytor.com> - * (C) 2015 Borislav Petkov <bp@alien8.de> - * - * This driver allows to early upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture - * Software Developer's Manual. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include <linux/module.h> -#include <linux/firmware.h> -#include <asm/microcode.h> -#include <asm/microcode_intel.h> -#include <asm/microcode_amd.h> -#include <asm/processor.h> -#include <asm/cmdline.h> - -static bool __init check_loader_disabled_bsp(void) -{ -#ifdef CONFIG_X86_32 - const char *cmdline = (const char *)__pa_nodebug(boot_command_line); - const char *opt = "dis_ucode_ldr"; - const char *option = (const char *)__pa_nodebug(opt); - bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); - -#else /* CONFIG_X86_64 */ - const char *cmdline = boot_command_line; - const char *option = "dis_ucode_ldr"; - bool *res = &dis_ucode_ldr; -#endif - - if (cmdline_find_option_bool(cmdline, option)) - *res = true; - - return *res; -} - -extern struct builtin_fw __start_builtin_fw[]; -extern struct builtin_fw __end_builtin_fw[]; - -bool get_builtin_firmware(struct cpio_data *cd, const char *name) -{ -#ifdef CONFIG_FW_LOADER - struct builtin_fw *b_fw; - - for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) { - if (!strcmp(name, b_fw->name)) { - cd->size = b_fw->size; - cd->data = b_fw->data; - return true; - } - } -#endif - return false; -} - -void __init load_ucode_bsp(void) -{ - int vendor; - unsigned int family; - - if (check_loader_disabled_bsp()) - return; - - if (!have_cpuid_p()) - return; - - vendor = x86_vendor(); - family = x86_family(); - - switch (vendor) { - case X86_VENDOR_INTEL: - if (family >= 6) - load_ucode_intel_bsp(); - break; - case X86_VENDOR_AMD: - if (family >= 0x10) - load_ucode_amd_bsp(family); - break; - default: - break; - } -} - -static bool check_loader_disabled_ap(void) -{ -#ifdef CONFIG_X86_32 - return *((bool *)__pa_nodebug(&dis_ucode_ldr)); -#else - return dis_ucode_ldr; -#endif -} - -void load_ucode_ap(void) -{ - int vendor, family; - - if (check_loader_disabled_ap()) - return; - - if (!have_cpuid_p()) - return; - - vendor = x86_vendor(); - family = x86_family(); - - switch (vendor) { - case X86_VENDOR_INTEL: - if (family >= 6) - load_ucode_intel_ap(); - break; - case X86_VENDOR_AMD: - if (family >= 0x10) - load_ucode_amd_ap(); - break; - default: - break; - } -} - -int __init save_microcode_in_initrd(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - if (c->x86 >= 6) - save_microcode_in_initrd_intel(); - break; - case X86_VENDOR_AMD: - if (c->x86 >= 0x10) - save_microcode_in_initrd_amd(); - break; - default: - break; - } - - return 0; -} - -void reload_early_microcode(void) -{ - int vendor, family; - - vendor = x86_vendor(); - family = x86_family(); - - switch (vendor) { - case X86_VENDOR_INTEL: - if (family >= 6) - reload_ucode_intel(); - break; - case X86_VENDOR_AMD: - if (family >= 0x10) - reload_ucode_amd(); - break; - default: - break; - } -} diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 969dc17eb1b4..ee81c544ee0d 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -4,27 +4,800 @@ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> * 2006 Shaohua Li <shaohua.li@intel.com> * + * Intel CPU microcode early update for Linux + * + * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + * H Peter Anvin" <hpa@zytor.com> + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +/* + * This needs to be before all headers so that pr_debug in printk.h doesn't turn + * printk calls into no_printk(). + * + *#define DEBUG + */ +#define pr_fmt(fmt) "microcode: " fmt +#include <linux/earlycpio.h> #include <linux/firmware.h> #include <linux/uaccess.h> -#include <linux/kernel.h> -#include <linux/module.h> #include <linux/vmalloc.h> +#include <linux/initrd.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/cpu.h> +#include <linux/mm.h> #include <asm/microcode_intel.h> #include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/setup.h> #include <asm/msr.h> -MODULE_DESCRIPTION("Microcode Update Driver"); -MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); -MODULE_LICENSE("GPL"); +static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; +static struct mc_saved_data { + unsigned int mc_saved_count; + struct microcode_intel **mc_saved; +} mc_saved_data; + +static enum ucode_state +load_microcode_early(struct microcode_intel **saved, + unsigned int num_saved, struct ucode_cpu_info *uci) +{ + struct microcode_intel *ucode_ptr, *new_mc = NULL; + struct microcode_header_intel *mc_hdr; + int new_rev, ret, i; + + new_rev = uci->cpu_sig.rev; + + for (i = 0; i < num_saved; i++) { + ucode_ptr = saved[i]; + mc_hdr = (struct microcode_header_intel *)ucode_ptr; + + ret = has_newer_microcode(ucode_ptr, + uci->cpu_sig.sig, + uci->cpu_sig.pf, + new_rev); + if (!ret) + continue; + + new_rev = mc_hdr->rev; + new_mc = ucode_ptr; + } + + if (!new_mc) + return UCODE_NFOUND; + + uci->mc = (struct microcode_intel *)new_mc; + return UCODE_OK; +} + +static inline void +copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, + unsigned long off, int num_saved) +{ + int i; + + for (i = 0; i < num_saved; i++) + mc_saved[i] = (struct microcode_intel *)(initrd[i] + off); +} + +#ifdef CONFIG_X86_32 +static void +microcode_phys(struct microcode_intel **mc_saved_tmp, + struct mc_saved_data *mc_saved_data) +{ + int i; + struct microcode_intel ***mc_saved; + + mc_saved = (struct microcode_intel ***) + __pa_nodebug(&mc_saved_data->mc_saved); + for (i = 0; i < mc_saved_data->mc_saved_count; i++) { + struct microcode_intel *p; + + p = *(struct microcode_intel **) + __pa_nodebug(mc_saved_data->mc_saved + i); + mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); + } +} +#endif + +static enum ucode_state +load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, + unsigned long initrd_start, struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int count = mc_saved_data->mc_saved_count; + + if (!mc_saved_data->mc_saved) { + copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); + + return load_microcode_early(mc_saved_tmp, count, uci); + } else { +#ifdef CONFIG_X86_32 + microcode_phys(mc_saved_tmp, mc_saved_data); + return load_microcode_early(mc_saved_tmp, count, uci); +#else + return load_microcode_early(mc_saved_data->mc_saved, + count, uci); +#endif + } +} + +/* + * Given CPU signature and a microcode patch, this function finds if the + * microcode patch has matching family and model with the CPU. + */ +static enum ucode_state +matching_model_microcode(struct microcode_header_intel *mc_header, + unsigned long sig) +{ + unsigned int fam, model; + unsigned int fam_ucode, model_ucode; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + unsigned long data_size = get_datasize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + + fam = x86_family(sig); + model = x86_model(sig); + + fam_ucode = x86_family(mc_header->sig); + model_ucode = x86_model(mc_header->sig); + + if (fam == fam_ucode && model == model_ucode) + return UCODE_OK; + + /* Look for ext. headers: */ + if (total_size <= data_size + MC_HEADER_SIZE) + return UCODE_NFOUND; + + ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + ext_sigcount = ext_header->count; + + for (i = 0; i < ext_sigcount; i++) { + fam_ucode = x86_family(ext_sig->sig); + model_ucode = x86_model(ext_sig->sig); + + if (fam == fam_ucode && model == model_ucode) + return UCODE_OK; + + ext_sig++; + } + return UCODE_NFOUND; +} + +static int +save_microcode(struct mc_saved_data *mc_saved_data, + struct microcode_intel **mc_saved_src, + unsigned int mc_saved_count) +{ + int i, j; + struct microcode_intel **saved_ptr; + int ret; + + if (!mc_saved_count) + return -EINVAL; + + /* + * Copy new microcode data. + */ + saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL); + if (!saved_ptr) + return -ENOMEM; + + for (i = 0; i < mc_saved_count; i++) { + struct microcode_header_intel *mc_hdr; + struct microcode_intel *mc; + unsigned long size; + + if (!mc_saved_src[i]) { + ret = -EINVAL; + goto err; + } + + mc = mc_saved_src[i]; + mc_hdr = &mc->hdr; + size = get_totalsize(mc_hdr); + + saved_ptr[i] = kmalloc(size, GFP_KERNEL); + if (!saved_ptr[i]) { + ret = -ENOMEM; + goto err; + } + + memcpy(saved_ptr[i], mc, size); + } + + /* + * Point to newly saved microcode. + */ + mc_saved_data->mc_saved = saved_ptr; + mc_saved_data->mc_saved_count = mc_saved_count; + + return 0; + +err: + for (j = 0; j <= i; j++) + kfree(saved_ptr[j]); + kfree(saved_ptr); + + return ret; +} + +/* + * A microcode patch in ucode_ptr is saved into mc_saved + * - if it has matching signature and newer revision compared to an existing + * patch mc_saved. + * - or if it is a newly discovered microcode patch. + * + * The microcode patch should have matching model with CPU. + * + * Returns: The updated number @num_saved of saved microcode patches. + */ +static unsigned int _save_mc(struct microcode_intel **mc_saved, + u8 *ucode_ptr, unsigned int num_saved) +{ + struct microcode_header_intel *mc_hdr, *mc_saved_hdr; + unsigned int sig, pf; + int found = 0, i; + + mc_hdr = (struct microcode_header_intel *)ucode_ptr; + + for (i = 0; i < num_saved; i++) { + mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i]; + sig = mc_saved_hdr->sig; + pf = mc_saved_hdr->pf; + + if (!find_matching_signature(ucode_ptr, sig, pf)) + continue; + + found = 1; + + if (mc_hdr->rev <= mc_saved_hdr->rev) + continue; + + /* + * Found an older ucode saved earlier. Replace it with + * this newer one. + */ + mc_saved[i] = (struct microcode_intel *)ucode_ptr; + break; + } + + /* Newly detected microcode, save it to memory. */ + if (i >= num_saved && !found) + mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr; + + return num_saved; +} + +/* + * Get microcode matching with BSP's model. Only CPUs with the same model as + * BSP can stay in the platform. + */ +static enum ucode_state __init +get_matching_model_microcode(int cpu, unsigned long start, + void *data, size_t size, + struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + struct ucode_cpu_info *uci) +{ + u8 *ucode_ptr = data; + unsigned int leftover = size; + enum ucode_state state = UCODE_OK; + unsigned int mc_size; + struct microcode_header_intel *mc_header; + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int mc_saved_count = mc_saved_data->mc_saved_count; + int i; + + while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { + + if (leftover < sizeof(mc_header)) + break; + + mc_header = (struct microcode_header_intel *)ucode_ptr; + + mc_size = get_totalsize(mc_header); + if (!mc_size || mc_size > leftover || + microcode_sanity_check(ucode_ptr, 0) < 0) + break; + + leftover -= mc_size; + + /* + * Since APs with same family and model as the BSP may boot in + * the platform, we need to find and save microcode patches + * with the same family and model as the BSP. + */ + if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != + UCODE_OK) { + ucode_ptr += mc_size; + continue; + } + + mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count); + + ucode_ptr += mc_size; + } + + if (leftover) { + state = UCODE_ERROR; + goto out; + } + + if (mc_saved_count == 0) { + state = UCODE_NFOUND; + goto out; + } + + for (i = 0; i < mc_saved_count; i++) + mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; + + mc_saved_data->mc_saved_count = mc_saved_count; +out: + return state; +} + +static int collect_cpu_info_early(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + unsigned int family, model; + struct cpu_signature csig; + unsigned int eax, ebx, ecx, edx; + + csig.sig = 0; + csig.pf = 0; + csig.rev = 0; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + family = x86_family(csig.sig); + model = x86_model(csig.sig); + + if ((model >= 5) || (family > 6)) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + + csig.rev = val[1]; + + uci->cpu_sig = csig; + uci->valid = 1; + + return 0; +} + +static void show_saved_mc(void) +{ +#ifdef DEBUG + int i, j; + unsigned int sig, pf, rev, total_size, data_size, date; + struct ucode_cpu_info uci; + + if (mc_saved_data.mc_saved_count == 0) { + pr_debug("no microcode data saved.\n"); + return; + } + pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); + + collect_cpu_info_early(&uci); + + sig = uci.cpu_sig.sig; + pf = uci.cpu_sig.pf; + rev = uci.cpu_sig.rev; + pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); + + for (i = 0; i < mc_saved_data.mc_saved_count; i++) { + struct microcode_header_intel *mc_saved_header; + struct extended_sigtable *ext_header; + int ext_sigcount; + struct extended_signature *ext_sig; + + mc_saved_header = (struct microcode_header_intel *) + mc_saved_data.mc_saved[i]; + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + rev = mc_saved_header->rev; + total_size = get_totalsize(mc_saved_header); + data_size = get_datasize(mc_saved_header); + date = mc_saved_header->date; + + pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n", + i, sig, pf, rev, total_size, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); + + /* Look for ext. headers: */ + if (total_size <= data_size + MC_HEADER_SIZE) + continue; + + ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (j = 0; j < ext_sigcount; j++) { + sig = ext_sig->sig; + pf = ext_sig->pf; + + pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n", + j, sig, pf); + + ext_sig++; + } + + } +#endif +} + +#ifdef CONFIG_HOTPLUG_CPU +static DEFINE_MUTEX(x86_cpu_microcode_mutex); +/* + * Save this mc into mc_saved_data. So it will be loaded early when a CPU is + * hot added or resumes. + * + * Please make sure this mc should be a valid microcode patch before calling + * this function. + */ +int save_mc_for_early(u8 *mc) +{ + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int mc_saved_count_init; + unsigned int mc_saved_count; + struct microcode_intel **mc_saved; + int ret = 0; + int i; + + /* + * Hold hotplug lock so mc_saved_data is not accessed by a CPU in + * hotplug. + */ + mutex_lock(&x86_cpu_microcode_mutex); + + mc_saved_count_init = mc_saved_data.mc_saved_count; + mc_saved_count = mc_saved_data.mc_saved_count; + mc_saved = mc_saved_data.mc_saved; + + if (mc_saved && mc_saved_count) + memcpy(mc_saved_tmp, mc_saved, + mc_saved_count * sizeof(struct microcode_intel *)); + /* + * Save the microcode patch mc in mc_save_tmp structure if it's a newer + * version. + */ + mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count); + + /* + * Save the mc_save_tmp in global mc_saved_data. + */ + ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); + if (ret) { + pr_err("Cannot save microcode patch.\n"); + goto out; + } + + show_saved_mc(); + + /* + * Free old saved microcode data. + */ + if (mc_saved) { + for (i = 0; i < mc_saved_count_init; i++) + kfree(mc_saved[i]); + kfree(mc_saved); + } + +out: + mutex_unlock(&x86_cpu_microcode_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(save_mc_for_early); +#endif + +static bool __init load_builtin_intel_microcode(struct cpio_data *cp) +{ +#ifdef CONFIG_X86_64 + unsigned int eax = 0x00000001, ebx, ecx = 0, edx; + char name[30]; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + sprintf(name, "intel-ucode/%02x-%02x-%02x", + x86_family(eax), x86_model(eax), x86_stepping(eax)); + + return get_builtin_firmware(cp, name); +#else + return false; +#endif +} + +static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; +static __init enum ucode_state +scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, + unsigned long start, unsigned long size, + struct ucode_cpu_info *uci) +{ + struct cpio_data cd; + long offset = 0; +#ifdef CONFIG_X86_32 + char *p = (char *)__pa_nodebug(ucode_name); +#else + char *p = ucode_name; +#endif + + cd.data = NULL; + cd.size = 0; + + cd = find_cpio_data(p, (void *)start, size, &offset); + if (!cd.data) { + if (!load_builtin_intel_microcode(&cd)) + return UCODE_ERROR; + } + + return get_matching_model_microcode(0, start, cd.data, cd.size, + mc_saved_data, initrd, uci); +} + +/* + * Print ucode update info. + */ +static void +print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) +{ + int cpu = smp_processor_id(); + + pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", + cpu, + uci->cpu_sig.rev, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); +} + +#ifdef CONFIG_X86_32 + +static int delay_ucode_info; +static int current_mc_date; + +/* + * Print early updated ucode info after printk works. This is delayed info dump. + */ +void show_ucode_info_early(void) +{ + struct ucode_cpu_info uci; + + if (delay_ucode_info) { + collect_cpu_info_early(&uci); + print_ucode_info(&uci, current_mc_date); + delay_ucode_info = 0; + } +} + +/* + * At this point, we can not call printk() yet. Keep microcode patch number in + * mc_saved_data.mc_saved and delay printing microcode info in + * show_ucode_info_early() until printk() works. + */ +static void print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + int *delay_ucode_info_p; + int *current_mc_date_p; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return; + + delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); + current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); + + *delay_ucode_info_p = 1; + *current_mc_date_p = mc_intel->hdr.date; +} +#else + +/* + * Flush global tlb. We only do this in x86_64 where paging has been enabled + * already and PGE should be enabled as well. + */ +static inline void flush_tlb_early(void) +{ + __native_flush_tlb_global_irq_disabled(); +} + +static inline void print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return; + + print_ucode_info(uci, mc_intel->hdr.date); +} +#endif + +static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) +{ + struct microcode_intel *mc_intel; + unsigned int val[2]; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return 0; + + /* write microcode via MSR 0x79 */ + native_wrmsr(MSR_IA32_UCODE_WRITE, + (unsigned long) mc_intel->bits, + (unsigned long) mc_intel->bits >> 16 >> 16); + native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + if (val[1] != mc_intel->hdr.rev) + return -1; + +#ifdef CONFIG_X86_64 + /* Flush global tlb. This is precaution. */ + flush_tlb_early(); +#endif + uci->cpu_sig.rev = val[1]; + + if (early) + print_ucode(uci); + else + print_ucode_info(uci, mc_intel->hdr.date); + + return 0; +} + +/* + * This function converts microcode patch offsets previously stored in + * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. + */ +int __init save_microcode_in_initrd_intel(void) +{ + unsigned int count = mc_saved_data.mc_saved_count; + struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; + int ret = 0; + + if (count == 0) + return ret; + + copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count); + ret = save_microcode(&mc_saved_data, mc_saved, count); + if (ret) + pr_err("Cannot save microcode patches from initrd.\n"); + + show_saved_mc(); + + return ret; +} + +static void __init +_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, + unsigned long *initrd, + unsigned long start, unsigned long size) +{ + struct ucode_cpu_info uci; + enum ucode_state ret; + + collect_cpu_info_early(&uci); + + ret = scan_microcode(mc_saved_data, initrd, start, size, &uci); + if (ret != UCODE_OK) + return; + + ret = load_microcode(mc_saved_data, initrd, start, &uci); + if (ret != UCODE_OK) + return; + + apply_microcode_early(&uci, true); +} + +void __init load_ucode_intel_bsp(void) +{ + u64 start, size; +#ifdef CONFIG_X86_32 + struct boot_params *p; + + p = (struct boot_params *)__pa_nodebug(&boot_params); + start = p->hdr.ramdisk_image; + size = p->hdr.ramdisk_size; + + _load_ucode_intel_bsp( + (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), + (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), + start, size); +#else + start = boot_params.hdr.ramdisk_image + PAGE_OFFSET; + size = boot_params.hdr.ramdisk_size; + + _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); +#endif +} + +void load_ucode_intel_ap(void) +{ + struct mc_saved_data *mc_saved_data_p; + struct ucode_cpu_info uci; + unsigned long *mc_saved_in_initrd_p; + unsigned long initrd_start_addr; + enum ucode_state ret; +#ifdef CONFIG_X86_32 + unsigned long *initrd_start_p; + + mc_saved_in_initrd_p = + (unsigned long *)__pa_nodebug(mc_saved_in_initrd); + mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); + initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); + initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); +#else + mc_saved_data_p = &mc_saved_data; + mc_saved_in_initrd_p = mc_saved_in_initrd; + initrd_start_addr = initrd_start; +#endif + + /* + * If there is no valid ucode previously saved in memory, no need to + * update ucode on this AP. + */ + if (mc_saved_data_p->mc_saved_count == 0) + return; + + collect_cpu_info_early(&uci); + ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, + initrd_start_addr, &uci); + + if (ret != UCODE_OK) + return; + + apply_microcode_early(&uci, true); +} + +void reload_ucode_intel(void) +{ + struct ucode_cpu_info uci; + enum ucode_state ret; + + if (!mc_saved_data.mc_saved_count) + return; + + collect_cpu_info_early(&uci); + + ret = load_microcode_early(mc_saved_data.mc_saved, + mc_saved_data.mc_saved_count, &uci); + if (ret != UCODE_OK) + return; + + apply_microcode_early(&uci, false); +} static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { @@ -264,7 +1037,7 @@ static struct microcode_ops microcode_intel_ops = { struct microcode_ops * __init init_intel_microcode(void) { - struct cpuinfo_x86 *c = &cpu_data(0); + struct cpuinfo_x86 *c = &boot_cpu_data; if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || cpu_has(c, X86_FEATURE_IA64)) { diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c deleted file mode 100644 index 37ea89c11520..000000000000 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ /dev/null @@ -1,808 +0,0 @@ -/* - * Intel CPU microcode early update for Linux - * - * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> - * H Peter Anvin" <hpa@zytor.com> - * - * This allows to early upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture - * Software Developer's Manual. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -/* - * This needs to be before all headers so that pr_debug in printk.h doesn't turn - * printk calls into no_printk(). - * - *#define DEBUG - */ - -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/earlycpio.h> -#include <linux/initrd.h> -#include <linux/cpu.h> -#include <asm/msr.h> -#include <asm/microcode_intel.h> -#include <asm/processor.h> -#include <asm/tlbflush.h> -#include <asm/setup.h> - -#undef pr_fmt -#define pr_fmt(fmt) "microcode: " fmt - -static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; -static struct mc_saved_data { - unsigned int mc_saved_count; - struct microcode_intel **mc_saved; -} mc_saved_data; - -static enum ucode_state -load_microcode_early(struct microcode_intel **saved, - unsigned int num_saved, struct ucode_cpu_info *uci) -{ - struct microcode_intel *ucode_ptr, *new_mc = NULL; - struct microcode_header_intel *mc_hdr; - int new_rev, ret, i; - - new_rev = uci->cpu_sig.rev; - - for (i = 0; i < num_saved; i++) { - ucode_ptr = saved[i]; - mc_hdr = (struct microcode_header_intel *)ucode_ptr; - - ret = has_newer_microcode(ucode_ptr, - uci->cpu_sig.sig, - uci->cpu_sig.pf, - new_rev); - if (!ret) - continue; - - new_rev = mc_hdr->rev; - new_mc = ucode_ptr; - } - - if (!new_mc) - return UCODE_NFOUND; - - uci->mc = (struct microcode_intel *)new_mc; - return UCODE_OK; -} - -static inline void -copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, - unsigned long off, int num_saved) -{ - int i; - - for (i = 0; i < num_saved; i++) - mc_saved[i] = (struct microcode_intel *)(initrd[i] + off); -} - -#ifdef CONFIG_X86_32 -static void -microcode_phys(struct microcode_intel **mc_saved_tmp, - struct mc_saved_data *mc_saved_data) -{ - int i; - struct microcode_intel ***mc_saved; - - mc_saved = (struct microcode_intel ***) - __pa_nodebug(&mc_saved_data->mc_saved); - for (i = 0; i < mc_saved_data->mc_saved_count; i++) { - struct microcode_intel *p; - - p = *(struct microcode_intel **) - __pa_nodebug(mc_saved_data->mc_saved + i); - mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); - } -} -#endif - -static enum ucode_state -load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, - unsigned long initrd_start, struct ucode_cpu_info *uci) -{ - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int count = mc_saved_data->mc_saved_count; - - if (!mc_saved_data->mc_saved) { - copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); - - return load_microcode_early(mc_saved_tmp, count, uci); - } else { -#ifdef CONFIG_X86_32 - microcode_phys(mc_saved_tmp, mc_saved_data); - return load_microcode_early(mc_saved_tmp, count, uci); -#else - return load_microcode_early(mc_saved_data->mc_saved, - count, uci); -#endif - } -} - -/* - * Given CPU signature and a microcode patch, this function finds if the - * microcode patch has matching family and model with the CPU. - */ -static enum ucode_state -matching_model_microcode(struct microcode_header_intel *mc_header, - unsigned long sig) -{ - unsigned int fam, model; - unsigned int fam_ucode, model_ucode; - struct extended_sigtable *ext_header; - unsigned long total_size = get_totalsize(mc_header); - unsigned long data_size = get_datasize(mc_header); - int ext_sigcount, i; - struct extended_signature *ext_sig; - - fam = __x86_family(sig); - model = x86_model(sig); - - fam_ucode = __x86_family(mc_header->sig); - model_ucode = x86_model(mc_header->sig); - - if (fam == fam_ucode && model == model_ucode) - return UCODE_OK; - - /* Look for ext. headers: */ - if (total_size <= data_size + MC_HEADER_SIZE) - return UCODE_NFOUND; - - ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; - ext_sigcount = ext_header->count; - - for (i = 0; i < ext_sigcount; i++) { - fam_ucode = __x86_family(ext_sig->sig); - model_ucode = x86_model(ext_sig->sig); - - if (fam == fam_ucode && model == model_ucode) - return UCODE_OK; - - ext_sig++; - } - return UCODE_NFOUND; -} - -static int -save_microcode(struct mc_saved_data *mc_saved_data, - struct microcode_intel **mc_saved_src, - unsigned int mc_saved_count) -{ - int i, j; - struct microcode_intel **saved_ptr; - int ret; - - if (!mc_saved_count) - return -EINVAL; - - /* - * Copy new microcode data. - */ - saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL); - if (!saved_ptr) - return -ENOMEM; - - for (i = 0; i < mc_saved_count; i++) { - struct microcode_header_intel *mc_hdr; - struct microcode_intel *mc; - unsigned long size; - - if (!mc_saved_src[i]) { - ret = -EINVAL; - goto err; - } - - mc = mc_saved_src[i]; - mc_hdr = &mc->hdr; - size = get_totalsize(mc_hdr); - - saved_ptr[i] = kmalloc(size, GFP_KERNEL); - if (!saved_ptr[i]) { - ret = -ENOMEM; - goto err; - } - - memcpy(saved_ptr[i], mc, size); - } - - /* - * Point to newly saved microcode. - */ - mc_saved_data->mc_saved = saved_ptr; - mc_saved_data->mc_saved_count = mc_saved_count; - - return 0; - -err: - for (j = 0; j <= i; j++) - kfree(saved_ptr[j]); - kfree(saved_ptr); - - return ret; -} - -/* - * A microcode patch in ucode_ptr is saved into mc_saved - * - if it has matching signature and newer revision compared to an existing - * patch mc_saved. - * - or if it is a newly discovered microcode patch. - * - * The microcode patch should have matching model with CPU. - * - * Returns: The updated number @num_saved of saved microcode patches. - */ -static unsigned int _save_mc(struct microcode_intel **mc_saved, - u8 *ucode_ptr, unsigned int num_saved) -{ - struct microcode_header_intel *mc_hdr, *mc_saved_hdr; - unsigned int sig, pf; - int found = 0, i; - - mc_hdr = (struct microcode_header_intel *)ucode_ptr; - - for (i = 0; i < num_saved; i++) { - mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i]; - sig = mc_saved_hdr->sig; - pf = mc_saved_hdr->pf; - - if (!find_matching_signature(ucode_ptr, sig, pf)) - continue; - - found = 1; - - if (mc_hdr->rev <= mc_saved_hdr->rev) - continue; - - /* - * Found an older ucode saved earlier. Replace it with - * this newer one. - */ - mc_saved[i] = (struct microcode_intel *)ucode_ptr; - break; - } - - /* Newly detected microcode, save it to memory. */ - if (i >= num_saved && !found) - mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr; - - return num_saved; -} - -/* - * Get microcode matching with BSP's model. Only CPUs with the same model as - * BSP can stay in the platform. - */ -static enum ucode_state __init -get_matching_model_microcode(int cpu, unsigned long start, - void *data, size_t size, - struct mc_saved_data *mc_saved_data, - unsigned long *mc_saved_in_initrd, - struct ucode_cpu_info *uci) -{ - u8 *ucode_ptr = data; - unsigned int leftover = size; - enum ucode_state state = UCODE_OK; - unsigned int mc_size; - struct microcode_header_intel *mc_header; - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count = mc_saved_data->mc_saved_count; - int i; - - while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { - - if (leftover < sizeof(mc_header)) - break; - - mc_header = (struct microcode_header_intel *)ucode_ptr; - - mc_size = get_totalsize(mc_header); - if (!mc_size || mc_size > leftover || - microcode_sanity_check(ucode_ptr, 0) < 0) - break; - - leftover -= mc_size; - - /* - * Since APs with same family and model as the BSP may boot in - * the platform, we need to find and save microcode patches - * with the same family and model as the BSP. - */ - if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != - UCODE_OK) { - ucode_ptr += mc_size; - continue; - } - - mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count); - - ucode_ptr += mc_size; - } - - if (leftover) { - state = UCODE_ERROR; - goto out; - } - - if (mc_saved_count == 0) { - state = UCODE_NFOUND; - goto out; - } - - for (i = 0; i < mc_saved_count; i++) - mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; - - mc_saved_data->mc_saved_count = mc_saved_count; -out: - return state; -} - -static int collect_cpu_info_early(struct ucode_cpu_info *uci) -{ - unsigned int val[2]; - unsigned int family, model; - struct cpu_signature csig; - unsigned int eax, ebx, ecx, edx; - - csig.sig = 0; - csig.pf = 0; - csig.rev = 0; - - memset(uci, 0, sizeof(*uci)); - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - csig.sig = eax; - - family = __x86_family(csig.sig); - model = x86_model(csig.sig); - - if ((model >= 5) || (family > 6)) { - /* get processor flags from MSR 0x17 */ - native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - csig.pf = 1 << ((val[1] >> 18) & 7); - } - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - sync_core(); - - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - - csig.rev = val[1]; - - uci->cpu_sig = csig; - uci->valid = 1; - - return 0; -} - -#ifdef DEBUG -static void show_saved_mc(void) -{ - int i, j; - unsigned int sig, pf, rev, total_size, data_size, date; - struct ucode_cpu_info uci; - - if (mc_saved_data.mc_saved_count == 0) { - pr_debug("no microcode data saved.\n"); - return; - } - pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); - - collect_cpu_info_early(&uci); - - sig = uci.cpu_sig.sig; - pf = uci.cpu_sig.pf; - rev = uci.cpu_sig.rev; - pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); - - for (i = 0; i < mc_saved_data.mc_saved_count; i++) { - struct microcode_header_intel *mc_saved_header; - struct extended_sigtable *ext_header; - int ext_sigcount; - struct extended_signature *ext_sig; - - mc_saved_header = (struct microcode_header_intel *) - mc_saved_data.mc_saved[i]; - sig = mc_saved_header->sig; - pf = mc_saved_header->pf; - rev = mc_saved_header->rev; - total_size = get_totalsize(mc_saved_header); - data_size = get_datasize(mc_saved_header); - date = mc_saved_header->date; - - pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n", - i, sig, pf, rev, total_size, - date & 0xffff, - date >> 24, - (date >> 16) & 0xff); - - /* Look for ext. headers: */ - if (total_size <= data_size + MC_HEADER_SIZE) - continue; - - ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE; - ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; - - for (j = 0; j < ext_sigcount; j++) { - sig = ext_sig->sig; - pf = ext_sig->pf; - - pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n", - j, sig, pf); - - ext_sig++; - } - - } -} -#else -static inline void show_saved_mc(void) -{ -} -#endif - -#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) -static DEFINE_MUTEX(x86_cpu_microcode_mutex); -/* - * Save this mc into mc_saved_data. So it will be loaded early when a CPU is - * hot added or resumes. - * - * Please make sure this mc should be a valid microcode patch before calling - * this function. - */ -int save_mc_for_early(u8 *mc) -{ - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count_init; - unsigned int mc_saved_count; - struct microcode_intel **mc_saved; - int ret = 0; - int i; - - /* - * Hold hotplug lock so mc_saved_data is not accessed by a CPU in - * hotplug. - */ - mutex_lock(&x86_cpu_microcode_mutex); - - mc_saved_count_init = mc_saved_data.mc_saved_count; - mc_saved_count = mc_saved_data.mc_saved_count; - mc_saved = mc_saved_data.mc_saved; - - if (mc_saved && mc_saved_count) - memcpy(mc_saved_tmp, mc_saved, - mc_saved_count * sizeof(struct microcode_intel *)); - /* - * Save the microcode patch mc in mc_save_tmp structure if it's a newer - * version. - */ - mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count); - - /* - * Save the mc_save_tmp in global mc_saved_data. - */ - ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); - if (ret) { - pr_err("Cannot save microcode patch.\n"); - goto out; - } - - show_saved_mc(); - - /* - * Free old saved microcode data. - */ - if (mc_saved) { - for (i = 0; i < mc_saved_count_init; i++) - kfree(mc_saved[i]); - kfree(mc_saved); - } - -out: - mutex_unlock(&x86_cpu_microcode_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(save_mc_for_early); -#endif - -static bool __init load_builtin_intel_microcode(struct cpio_data *cp) -{ -#ifdef CONFIG_X86_64 - unsigned int eax = 0x00000001, ebx, ecx = 0, edx; - unsigned int family, model, stepping; - char name[30]; - - native_cpuid(&eax, &ebx, &ecx, &edx); - - family = __x86_family(eax); - model = x86_model(eax); - stepping = eax & 0xf; - - sprintf(name, "intel-ucode/%02x-%02x-%02x", family, model, stepping); - - return get_builtin_firmware(cp, name); -#else - return false; -#endif -} - -static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; -static __init enum ucode_state -scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, - unsigned long start, unsigned long size, - struct ucode_cpu_info *uci) -{ - struct cpio_data cd; - long offset = 0; -#ifdef CONFIG_X86_32 - char *p = (char *)__pa_nodebug(ucode_name); -#else - char *p = ucode_name; -#endif - - cd.data = NULL; - cd.size = 0; - - cd = find_cpio_data(p, (void *)start, size, &offset); - if (!cd.data) { - if (!load_builtin_intel_microcode(&cd)) - return UCODE_ERROR; - } - - return get_matching_model_microcode(0, start, cd.data, cd.size, - mc_saved_data, initrd, uci); -} - -/* - * Print ucode update info. - */ -static void -print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) -{ - int cpu = smp_processor_id(); - - pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", - cpu, - uci->cpu_sig.rev, - date & 0xffff, - date >> 24, - (date >> 16) & 0xff); -} - -#ifdef CONFIG_X86_32 - -static int delay_ucode_info; -static int current_mc_date; - -/* - * Print early updated ucode info after printk works. This is delayed info dump. - */ -void show_ucode_info_early(void) -{ - struct ucode_cpu_info uci; - - if (delay_ucode_info) { - collect_cpu_info_early(&uci); - print_ucode_info(&uci, current_mc_date); - delay_ucode_info = 0; - } -} - -/* - * At this point, we can not call printk() yet. Keep microcode patch number in - * mc_saved_data.mc_saved and delay printing microcode info in - * show_ucode_info_early() until printk() works. - */ -static void print_ucode(struct ucode_cpu_info *uci) -{ - struct microcode_intel *mc_intel; - int *delay_ucode_info_p; - int *current_mc_date_p; - - mc_intel = uci->mc; - if (mc_intel == NULL) - return; - - delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); - current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); - - *delay_ucode_info_p = 1; - *current_mc_date_p = mc_intel->hdr.date; -} -#else - -/* - * Flush global tlb. We only do this in x86_64 where paging has been enabled - * already and PGE should be enabled as well. - */ -static inline void flush_tlb_early(void) -{ - __native_flush_tlb_global_irq_disabled(); -} - -static inline void print_ucode(struct ucode_cpu_info *uci) -{ - struct microcode_intel *mc_intel; - - mc_intel = uci->mc; - if (mc_intel == NULL) - return; - - print_ucode_info(uci, mc_intel->hdr.date); -} -#endif - -static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) -{ - struct microcode_intel *mc_intel; - unsigned int val[2]; - - mc_intel = uci->mc; - if (mc_intel == NULL) - return 0; - - /* write microcode via MSR 0x79 */ - native_wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc_intel->bits, - (unsigned long) mc_intel->bits >> 16 >> 16); - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - sync_core(); - - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc_intel->hdr.rev) - return -1; - -#ifdef CONFIG_X86_64 - /* Flush global tlb. This is precaution. */ - flush_tlb_early(); -#endif - uci->cpu_sig.rev = val[1]; - - if (early) - print_ucode(uci); - else - print_ucode_info(uci, mc_intel->hdr.date); - - return 0; -} - -/* - * This function converts microcode patch offsets previously stored in - * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. - */ -int __init save_microcode_in_initrd_intel(void) -{ - unsigned int count = mc_saved_data.mc_saved_count; - struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; - int ret = 0; - - if (count == 0) - return ret; - - copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count); - ret = save_microcode(&mc_saved_data, mc_saved, count); - if (ret) - pr_err("Cannot save microcode patches from initrd.\n"); - - show_saved_mc(); - - return ret; -} - -static void __init -_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, - unsigned long *initrd, - unsigned long start, unsigned long size) -{ - struct ucode_cpu_info uci; - enum ucode_state ret; - - collect_cpu_info_early(&uci); - - ret = scan_microcode(mc_saved_data, initrd, start, size, &uci); - if (ret != UCODE_OK) - return; - - ret = load_microcode(mc_saved_data, initrd, start, &uci); - if (ret != UCODE_OK) - return; - - apply_microcode_early(&uci, true); -} - -void __init load_ucode_intel_bsp(void) -{ - u64 start, size; -#ifdef CONFIG_X86_32 - struct boot_params *p; - - p = (struct boot_params *)__pa_nodebug(&boot_params); - start = p->hdr.ramdisk_image; - size = p->hdr.ramdisk_size; - - _load_ucode_intel_bsp( - (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), - start, size); -#else - start = boot_params.hdr.ramdisk_image + PAGE_OFFSET; - size = boot_params.hdr.ramdisk_size; - - _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); -#endif -} - -void load_ucode_intel_ap(void) -{ - struct mc_saved_data *mc_saved_data_p; - struct ucode_cpu_info uci; - unsigned long *mc_saved_in_initrd_p; - unsigned long initrd_start_addr; - enum ucode_state ret; -#ifdef CONFIG_X86_32 - unsigned long *initrd_start_p; - - mc_saved_in_initrd_p = - (unsigned long *)__pa_nodebug(mc_saved_in_initrd); - mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); - initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); -#else - mc_saved_data_p = &mc_saved_data; - mc_saved_in_initrd_p = mc_saved_in_initrd; - initrd_start_addr = initrd_start; -#endif - - /* - * If there is no valid ucode previously saved in memory, no need to - * update ucode on this AP. - */ - if (mc_saved_data_p->mc_saved_count == 0) - return; - - collect_cpu_info_early(&uci); - ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, - initrd_start_addr, &uci); - - if (ret != UCODE_OK) - return; - - apply_microcode_early(&uci, true); -} - -void reload_ucode_intel(void) -{ - struct ucode_cpu_info uci; - enum ucode_state ret; - - if (!mc_saved_data.mc_saved_count) - return; - - collect_cpu_info_early(&uci); - - ret = load_microcode_early(mc_saved_data.mc_saved, - mc_saved_data.mc_saved_count, &uci); - if (ret != UCODE_OK) - return; - - apply_microcode_early(&uci, false); -} diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 1883d252ff7d..b96896bcbdaf 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -25,7 +25,6 @@ #include <linux/firmware.h> #include <linux/uaccess.h> #include <linux/kernel.h> -#include <linux/module.h> #include <asm/microcode_intel.h> #include <asm/processor.h> diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 70d7c93f4550..0d98503c2245 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -593,9 +593,16 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size, unsigned long x_remove_base, unsigned long x_remove_size, int i) { - static struct range range_new[RANGE_NUM]; + /* + * range_new should really be an automatic variable, but + * putting 4096 bytes on the stack is frowned upon, to put it + * mildly. It is safe to make it a static __initdata variable, + * since mtrr_calc_range_state is only called during init and + * there's no way it will call itself recursively. + */ + static struct range range_new[RANGE_NUM] __initdata; unsigned long range_sums_new; - static int nr_range_new; + int nr_range_new; int num_reg; /* Convert ranges to var ranges state: */ diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 3b533cf37c74..c870af161008 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -349,7 +349,7 @@ static void get_fixed_ranges(mtrr_type *frs) void mtrr_save_fixed_ranges(void *info) { - if (cpu_has_mtrr) + if (boot_cpu_has(X86_FEATURE_MTRR)) get_fixed_ranges(mtrr_state.fixed_ranges); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index f891b4750f04..5c3d149ee91c 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -682,7 +682,7 @@ void __init mtrr_bp_init(void) phys_addr = 32; - if (cpu_has_mtrr) { + if (boot_cpu_has(X86_FEATURE_MTRR)) { mtrr_if = &generic_mtrr_ops; size_or_mask = SIZE_OR_MASK_BITS(36); size_and_mask = 0x00f00000; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 66dd3fe99b82..1b443db2db50 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * Copyright (C) 2009 Google, Inc., Stephane Eranian * @@ -482,6 +482,9 @@ int x86_pmu_hw_config(struct perf_event *event) /* Support for IP fixup */ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) precise++; + + if (x86_pmu.pebs_prec_dist) + precise++; } if (event->attr.precise_ip > precise) @@ -1175,7 +1178,7 @@ static int x86_pmu_add(struct perf_event *event, int flags) * skip the schedulability test here, it will be performed * at commit time (->commit_txn) as a whole. */ - if (cpuc->group_flag & PERF_EVENT_TXN) + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto done_collect; ret = x86_pmu.schedule_events(cpuc, n, assign); @@ -1326,7 +1329,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) * XXX assumes any ->del() called during a TXN will only be on * an event added during that same TXN. */ - if (cpuc->group_flag & PERF_EVENT_TXN) + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) return; /* @@ -1531,6 +1534,7 @@ static void __init filter_events(struct attribute **attrs) { struct device_attribute *d; struct perf_pmu_events_attr *pmu_attr; + int offset = 0; int i, j; for (i = 0; attrs[i]; i++) { @@ -1539,7 +1543,7 @@ static void __init filter_events(struct attribute **attrs) /* str trumps id */ if (pmu_attr->event_str) continue; - if (x86_pmu.event_map(i)) + if (x86_pmu.event_map(i + offset)) continue; for (j = i; attrs[j]; j++) @@ -1547,6 +1551,14 @@ static void __init filter_events(struct attribute **attrs) /* Check the shifted attr. */ i--; + + /* + * event_map() is index based, the attrs array is organized + * by increasing event index. If we shift the events, then + * we need to compensate for the event_map(), otherwise + * we are looking up the wrong event in the map + */ + offset++; } } @@ -1748,11 +1760,22 @@ static inline void x86_pmu_read(struct perf_event *event) * Start group events scheduling transaction * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time + * + * We only support PERF_PMU_TXN_ADD transactions. Save the + * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD + * transactions. */ -static void x86_pmu_start_txn(struct pmu *pmu) +static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ + + cpuc->txn_flags = txn_flags; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); - __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN); __this_cpu_write(cpu_hw_events.n_txn, 0); } @@ -1763,7 +1786,16 @@ static void x86_pmu_start_txn(struct pmu *pmu) */ static void x86_pmu_cancel_txn(struct pmu *pmu) { - __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN); + unsigned int txn_flags; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ + + txn_flags = cpuc->txn_flags; + cpuc->txn_flags = 0; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + /* * Truncate collected array by the number of events added in this * transaction. See x86_pmu_add() and x86_pmu_*_txn(). @@ -1786,6 +1818,13 @@ static int x86_pmu_commit_txn(struct pmu *pmu) int assign[X86_PMC_IDX_MAX]; int n, ret; + WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ + + if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { + cpuc->txn_flags = 0; + return 0; + } + n = cpuc->n_events; if (!x86_pmu_initialized()) @@ -1801,7 +1840,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu) */ memcpy(cpuc->assign, assign, n*sizeof(int)); - cpuc->group_flag &= ~PERF_EVENT_TXN; + cpuc->txn_flags = 0; perf_pmu_enable(pmu); return 0; } @@ -2223,12 +2262,19 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) ss_base = get_segment_base(regs->ss); fp = compat_ptr(ss_base + regs->bp); + pagefault_disable(); while (entry->nr < PERF_MAX_STACK_DEPTH) { unsigned long bytes; frame.next_frame = 0; frame.return_address = 0; - bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (!access_ok(VERIFY_READ, fp, 8)) + break; + + bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4); + if (bytes != 0) + break; + bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4); if (bytes != 0) break; @@ -2238,6 +2284,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) perf_callchain_store(entry, cs_base + frame.return_address); fp = compat_ptr(ss_base + frame.next_frame); } + pagefault_enable(); return 1; } #else @@ -2275,12 +2322,19 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) if (perf_callchain_user32(regs, entry)) return; + pagefault_disable(); while (entry->nr < PERF_MAX_STACK_DEPTH) { unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; - bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (!access_ok(VERIFY_READ, fp, 16)) + break; + + bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8); + if (bytes != 0) + break; + bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8); if (bytes != 0) break; @@ -2288,8 +2342,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) break; perf_callchain_store(entry, frame.return_address); - fp = frame.next_frame; + fp = (void __user *)frame.next_frame; } + pagefault_enable(); } /* diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 165be83a7fa4..7bb61e32fb29 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * Copyright (C) 2009 Google, Inc., Stephane Eranian * @@ -14,17 +14,7 @@ #include <linux/perf_event.h> -#if 0 -#undef wrmsrl -#define wrmsrl(msr, val) \ -do { \ - unsigned int _msr = (msr); \ - u64 _val = (val); \ - trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \ - (unsigned long long)(_val)); \ - native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \ -} while (0) -#endif +/* To enable MSR tracing please use the generic trace points. */ /* * | NHM/WSM | SNB | @@ -196,7 +186,7 @@ struct cpu_hw_events { int n_excl; /* the number of exclusive events */ - unsigned int group_flag; + unsigned int txn_flags; int is_fake; /* @@ -318,6 +308,10 @@ struct cpu_hw_events { #define INTEL_UEVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) +/* Constraint on specific umask bit only + event */ +#define INTEL_UBIT_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c)) + /* Like UEVENT_CONSTRAINT, but match flags too */ #define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) @@ -387,7 +381,7 @@ struct cpu_hw_events { /* Check flags and event code/umask, and set the HSW N/A flag */ #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|INTEL_ARCH_EVENT_MASK, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW) @@ -589,7 +583,8 @@ struct x86_pmu { bts_active :1, pebs :1, pebs_active :1, - pebs_broken :1; + pebs_broken :1, + pebs_prec_dist :1; int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; @@ -627,6 +622,7 @@ struct x86_perf_task_context { u64 lbr_from[MAX_LBR_ENTRIES]; u64 lbr_to[MAX_LBR_ENTRIES]; u64 lbr_info[MAX_LBR_ENTRIES]; + int tos; int lbr_callstack_users; int lbr_stack_state; }; @@ -906,6 +902,8 @@ void intel_pmu_lbr_init_hsw(void); void intel_pmu_lbr_init_skl(void); +void intel_pmu_lbr_init_knl(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 1cee5d2d7ece..58610539b048 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -18,7 +18,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ + [ C(RESULT_ACCESS) ] = 0, [ C(RESULT_MISS) ] = 0, }, [ C(OP_PREFETCH) ] = { @@ -160,7 +160,7 @@ static inline int amd_pmu_addr_offset(int index, bool eventsel) if (offset) return offset; - if (!cpu_has_perfctr_core) + if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) offset = index; else offset = index << 1; @@ -652,7 +652,7 @@ static __initconst const struct x86_pmu amd_pmu = { static int __init amd_core_pmu_init(void) { - if (!cpu_has_perfctr_core) + if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) return 0; switch (boot_cpu_data.x86) { diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index cc6cedb8f25d..49742746a6c9 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -523,10 +523,10 @@ static int __init amd_uncore_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) goto fail_nodev; - if (!cpu_has_topoext) + if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) goto fail_nodev; - if (cpu_has_perfctr_nb) { + if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { amd_uncore_nb = alloc_percpu(struct amd_uncore *); if (!amd_uncore_nb) { ret = -ENOMEM; @@ -540,7 +540,7 @@ static int __init amd_uncore_init(void) ret = 0; } - if (cpu_has_perfctr_l2) { + if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) { amd_uncore_l2 = alloc_percpu(struct amd_uncore *); if (!amd_uncore_l2) { ret = -ENOMEM; @@ -583,10 +583,11 @@ fail_online: /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */ amd_uncore_nb = amd_uncore_l2 = NULL; - if (cpu_has_perfctr_l2) + + if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) perf_pmu_unregister(&amd_l2_pmu); fail_l2: - if (cpu_has_perfctr_nb) + if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) perf_pmu_unregister(&amd_nb_pmu); if (amd_uncore_l2) free_percpu(amd_uncore_l2); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index f63360be2238..fed2ab1f1065 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -185,6 +185,14 @@ struct event_constraint intel_skl_event_constraints[] = { EVENT_CONSTRAINT_END }; +static struct extra_reg intel_knl_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x01b7, + MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, + MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1), + EVENT_EXTRA_END +}; + static struct extra_reg intel_snb_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), @@ -232,7 +240,7 @@ static struct event_constraint intel_hsw_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.* */ + INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ @@ -255,7 +263,7 @@ struct event_constraint intel_bdw_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UEVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */ + INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */ EVENT_CONSTRAINT_END }; @@ -1457,6 +1465,42 @@ static __initconst const u64 slm_hw_cache_event_ids }, }; +#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ +#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ +#define KNL_MCDRAM_LOCAL BIT_ULL(21) +#define KNL_MCDRAM_FAR BIT_ULL(22) +#define KNL_DDR_LOCAL BIT_ULL(23) +#define KNL_DDR_FAR BIT_ULL(24) +#define KNL_DRAM_ANY (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \ + KNL_DDR_LOCAL | KNL_DDR_FAR) +#define KNL_L2_READ SLM_DMND_READ +#define KNL_L2_WRITE SLM_DMND_WRITE +#define KNL_L2_PREFETCH SLM_DMND_PREFETCH +#define KNL_L2_ACCESS SLM_LLC_ACCESS +#define KNL_L2_MISS (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \ + KNL_DRAM_ANY | SNB_SNP_ANY | \ + SNB_NON_DRAM) + +static __initconst const u64 knl_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS, + [C(RESULT_MISS)] = 0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS, + [C(RESULT_MISS)] = KNL_L2_WRITE | KNL_L2_MISS, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS, + [C(RESULT_MISS)] = KNL_L2_PREFETCH | KNL_L2_MISS, + }, + }, +}; + /* * Use from PMIs where the LBRs are already disabled. */ @@ -1916,7 +1960,8 @@ intel_bts_constraints(struct perf_event *event) static int intel_alt_er(int idx, u64 config) { - int alt_idx; + int alt_idx = idx; + if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) return idx; @@ -2475,6 +2520,44 @@ static void intel_pebs_aliases_snb(struct perf_event *event) } } +static void intel_pebs_aliases_precdist(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use INST_RETIRED.PREC_DIST + * (0x01c0), which is a PEBS capable event, to get the same + * count. + * + * The PREC_DIST event has special support to minimize sample + * shadowing effects. One drawback is that it can be + * only programmed on counter 1, but that seems like an + * acceptable trade off. + */ + u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16); + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_ivb(struct perf_event *event) +{ + if (event->attr.precise_ip < 3) + return intel_pebs_aliases_snb(event); + return intel_pebs_aliases_precdist(event); +} + +static void intel_pebs_aliases_skl(struct perf_event *event) +{ + if (event->attr.precise_ip < 3) + return intel_pebs_aliases_core2(event); + return intel_pebs_aliases_precdist(event); +} + static unsigned long intel_pmu_free_running_flags(struct perf_event *event) { unsigned long flags = x86_pmu.free_running_flags; @@ -2815,14 +2898,12 @@ static void intel_pmu_cpu_starting(int cpu) return; if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) { - void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; - for_each_cpu(i, topology_sibling_cpumask(cpu)) { struct intel_shared_regs *pc; pc = per_cpu(cpu_hw_events, i).shared_regs; if (pc && pc->core_id == core_id) { - *onln = cpuc->shared_regs; + cpuc->kfree_on_online[0] = cpuc->shared_regs; cpuc->shared_regs = pc; break; } @@ -3332,6 +3413,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_gen_event_constraints; x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_core2; pr_cont("Atom events, "); break; @@ -3431,7 +3513,8 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_ivb_event_constraints; x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; - x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; + x86_pmu.pebs_prec_dist = true; if (boot_cpu_data.x86_model == 62) x86_pmu.extra_regs = intel_snbep_extra_regs; else @@ -3464,7 +3547,8 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_hsw_event_constraints; x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; x86_pmu.extra_regs = intel_snbep_extra_regs; - x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; + x86_pmu.pebs_prec_dist = true; /* all extra regs are per-cpu when HT is on */ x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; @@ -3499,7 +3583,8 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_bdw_event_constraints; x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; x86_pmu.extra_regs = intel_snbep_extra_regs; - x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; + x86_pmu.pebs_prec_dist = true; /* all extra regs are per-cpu when HT is on */ x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; @@ -3511,6 +3596,24 @@ __init int intel_pmu_init(void) pr_cont("Broadwell events, "); break; + case 87: /* Knights Landing Xeon Phi */ + memcpy(hw_cache_event_ids, + slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, + knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + intel_pmu_lbr_init_knl(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; + x86_pmu.extra_regs = intel_knl_extra_regs; + + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + pr_cont("Knights Landing events, "); + break; + case 78: /* 14nm Skylake Mobile */ case 94: /* 14nm Skylake Desktop */ x86_pmu.late_ack = true; @@ -3521,7 +3624,8 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_skl_event_constraints; x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints; x86_pmu.extra_regs = intel_skl_extra_regs; - x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + x86_pmu.pebs_aliases = intel_pebs_aliases_skl; + x86_pmu.pebs_prec_dist = true; /* all extra regs are per-cpu when HT is on */ x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c index d1c0f254afbe..2cad71d1b14c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_bts.c +++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c @@ -495,6 +495,19 @@ static int bts_event_init(struct perf_event *event) if (x86_add_exclusive(x86_lbr_exclusive_bts)) return -EBUSY; + /* + * BTS leaks kernel addresses even when CPL0 tracing is + * disabled, so disallow intel_bts driver for unprivileged + * users on paranoid systems since it provides trace data + * to the user in a zero-copy fashion. + * + * Note that the default paranoia setting permits unprivileged + * users to profile the kernel. + */ + if (event->attr.exclude_kernel && perf_paranoid_kernel() && + !capable(CAP_SYS_ADMIN)) + return -EACCES; + ret = x86_reserve_hardware(); if (ret) { x86_del_exclusive(x86_lbr_exclusive_bts); diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 377e8f8ed391..a316ca96f1b6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -298,7 +298,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) { if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target); + return perf_cgroup_from_task(event->hw.target, event->ctx); return event->cgrp; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_cstate.c b/arch/x86/kernel/cpu/perf_event_intel_cstate.c new file mode 100644 index 000000000000..75a38b5a2e26 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_cstate.c @@ -0,0 +1,694 @@ +/* + * perf_event_intel_cstate.c: support cstate residency counters + * + * Copyright (C) 2015, Intel Corp. + * Author: Kan Liang (kan.liang@intel.com) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + */ + +/* + * This file export cstate related free running (read-only) counters + * for perf. These counters may be use simultaneously by other tools, + * such as turbostat. However, it still make sense to implement them + * in perf. Because we can conveniently collect them together with + * other events, and allow to use them from tools without special MSR + * access code. + * + * The events only support system-wide mode counting. There is no + * sampling support because it is not supported by the hardware. + * + * According to counters' scope and category, two PMUs are registered + * with the perf_event core subsystem. + * - 'cstate_core': The counter is available for each physical core. + * The counters include CORE_C*_RESIDENCY. + * - 'cstate_pkg': The counter is available for each physical package. + * The counters include PKG_C*_RESIDENCY. + * + * All of these counters are specified in the Intel® 64 and IA-32 + * Architectures Software Developer.s Manual Vol3b. + * + * Model specific counters: + * MSR_CORE_C1_RES: CORE C1 Residency Counter + * perf code: 0x00 + * Available model: SLM,AMT + * Scope: Core (each processor core has a MSR) + * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter + * perf code: 0x01 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter + * perf code: 0x02 + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter + * perf code: 0x03 + * Available model: SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. + * perf code: 0x00 + * Available model: SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. + * perf code: 0x01 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. + * perf code: 0x02 + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. + * perf code: 0x03 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. + * perf code: 0x04 + * Available model: HSW ULT only + * Scope: Package (physical package) + * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. + * perf code: 0x05 + * Available model: HSW ULT only + * Scope: Package (physical package) + * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. + * perf code: 0x06 + * Available model: HSW ULT only + * Scope: Package (physical package) + * + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/perf_event.h> +#include <asm/cpu_device_id.h> +#include "perf_event.h" + +#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __cstate_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __cstate_##_var##_show, NULL) + +static ssize_t cstate_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf); + +struct perf_cstate_msr { + u64 msr; + struct perf_pmu_events_attr *attr; + bool (*test)(int idx); +}; + + +/* cstate_core PMU */ + +static struct pmu cstate_core_pmu; +static bool has_cstate_core; + +enum perf_cstate_core_id { + /* + * cstate_core events + */ + PERF_CSTATE_CORE_C1_RES = 0, + PERF_CSTATE_CORE_C3_RES, + PERF_CSTATE_CORE_C6_RES, + PERF_CSTATE_CORE_C7_RES, + + PERF_CSTATE_CORE_EVENT_MAX, +}; + +bool test_core(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 69: /* 22nm Haswell ULT */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES || + idx == PERF_CSTATE_CORE_C7_RES) + return true; + break; + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_CSTATE_CORE_C1_RES || + idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + } + + return false; +} + +PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00"); +PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01"); +PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02"); +PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03"); + +static struct perf_cstate_msr core_msr[] = { + [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1, test_core, }, + [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3, test_core, }, + [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6, test_core, }, + [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7, test_core, }, +}; + +static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = { + NULL, +}; + +static struct attribute_group core_events_attr_group = { + .name = "events", + .attrs = core_events_attrs, +}; + +DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63"); +static struct attribute *core_format_attrs[] = { + &format_attr_core_event.attr, + NULL, +}; + +static struct attribute_group core_format_attr_group = { + .name = "format", + .attrs = core_format_attrs, +}; + +static cpumask_t cstate_core_cpu_mask; +static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL); + +static struct attribute *cstate_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group cpumask_attr_group = { + .attrs = cstate_cpumask_attrs, +}; + +static const struct attribute_group *core_attr_groups[] = { + &core_events_attr_group, + &core_format_attr_group, + &cpumask_attr_group, + NULL, +}; + +/* cstate_core PMU end */ + + +/* cstate_pkg PMU */ + +static struct pmu cstate_pkg_pmu; +static bool has_cstate_pkg; + +enum perf_cstate_pkg_id { + /* + * cstate_pkg events + */ + PERF_CSTATE_PKG_C2_RES = 0, + PERF_CSTATE_PKG_C3_RES, + PERF_CSTATE_PKG_C6_RES, + PERF_CSTATE_PKG_C7_RES, + PERF_CSTATE_PKG_C8_RES, + PERF_CSTATE_PKG_C9_RES, + PERF_CSTATE_PKG_C10_RES, + + PERF_CSTATE_PKG_EVENT_MAX, +}; + +bool test_pkg(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES || + idx == PERF_CSTATE_CORE_C7_RES) + return true; + break; + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_CSTATE_PKG_C2_RES || + idx == PERF_CSTATE_PKG_C3_RES || + idx == PERF_CSTATE_PKG_C6_RES || + idx == PERF_CSTATE_PKG_C7_RES) + return true; + break; + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + case 69: /* 22nm Haswell ULT */ + if (idx == PERF_CSTATE_PKG_C2_RES || + idx == PERF_CSTATE_PKG_C3_RES || + idx == PERF_CSTATE_PKG_C6_RES || + idx == PERF_CSTATE_PKG_C7_RES || + idx == PERF_CSTATE_PKG_C8_RES || + idx == PERF_CSTATE_PKG_C9_RES || + idx == PERF_CSTATE_PKG_C10_RES) + return true; + break; + } + + return false; +} + +PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00"); +PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01"); +PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02"); +PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03"); +PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04"); +PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05"); +PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06"); + +static struct perf_cstate_msr pkg_msr[] = { + [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2, test_pkg, }, + [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3, test_pkg, }, + [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6, test_pkg, }, + [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7, test_pkg, }, + [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8, test_pkg, }, + [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9, test_pkg, }, + [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10, test_pkg, }, +}; + +static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = { + NULL, +}; + +static struct attribute_group pkg_events_attr_group = { + .name = "events", + .attrs = pkg_events_attrs, +}; + +DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63"); +static struct attribute *pkg_format_attrs[] = { + &format_attr_pkg_event.attr, + NULL, +}; +static struct attribute_group pkg_format_attr_group = { + .name = "format", + .attrs = pkg_format_attrs, +}; + +static cpumask_t cstate_pkg_cpu_mask; + +static const struct attribute_group *pkg_attr_groups[] = { + &pkg_events_attr_group, + &pkg_format_attr_group, + &cpumask_attr_group, + NULL, +}; + +/* cstate_pkg PMU end*/ + +static ssize_t cstate_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + if (pmu == &cstate_core_pmu) + return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask); + else if (pmu == &cstate_pkg_pmu) + return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask); + else + return 0; +} + +static int cstate_pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config; + int ret = 0; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + if (event->pmu == &cstate_core_pmu) { + if (cfg >= PERF_CSTATE_CORE_EVENT_MAX) + return -EINVAL; + if (!core_msr[cfg].attr) + return -EINVAL; + event->hw.event_base = core_msr[cfg].msr; + } else if (event->pmu == &cstate_pkg_pmu) { + if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) + return -EINVAL; + if (!pkg_msr[cfg].attr) + return -EINVAL; + event->hw.event_base = pkg_msr[cfg].msr; + } else + return -ENOENT; + + /* must be done before validate_group */ + event->hw.config = cfg; + event->hw.idx = -1; + + return ret; +} + +static inline u64 cstate_pmu_read_counter(struct perf_event *event) +{ + u64 val; + + rdmsrl(event->hw.event_base, val); + return val; +} + +static void cstate_pmu_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_raw_count, new_raw_count; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + new_raw_count = cstate_pmu_read_counter(event); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + local64_add(new_raw_count - prev_raw_count, &event->count); +} + +static void cstate_pmu_event_start(struct perf_event *event, int mode) +{ + local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event)); +} + +static void cstate_pmu_event_stop(struct perf_event *event, int mode) +{ + cstate_pmu_event_update(event); +} + +static void cstate_pmu_event_del(struct perf_event *event, int mode) +{ + cstate_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int cstate_pmu_event_add(struct perf_event *event, int mode) +{ + if (mode & PERF_EF_START) + cstate_pmu_event_start(event, mode); + + return 0; +} + +static void cstate_cpu_exit(int cpu) +{ + int i, id, target; + + /* cpu exit for cstate core */ + if (has_cstate_core) { + id = topology_core_id(cpu); + target = -1; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (id == topology_core_id(i)) { + target = i; + break; + } + } + if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &cstate_core_cpu_mask); + WARN_ON(cpumask_empty(&cstate_core_cpu_mask)); + if (target >= 0) + perf_pmu_migrate_context(&cstate_core_pmu, cpu, target); + } + + /* cpu exit for cstate pkg */ + if (has_cstate_pkg) { + id = topology_physical_package_id(cpu); + target = -1; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (id == topology_physical_package_id(i)) { + target = i; + break; + } + } + if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &cstate_pkg_cpu_mask); + WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask)); + if (target >= 0) + perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); + } +} + +static void cstate_cpu_init(int cpu) +{ + int i, id; + + /* cpu init for cstate core */ + if (has_cstate_core) { + id = topology_core_id(cpu); + for_each_cpu(i, &cstate_core_cpu_mask) { + if (id == topology_core_id(i)) + break; + } + if (i >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_core_cpu_mask); + } + + /* cpu init for cstate pkg */ + if (has_cstate_pkg) { + id = topology_physical_package_id(cpu); + for_each_cpu(i, &cstate_pkg_cpu_mask) { + if (id == topology_physical_package_id(i)) + break; + } + if (i >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); + } +} + +static int cstate_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + break; + case CPU_STARTING: + cstate_cpu_init(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + break; + case CPU_ONLINE: + case CPU_DEAD: + break; + case CPU_DOWN_PREPARE: + cstate_cpu_exit(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +/* + * Probe the cstate events and insert the available one into sysfs attrs + * Return false if there is no available events. + */ +static bool cstate_probe_msr(struct perf_cstate_msr *msr, + struct attribute **events_attrs, + int max_event_nr) +{ + int i, j = 0; + u64 val; + + /* Probe the cstate events. */ + for (i = 0; i < max_event_nr; i++) { + if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) + msr[i].attr = NULL; + } + + /* List remaining events in the sysfs attrs. */ + for (i = 0; i < max_event_nr; i++) { + if (msr[i].attr) + events_attrs[j++] = &msr[i].attr->attr.attr; + } + events_attrs[j] = NULL; + + return (j > 0) ? true : false; +} + +static int __init cstate_init(void) +{ + /* SLM has different MSR for PKG C6 */ + switch (boot_cpu_data.x86_model) { + case 55: + case 76: + case 77: + pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY; + } + + if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX)) + has_cstate_core = true; + + if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX)) + has_cstate_pkg = true; + + return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV; +} + +static void __init cstate_cpumask_init(void) +{ + int cpu; + + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) + cstate_cpu_init(cpu); + + __perf_cpu_notifier(cstate_cpu_notifier); + + cpu_notifier_register_done(); +} + +static struct pmu cstate_core_pmu = { + .attr_groups = core_attr_groups, + .name = "cstate_core", + .task_ctx_nr = perf_invalid_context, + .event_init = cstate_pmu_event_init, + .add = cstate_pmu_event_add, /* must have */ + .del = cstate_pmu_event_del, /* must have */ + .start = cstate_pmu_event_start, + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static struct pmu cstate_pkg_pmu = { + .attr_groups = pkg_attr_groups, + .name = "cstate_pkg", + .task_ctx_nr = perf_invalid_context, + .event_init = cstate_pmu_event_init, + .add = cstate_pmu_event_add, /* must have */ + .del = cstate_pmu_event_del, /* must have */ + .start = cstate_pmu_event_start, + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static void __init cstate_pmus_register(void) +{ + int err; + + if (has_cstate_core) { + err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); + if (WARN_ON(err)) + pr_info("Failed to register PMU %s error %d\n", + cstate_core_pmu.name, err); + } + + if (has_cstate_pkg) { + err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1); + if (WARN_ON(err)) + pr_info("Failed to register PMU %s error %d\n", + cstate_pkg_pmu.name, err); + } +} + +static int __init cstate_pmu_init(void) +{ + int err; + + if (cpu_has_hypervisor) + return -ENODEV; + + err = cstate_init(); + if (err) + return err; + + cstate_cpumask_init(); + + cstate_pmus_register(); + + return 0; +} + +device_initcall(cstate_pmu_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 84f236ab96b0..10602f0a438f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -510,10 +510,11 @@ int intel_pmu_drain_bts_buffer(void) u64 flags; }; struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; - struct bts_record *at, *top; + struct bts_record *at, *base, *top; struct perf_output_handle handle; struct perf_event_header header; struct perf_sample_data data; + unsigned long skip = 0; struct pt_regs regs; if (!event) @@ -522,10 +523,10 @@ int intel_pmu_drain_bts_buffer(void) if (!x86_pmu.bts_active) return 0; - at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; - top = (struct bts_record *)(unsigned long)ds->bts_index; + base = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; - if (top <= at) + if (top <= base) return 0; memset(®s, 0, sizeof(regs)); @@ -535,16 +536,43 @@ int intel_pmu_drain_bts_buffer(void) perf_sample_data_init(&data, 0, event->hw.last_period); /* + * BTS leaks kernel addresses in branches across the cpl boundary, + * such as traps or system calls, so unless the user is asking for + * kernel tracing (and right now it's not possible), we'd need to + * filter them out. But first we need to count how many of those we + * have in the current batch. This is an extra O(n) pass, however, + * it's much faster than the other one especially considering that + * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the + * alloc_bts_buffer()). + */ + for (at = base; at < top; at++) { + /* + * Note that right now *this* BTS code only works if + * attr::exclude_kernel is set, but let's keep this extra + * check here in case that changes. + */ + if (event->attr.exclude_kernel && + (kernel_ip(at->from) || kernel_ip(at->to))) + skip++; + } + + /* * Prepare a generic sample, i.e. fill in the invariant fields. * We will overwrite the from and to address before we output * the sample. */ perf_prepare_sample(&header, &data, event, ®s); - if (perf_output_begin(&handle, event, header.size * (top - at))) + if (perf_output_begin(&handle, event, header.size * + (top - base - skip))) return 1; - for (; at < top; at++) { + for (at = base; at < top; at++) { + /* Filter out any records that contain kernel addresses. */ + if (event->attr.exclude_kernel && + (kernel_ip(at->from) || kernel_ip(at->to))) + continue; + data.ip = at->from; data.addr = at->to; @@ -592,6 +620,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = { INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), EVENT_CONSTRAINT_END }; @@ -658,6 +688,8 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = { INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ @@ -672,6 +704,8 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ @@ -690,9 +724,10 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = { struct event_constraint intel_skl_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ - /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), + /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ @@ -1073,6 +1108,13 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit) void *at; u64 pebs_status; + /* + * fmt0 does not have a status bitfield (does not use + * perf_record_nhm format) + */ + if (x86_pmu.intel_cap.pebs_format < 1) + return base; + if (base == NULL) return NULL; @@ -1158,7 +1200,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) if (!event->attr.precise_ip) return; - n = (top - at) / x86_pmu.pebs_record_size; + n = top - at; if (n <= 0) return; @@ -1202,12 +1244,21 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) pebs_status = p->status & cpuc->pebs_enabled; pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; + /* + * On some CPUs the PEBS status can be zero when PEBS is + * racing with clearing of GLOBAL_STATUS. + * + * Normally we would drop that record, but in the + * case when there is only a single active PEBS event + * we can assume it's for that event. + */ + if (!pebs_status && cpuc->pebs_enabled && + !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1))) + pebs_status = cpuc->pebs_enabled; + bit = find_first_bit((unsigned long *)&pebs_status, x86_pmu.max_pebs_events); - if (WARN(bit >= x86_pmu.max_pebs_events, - "PEBS record without PEBS event! status=%Lx pebs_enabled=%Lx active_mask=%Lx", - (unsigned long long)p->status, (unsigned long long)cpuc->pebs_enabled, - *(unsigned long long *)cpuc->active_mask)) + if (bit >= x86_pmu.max_pebs_events) continue; /* diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index b2c9475b7ff2..653f88d25987 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -42,6 +42,13 @@ static enum { #define LBR_FAR_BIT 8 /* do not capture far branches */ #define LBR_CALL_STACK_BIT 9 /* enable call stack */ +/* + * Following bit only exists in Linux; we mask it out before writing it to + * the actual MSR. But it helps the constraint perf code to understand + * that this is a separate configuration. + */ +#define LBR_NO_INFO_BIT 63 /* don't read LBR_INFO. */ + #define LBR_KERNEL (1 << LBR_KERNEL_BIT) #define LBR_USER (1 << LBR_USER_BIT) #define LBR_JCC (1 << LBR_JCC_BIT) @@ -52,6 +59,7 @@ static enum { #define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) #define LBR_FAR (1 << LBR_FAR_BIT) #define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT) +#define LBR_NO_INFO (1ULL << LBR_NO_INFO_BIT) #define LBR_PLM (LBR_KERNEL | LBR_USER) @@ -151,10 +159,10 @@ static void __intel_pmu_lbr_enable(bool pmi) * No need to reprogram LBR_SELECT in a PMI, as it * did not change. */ - if (cpuc->lbr_sel && !pmi) { - lbr_select = cpuc->lbr_sel->config; + if (cpuc->lbr_sel) + lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; + if (!pmi && cpuc->lbr_sel) wrmsrl(MSR_LBR_SELECT, lbr_select); - } rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); orig_debugctl = debugctl; @@ -239,7 +247,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) } mask = x86_pmu.lbr_nr - 1; - tos = intel_pmu_lbr_tos(); + tos = task_ctx->tos; for (i = 0; i < tos; i++) { lbr_idx = (tos - i) & mask; wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); @@ -247,6 +255,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } + wrmsrl(x86_pmu.lbr_tos, tos); task_ctx->lbr_stack_state = LBR_NONE; } @@ -270,6 +279,7 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } + task_ctx->tos = tos; task_ctx->lbr_stack_state = LBR_VALID; } @@ -420,6 +430,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) */ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { + bool need_info = false; unsigned long mask = x86_pmu.lbr_nr - 1; int lbr_format = x86_pmu.intel_cap.lbr_format; u64 tos = intel_pmu_lbr_tos(); @@ -427,8 +438,11 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) int out = 0; int num = x86_pmu.lbr_nr; - if (cpuc->lbr_sel->config & LBR_CALL_STACK) - num = tos; + if (cpuc->lbr_sel) { + need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO); + if (cpuc->lbr_sel->config & LBR_CALL_STACK) + num = tos; + } for (i = 0; i < num; i++) { unsigned long lbr_idx = (tos - i) & mask; @@ -440,7 +454,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) rdmsrl(x86_pmu.lbr_from + lbr_idx, from); rdmsrl(x86_pmu.lbr_to + lbr_idx, to); - if (lbr_format == LBR_FORMAT_INFO) { + if (lbr_format == LBR_FORMAT_INFO && need_info) { u64 info; rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); @@ -555,6 +569,8 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP) mask |= X86_BR_IND_JMP; + if (br_type & PERF_SAMPLE_BRANCH_CALL) + mask |= X86_BR_CALL | X86_BR_ZERO_CALL; /* * stash actual user request into reg, it may * be used by fixup code for some CPU @@ -586,6 +602,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) if (v != LBR_IGN) mask |= v; } + reg = &event->hw.branch_reg; reg->idx = EXTRA_REG_LBR; @@ -596,6 +613,11 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) */ reg->config = mask ^ x86_pmu.lbr_sel_mask; + if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) && + (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) && + (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)) + reg->config |= LBR_NO_INFO; + return 0; } @@ -890,6 +912,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, }; static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { @@ -905,6 +928,7 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_RETURN | LBR_CALL_STACK, [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, }; /* core */ @@ -1022,3 +1046,17 @@ void __init intel_pmu_lbr_init_atom(void) */ pr_cont("8-deep LBR, "); } + +/* Knights Landing */ +void intel_pmu_lbr_init_knl(void) +{ + x86_pmu.lbr_nr = 8; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = snb_lbr_sel_map; + + pr_cont("8-deep LBR, "); +} diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index 42169283448b..c0bbd1033b7c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -27,6 +27,7 @@ #include <asm/perf_event.h> #include <asm/insn.h> #include <asm/io.h> +#include <asm/intel_pt.h> #include "perf_event.h" #include "intel_pt.h" @@ -139,9 +140,6 @@ static int __init pt_pmu_hw_init(void) long i; attrs = NULL; - ret = -ENODEV; - if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) - goto fail; for (i = 0; i < PT_CPUID_LEAVES; i++) { cpuid_count(20, i, @@ -1125,11 +1123,23 @@ static int pt_event_init(struct perf_event *event) return 0; } +void cpu_emergency_stop_pt(void) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + + if (pt->handle.event) + pt_event_stop(pt->handle.event, PERF_EF_UPDATE); +} + static __init int pt_init(void) { int ret, cpu, prior_warn = 0; BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE); + + if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) + return -ENODEV; + get_online_cpus(); for_each_online_cpu(cpu) { u64 ctl; diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 81431c0f0614..24a351ad628d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -63,7 +63,7 @@ #define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ #define NR_RAPL_DOMAINS 0x4 -static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { +static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { "pp0-core", "package", "dram", @@ -107,19 +107,13 @@ static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ static struct kobj_attribute format_attr_##_var = \ __ATTR(_name, 0444, __rapl_##_var##_show, NULL) -#define RAPL_EVENT_DESC(_name, _config) \ -{ \ - .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \ - .config = _config, \ -} - #define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ -#define RAPL_EVENT_ATTR_STR(_name, v, str) \ -static struct perf_pmu_events_attr event_attr_##v = { \ - .attr = __ATTR(_name, 0444, rapl_sysfs_show, NULL), \ - .id = 0, \ - .event_str = str, \ +#define RAPL_EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ + .id = 0, \ + .event_str = str, \ }; struct rapl_pmu { @@ -411,19 +405,6 @@ static struct attribute_group rapl_pmu_attr_group = { .attrs = rapl_pmu_attrs, }; -static ssize_t rapl_sysfs_show(struct device *dev, - struct device_attribute *attr, - char *page) -{ - struct perf_pmu_events_attr *pmu_attr = \ - container_of(attr, struct perf_pmu_events_attr, attr); - - if (pmu_attr->event_str) - return sprintf(page, "%s", pmu_attr->event_str); - - return 0; -} - RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 560e5255b15e..3bf41d413775 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore; static bool pcidrv_registered; struct pci_driver *uncore_pci_driver; /* pci bus to socket mapping */ -int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, }; +DEFINE_RAW_SPINLOCK(pci2phy_map_lock); +struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; static DEFINE_RAW_SPINLOCK(uncore_box_lock); @@ -20,6 +21,59 @@ static struct event_constraint uncore_constraint_fixed = struct event_constraint uncore_constraint_empty = EVENT_CONSTRAINT(0, 0, 0); +int uncore_pcibus_to_physid(struct pci_bus *bus) +{ + struct pci2phy_map *map; + int phys_id = -1; + + raw_spin_lock(&pci2phy_map_lock); + list_for_each_entry(map, &pci2phy_map_head, list) { + if (map->segment == pci_domain_nr(bus)) { + phys_id = map->pbus_to_physid[bus->number]; + break; + } + } + raw_spin_unlock(&pci2phy_map_lock); + + return phys_id; +} + +struct pci2phy_map *__find_pci2phy_map(int segment) +{ + struct pci2phy_map *map, *alloc = NULL; + int i; + + lockdep_assert_held(&pci2phy_map_lock); + +lookup: + list_for_each_entry(map, &pci2phy_map_head, list) { + if (map->segment == segment) + goto end; + } + + if (!alloc) { + raw_spin_unlock(&pci2phy_map_lock); + alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL); + raw_spin_lock(&pci2phy_map_lock); + + if (!alloc) + return NULL; + + goto lookup; + } + + map = alloc; + alloc = NULL; + map->segment = segment; + for (i = 0; i < 256; i++) + map->pbus_to_physid[i] = -1; + list_add_tail(&map->list, &pci2phy_map_head); + +end: + kfree(alloc); + return map; +} + ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -809,7 +863,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id int phys_id; bool first_box = false; - phys_id = uncore_pcibus_to_physid[pdev->bus->number]; + phys_id = uncore_pcibus_to_physid(pdev->bus); if (phys_id < 0) return -ENODEV; @@ -830,6 +884,15 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id * each box has a different function id. */ pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + /* Knights Landing uses a common PCI device ID for multiple instances of + * an uncore PMU device type. There is only one entry per device type in + * the knl_uncore_pci_ids table inspite of multiple devices present for + * some device types. Hence PCI device idx would be 0 for all devices. + * So increment pmu pointer to point to an unused array element. + */ + if (boot_cpu_data.x86_model == 87) + while (pmu->func_id >= 0) + pmu++; if (pmu->func_id < 0) pmu->func_id = pdev->devfn; else @@ -856,9 +919,10 @@ static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box = pci_get_drvdata(pdev); struct intel_uncore_pmu *pmu; - int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number]; + int i, cpu, phys_id; bool last_box = false; + phys_id = uncore_pcibus_to_physid(pdev->bus); box = pci_get_drvdata(pdev); if (!box) { for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { @@ -911,6 +975,7 @@ static int __init uncore_pci_init(void) case 63: /* Haswell-EP */ ret = hswep_uncore_pci_init(); break; + case 79: /* BDX-EP */ case 86: /* BDX-DE */ ret = bdx_uncore_pci_init(); break; @@ -927,6 +992,12 @@ static int __init uncore_pci_init(void) case 61: /* Broadwell */ ret = bdw_uncore_pci_init(); break; + case 87: /* Knights Landing */ + ret = knl_uncore_pci_init(); + break; + case 94: /* SkyLake */ + ret = skl_uncore_pci_init(); + break; default: return 0; } @@ -1232,9 +1303,13 @@ static int __init uncore_cpu_init(void) case 63: /* Haswell-EP */ hswep_uncore_cpu_init(); break; + case 79: /* BDX-EP */ case 86: /* BDX-DE */ bdx_uncore_cpu_init(); break; + case 87: /* Knights Landing */ + knl_uncore_cpu_init(); + break; default: return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 72c54c2e5b1a..a7086b862156 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -117,6 +117,15 @@ struct uncore_event_desc { const char *config; }; +struct pci2phy_map { + struct list_head list; + int segment; + int pbus_to_physid[256]; +}; + +int uncore_pcibus_to_physid(struct pci_bus *bus); +struct pci2phy_map *__find_pci2phy_map(int segment); + ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); @@ -317,7 +326,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx); extern struct intel_uncore_type **uncore_msr_uncores; extern struct intel_uncore_type **uncore_pci_uncores; extern struct pci_driver *uncore_pci_driver; -extern int uncore_pcibus_to_physid[256]; +extern raw_spinlock_t pci2phy_map_lock; +extern struct list_head pci2phy_map_head; extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; extern struct event_constraint uncore_constraint_empty; @@ -326,8 +336,10 @@ int snb_uncore_pci_init(void); int ivb_uncore_pci_init(void); int hsw_uncore_pci_init(void); int bdw_uncore_pci_init(void); +int skl_uncore_pci_init(void); void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); +int snb_pci2phy_map_init(int devid); /* perf_event_intel_uncore_snbep.c */ int snbep_uncore_pci_init(void); @@ -338,6 +350,8 @@ int hswep_uncore_pci_init(void); void hswep_uncore_cpu_init(void); int bdx_uncore_pci_init(void); void bdx_uncore_cpu_init(void); +int knl_uncore_pci_init(void); +void knl_uncore_cpu_init(void); /* perf_event_intel_uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index f78574b3cb55..2bd030ddd0db 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -8,6 +8,7 @@ #define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 #define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 +#define PCI_DEVICE_ID_INTEL_SKL_IMC 0x191f /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -417,18 +418,28 @@ static void snb_uncore_imc_event_del(struct perf_event *event, int flags) } } -static int snb_pci2phy_map_init(int devid) +int snb_pci2phy_map_init(int devid) { struct pci_dev *dev = NULL; - int bus; + struct pci2phy_map *map; + int bus, segment; dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev); if (!dev) return -ENOTTY; bus = dev->bus->number; - - uncore_pcibus_to_physid[bus] = 0; + segment = pci_domain_nr(dev->bus); + + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + pci_dev_put(dev); + return -ENOMEM; + } + map->pbus_to_physid[bus] = 0; + raw_spin_unlock(&pci2phy_map_lock); pci_dev_put(dev); @@ -514,6 +525,14 @@ static const struct pci_device_id bdw_uncore_pci_ids[] = { { /* end: all zeroes */ }, }; +static const struct pci_device_id skl_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + static struct pci_driver snb_uncore_pci_driver = { .name = "snb_uncore", .id_table = snb_uncore_pci_ids, @@ -534,6 +553,11 @@ static struct pci_driver bdw_uncore_pci_driver = { .id_table = bdw_uncore_pci_ids, }; +static struct pci_driver skl_uncore_pci_driver = { + .name = "skl_uncore", + .id_table = skl_uncore_pci_ids, +}; + struct imc_uncore_pci_dev { __u32 pci_id; struct pci_driver *driver; @@ -548,6 +572,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */ + IMC_DEV(SKL_IMC, &skl_uncore_pci_driver), /* 6th Gen Core */ { /* end marker */ } }; @@ -600,6 +625,11 @@ int bdw_uncore_pci_init(void) return imc_uncore_pci_init(); } +int skl_uncore_pci_init(void) +{ + return imc_uncore_pci_init(); +} + /* end of Sandy Bridge uncore support */ /* Nehalem uncore support */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 694510a887dc..33acb884ccf1 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -209,31 +209,98 @@ #define HSWEP_PCU_MSR_PMON_BOX_CTL 0x710 #define HSWEP_PCU_MSR_PMON_BOX_FILTER 0x715 +/* KNL Ubox */ +#define KNL_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) +/* KNL CHA */ +#define KNL_CHA_MSR_OFFSET 0xc +#define KNL_CHA_MSR_PMON_CTL_QOR (1 << 16) +#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \ + KNL_CHA_MSR_PMON_CTL_QOR) +#define KNL_CHA_MSR_PMON_BOX_FILTER_TID 0x1ff +#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE (7 << 18) +#define KNL_CHA_MSR_PMON_BOX_FILTER_OP (0xfffffe2aULL << 32) + +/* KNL EDC/MC UCLK */ +#define KNL_UCLK_MSR_PMON_CTR0_LOW 0x400 +#define KNL_UCLK_MSR_PMON_CTL0 0x420 +#define KNL_UCLK_MSR_PMON_BOX_CTL 0x430 +#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW 0x44c +#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL 0x454 +#define KNL_PMON_FIXED_CTL_EN 0x1 + +/* KNL EDC */ +#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW 0xa00 +#define KNL_EDC0_ECLK_MSR_PMON_CTL0 0xa20 +#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL 0xa30 +#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW 0xa3c +#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL 0xa44 + +/* KNL MC */ +#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW 0xb00 +#define KNL_MC0_CH0_MSR_PMON_CTL0 0xb20 +#define KNL_MC0_CH0_MSR_PMON_BOX_CTL 0xb30 +#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW 0xb3c +#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL 0xb44 + +/* KNL IRP */ +#define KNL_IRP_PCI_PMON_BOX_CTL 0xf0 +#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + KNL_CHA_MSR_PMON_CTL_QOR) +/* KNL PCU */ +#define KNL_PCU_PMON_CTL_EV_SEL_MASK 0x0000007f +#define KNL_PCU_PMON_CTL_USE_OCC_CTR (1 << 7) +#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK 0x3f000000 +#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK \ + (KNL_PCU_PMON_CTL_EV_SEL_MASK | \ + KNL_PCU_PMON_CTL_USE_OCC_CTR | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_CBO_PMON_CTL_TID_EN | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ + SNBEP_PMON_CTL_INVERT | \ + KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); +DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29"); DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); +DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8"); DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5"); DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8"); +DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12"); DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47"); DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22"); DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20"); +DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33"); +DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37"); DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60"); DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62"); DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61"); DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63"); @@ -315,8 +382,9 @@ static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct pe static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); - pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT); + pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT); } static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) @@ -1087,7 +1155,8 @@ static struct pci_driver snbep_uncore_pci_driver = { static int snbep_pci2phy_map_init(int devid) { struct pci_dev *ubox_dev = NULL; - int i, bus, nodeid; + int i, bus, nodeid, segment; + struct pci2phy_map *map; int err = 0; u32 config = 0; @@ -1106,16 +1175,27 @@ static int snbep_pci2phy_map_init(int devid) err = pci_read_config_dword(ubox_dev, 0x54, &config); if (err) break; + + segment = pci_domain_nr(ubox_dev->bus); + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + err = -ENOMEM; + break; + } + /* * every three bits in the Node ID mapping register maps * to a particular node. */ for (i = 0; i < 8; i++) { if (nodeid == ((config >> (3 * i)) & 0x7)) { - uncore_pcibus_to_physid[bus] = i; + map->pbus_to_physid[bus] = i; break; } } + raw_spin_unlock(&pci2phy_map_lock); } if (!err) { @@ -1123,13 +1203,17 @@ static int snbep_pci2phy_map_init(int devid) * For PCI bus with no UBOX device, find the next bus * that has UBOX device and use its mapping. */ - i = -1; - for (bus = 255; bus >= 0; bus--) { - if (uncore_pcibus_to_physid[bus] >= 0) - i = uncore_pcibus_to_physid[bus]; - else - uncore_pcibus_to_physid[bus] = i; + raw_spin_lock(&pci2phy_map_lock); + list_for_each_entry(map, &pci2phy_map_head, list) { + i = -1; + for (bus = 255; bus >= 0; bus--) { + if (map->pbus_to_physid[bus] >= 0) + i = map->pbus_to_physid[bus]; + else + map->pbus_to_physid[bus] = i; + } } + raw_spin_unlock(&pci2phy_map_lock); } pci_dev_put(ubox_dev); @@ -1712,6 +1796,419 @@ int ivbep_uncore_pci_init(void) } /* end of IvyTown uncore support */ +/* KNL uncore support */ +static struct attribute *knl_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute_group knl_uncore_ubox_format_group = { + .name = "format", + .attrs = knl_uncore_ubox_formats_attr, +}; + +static struct intel_uncore_type knl_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = HSWEP_U_MSR_PMON_CTR0, + .event_ctl = HSWEP_U_MSR_PMON_CTL0, + .event_mask = KNL_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &knl_uncore_ubox_format_group, +}; + +static struct attribute *knl_uncore_cha_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_qor.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid4.attr, + &format_attr_filter_link3.attr, + &format_attr_filter_state4.attr, + &format_attr_filter_local.attr, + &format_attr_filter_all_op.attr, + &format_attr_filter_nnm.attr, + &format_attr_filter_opc3.attr, + &format_attr_filter_nc.attr, + &format_attr_filter_isoc.attr, + NULL, +}; + +static struct attribute_group knl_uncore_cha_format_group = { + .name = "format", + .attrs = knl_uncore_cha_formats_attr, +}; + +static struct event_constraint knl_uncore_cha_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x1f, 0x1), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct extra_reg knl_uncore_cha_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, + SNBEP_CBO_PMON_CTL_TID_EN, 0x1), + SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4), + EVENT_EXTRA_END +}; + +static u64 knl_cha_filter_mask(int fields) +{ + u64 mask = 0; + + if (fields & 0x1) + mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x4) + mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP; + return mask; +} + +static struct event_constraint * +knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask); +} + +static int knl_cha_hw_config(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = knl_uncore_cha_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + + KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & knl_cha_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static void hswep_cbox_enable_event(struct intel_uncore_box *box, + struct perf_event *event); + +static struct intel_uncore_ops knl_uncore_cha_ops = { + .init_box = snbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = hswep_cbox_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = knl_cha_hw_config, + .get_constraint = knl_cha_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type knl_uncore_cha = { + .name = "cha", + .num_counters = 4, + .num_boxes = 38, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_C0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, + .event_mask = KNL_CHA_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = KNL_CHA_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = knl_uncore_cha_constraints, + .ops = &knl_uncore_cha_ops, + .format_group = &knl_uncore_cha_format_group, +}; + +static struct attribute *knl_uncore_pcu_formats_attr[] = { + &format_attr_event2.attr, + &format_attr_use_occ_ctr.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh6.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge_det.attr, + NULL, +}; + +static struct attribute_group knl_uncore_pcu_format_group = { + .name = "format", + .attrs = knl_uncore_pcu_formats_attr, +}; + +static struct intel_uncore_type knl_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0, + .event_ctl = HSWEP_PCU_MSR_PMON_CTL0, + .event_mask = KNL_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &knl_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *knl_msr_uncores[] = { + &knl_uncore_ubox, + &knl_uncore_cha, + &knl_uncore_pcu, + NULL, +}; + +void knl_uncore_cpu_init(void) +{ + uncore_msr_uncores = knl_msr_uncores; +} + +static void knl_uncore_imc_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + pci_write_config_dword(pdev, box_ctl, 0); +} + +static void knl_uncore_imc_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK) + == UNCORE_FIXED_EVENT) + pci_write_config_dword(pdev, hwc->config_base, + hwc->config | KNL_PMON_FIXED_CTL_EN); + else + pci_write_config_dword(pdev, hwc->config_base, + hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops knl_uncore_imc_ops = { + .init_box = snbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = knl_uncore_imc_enable_box, + .read_counter = snbep_uncore_pci_read_counter, + .enable_event = knl_uncore_imc_enable_event, + .disable_event = snbep_uncore_pci_disable_event, +}; + +static struct intel_uncore_type knl_uncore_imc_uclk = { + .name = "imc_uclk", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW, + .event_ctl = KNL_UCLK_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW, + .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL, + .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL, + .ops = &knl_uncore_imc_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type knl_uncore_imc_dclk = { + .name = "imc", + .num_counters = 4, + .num_boxes = 6, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = KNL_MC0_CH0_MSR_PMON_CTR0_LOW, + .event_ctl = KNL_MC0_CH0_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = KNL_MC0_CH0_MSR_PMON_FIXED_LOW, + .fixed_ctl = KNL_MC0_CH0_MSR_PMON_FIXED_CTL, + .box_ctl = KNL_MC0_CH0_MSR_PMON_BOX_CTL, + .ops = &knl_uncore_imc_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type knl_uncore_edc_uclk = { + .name = "edc_uclk", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW, + .event_ctl = KNL_UCLK_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW, + .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL, + .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL, + .ops = &knl_uncore_imc_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type knl_uncore_edc_eclk = { + .name = "edc_eclk", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW, + .event_ctl = KNL_EDC0_ECLK_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW, + .fixed_ctl = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL, + .box_ctl = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL, + .ops = &knl_uncore_imc_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct event_constraint knl_uncore_m2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type knl_uncore_m2pcie = { + .name = "m2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .constraints = knl_uncore_m2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct attribute *knl_uncore_irp_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_qor.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group knl_uncore_irp_format_group = { + .name = "format", + .attrs = knl_uncore_irp_formats_attr, +}; + +static struct intel_uncore_type knl_uncore_irp = { + .name = "irp", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = KNL_IRP_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = KNL_IRP_PCI_PMON_BOX_CTL, + .ops = &snbep_uncore_pci_ops, + .format_group = &knl_uncore_irp_format_group, +}; + +enum { + KNL_PCI_UNCORE_MC_UCLK, + KNL_PCI_UNCORE_MC_DCLK, + KNL_PCI_UNCORE_EDC_UCLK, + KNL_PCI_UNCORE_EDC_ECLK, + KNL_PCI_UNCORE_M2PCIE, + KNL_PCI_UNCORE_IRP, +}; + +static struct intel_uncore_type *knl_pci_uncores[] = { + [KNL_PCI_UNCORE_MC_UCLK] = &knl_uncore_imc_uclk, + [KNL_PCI_UNCORE_MC_DCLK] = &knl_uncore_imc_dclk, + [KNL_PCI_UNCORE_EDC_UCLK] = &knl_uncore_edc_uclk, + [KNL_PCI_UNCORE_EDC_ECLK] = &knl_uncore_edc_eclk, + [KNL_PCI_UNCORE_M2PCIE] = &knl_uncore_m2pcie, + [KNL_PCI_UNCORE_IRP] = &knl_uncore_irp, + NULL, +}; + +/* + * KNL uses a common PCI device ID for multiple instances of an Uncore PMU + * device type. prior to KNL, each instance of a PMU device type had a unique + * device ID. + * + * PCI Device ID Uncore PMU Devices + * ---------------------------------- + * 0x7841 MC0 UClk, MC1 UClk + * 0x7843 MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2, + * MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2 + * 0x7833 EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk, + * EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk + * 0x7835 EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk, + * EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk + * 0x7817 M2PCIe + * 0x7814 IRP +*/ + +static const struct pci_device_id knl_uncore_pci_ids[] = { + { /* MC UClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0), + }, + { /* MC DClk Channel */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0), + }, + { /* EDC UClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0), + }, + { /* EDC EClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0), + }, + { /* M2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver knl_uncore_pci_driver = { + .name = "knl_uncore", + .id_table = knl_uncore_pci_ids, +}; + +int knl_uncore_pci_init(void) +{ + int ret; + + /* All KNL PCI based PMON units are on the same PCI bus except IRP */ + ret = snb_pci2phy_map_init(0x7814); /* IRP */ + if (ret) + return ret; + ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */ + if (ret) + return ret; + uncore_pci_uncores = knl_pci_uncores; + uncore_pci_driver = &knl_uncore_pci_driver; + return 0; +} + +/* end of KNL uncore support */ + /* Haswell-EP uncore support */ static struct attribute *hswep_uncore_ubox_formats_attr[] = { &format_attr_event.attr, @@ -2322,7 +2819,7 @@ int hswep_uncore_pci_init(void) } /* end of Haswell-EP uncore support */ -/* BDX-DE uncore support */ +/* BDX uncore support */ static struct intel_uncore_type bdx_uncore_ubox = { .name = "ubox", @@ -2344,13 +2841,14 @@ static struct event_constraint bdx_uncore_cbox_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x09, 0x3), UNCORE_EVENT_CONSTRAINT(0x11, 0x1), UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x3e, 0x1), EVENT_CONSTRAINT_END }; static struct intel_uncore_type bdx_uncore_cbox = { .name = "cbox", .num_counters = 4, - .num_boxes = 8, + .num_boxes = 24, .perf_ctr_bits = 48, .event_ctl = HSWEP_C0_MSR_PMON_CTL0, .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, @@ -2363,9 +2861,24 @@ static struct intel_uncore_type bdx_uncore_cbox = { .format_group = &hswep_uncore_cbox_format_group, }; +static struct intel_uncore_type bdx_uncore_sbox = { + .name = "sbox", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_S0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_S0_MSR_PMON_CTR0, + .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_SBOX_MSR_OFFSET, + .ops = &hswep_uncore_sbox_msr_ops, + .format_group = &hswep_uncore_sbox_format_group, +}; + static struct intel_uncore_type *bdx_msr_uncores[] = { &bdx_uncore_ubox, &bdx_uncore_cbox, + &bdx_uncore_sbox, &hswep_uncore_pcu, NULL, }; @@ -2380,7 +2893,7 @@ void bdx_uncore_cpu_init(void) static struct intel_uncore_type bdx_uncore_ha = { .name = "ha", .num_counters = 4, - .num_boxes = 1, + .num_boxes = 2, .perf_ctr_bits = 48, SNBEP_UNCORE_PCI_COMMON_INIT(), }; @@ -2388,7 +2901,7 @@ static struct intel_uncore_type bdx_uncore_ha = { static struct intel_uncore_type bdx_uncore_imc = { .name = "imc", .num_counters = 5, - .num_boxes = 2, + .num_boxes = 8, .perf_ctr_bits = 48, .fixed_ctr_bits = 48, .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, @@ -2408,6 +2921,19 @@ static struct intel_uncore_type bdx_uncore_irp = { .format_group = &snbep_uncore_format_group, }; +static struct intel_uncore_type bdx_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 3, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_qpi_ops, + .format_group = &snbep_uncore_qpi_format_group, +}; static struct event_constraint bdx_uncore_r2pcie_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x10, 0x3), @@ -2416,6 +2942,8 @@ static struct event_constraint bdx_uncore_r2pcie_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x23, 0x1), UNCORE_EVENT_CONSTRAINT(0x25, 0x1), UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), EVENT_CONSTRAINT_END }; @@ -2429,26 +2957,77 @@ static struct intel_uncore_type bdx_uncore_r2pcie = { SNBEP_UNCORE_PCI_COMMON_INIT(), }; +static struct event_constraint bdx_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x7), + UNCORE_EVENT_CONSTRAINT(0x07, 0x7), + UNCORE_EVENT_CONSTRAINT(0x08, 0x7), + UNCORE_EVENT_CONSTRAINT(0x09, 0x7), + UNCORE_EVENT_CONSTRAINT(0x0a, 0x7), + UNCORE_EVENT_CONSTRAINT(0x0e, 0x7), + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x14, 0x3), + UNCORE_EVENT_CONSTRAINT(0x15, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x29, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type bdx_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 3, + .perf_ctr_bits = 48, + .constraints = bdx_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + enum { BDX_PCI_UNCORE_HA, BDX_PCI_UNCORE_IMC, BDX_PCI_UNCORE_IRP, + BDX_PCI_UNCORE_QPI, BDX_PCI_UNCORE_R2PCIE, + BDX_PCI_UNCORE_R3QPI, }; static struct intel_uncore_type *bdx_pci_uncores[] = { [BDX_PCI_UNCORE_HA] = &bdx_uncore_ha, [BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc, [BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp, + [BDX_PCI_UNCORE_QPI] = &bdx_uncore_qpi, [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie, + [BDX_PCI_UNCORE_R3QPI] = &bdx_uncore_r3qpi, NULL, }; -static DEFINE_PCI_DEVICE_TABLE(bdx_uncore_pci_ids) = { +static const struct pci_device_id bdx_uncore_pci_ids[] = { { /* Home Agent 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30), .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0), }, + { /* Home Agent 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1), + }, { /* MC0 Channel 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0), .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0), @@ -2457,14 +3036,74 @@ static DEFINE_PCI_DEVICE_TABLE(bdx_uncore_pci_ids) = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1), .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1), }, + { /* MC0 Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2), + }, + { /* MC0 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3), + }, + { /* MC1 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4), + }, + { /* MC1 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5), + }, + { /* MC1 Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6), + }, + { /* MC1 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7), + }, { /* IRP */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39), .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0), }, + { /* QPI0 Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0), + }, + { /* QPI0 Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1), + }, + { /* QPI1 Port 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2), + }, { /* R2PCIe */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34), .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0), }, + { /* R3QPI0 Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0), + }, + { /* R3QPI0 Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1), + }, + { /* R3QPI1 Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0), + }, + { /* QPI Port 1 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1), + }, + { /* QPI Port 2 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2), + }, { /* end: all zeroes */ } }; @@ -2484,4 +3123,4 @@ int bdx_uncore_pci_init(void) return 0; } -/* end of BDX-DE uncore support */ +/* end of BDX uncore support */ diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c index f32ac13934f2..ec863b9a9f78 100644 --- a/arch/x86/kernel/cpu/perf_event_msr.c +++ b/arch/x86/kernel/cpu/perf_event_msr.c @@ -163,10 +163,9 @@ again: goto again; delta = now - prev; - if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) { - delta <<= 32; - delta >>= 32; /* sign extend */ - } + if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) + delta = sign_extend64(delta, 31); + local64_add(now - prev, &event->count); } diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 136ac74dee82..819d94982e07 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -33,28 +33,27 @@ static int __init x86_rdrand_setup(char *s) __setup("nordrand", x86_rdrand_setup); /* - * Force a reseed cycle; we are architecturally guaranteed a reseed - * after no more than 512 128-bit chunks of random data. This also - * acts as a test of the CPU capability. + * RDRAND has Built-In-Self-Test (BIST) that runs on every invocation. + * Run the instruction a few times as a sanity check. + * If it fails, it is simple to disable RDRAND here. */ -#define RESEED_LOOP ((512*128)/sizeof(unsigned long)) +#define SANITY_CHECK_LOOPS 8 void x86_init_rdrand(struct cpuinfo_x86 *c) { #ifdef CONFIG_ARCH_RANDOM unsigned long tmp; - int i, count, ok; + int i; if (!cpu_has(c, X86_FEATURE_RDRAND)) - return; /* Nothing to do */ + return; - for (count = i = 0; i < RESEED_LOOP; i++) { - ok = rdrand_long(&tmp); - if (ok) - count++; + for (i = 0; i < SANITY_CHECK_LOOPS; i++) { + if (!rdrand_long(&tmp)) { + clear_cpu_cap(c, X86_FEATURE_RDRAND); + printk_once(KERN_WARNING "rdrand: disabled\n"); + return; + } } - - if (count != RESEED_LOOP) - clear_cpu_cap(c, X86_FEATURE_RDRAND); #endif } diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 608fb26c7254..8cb57df9398d 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -31,32 +31,12 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) const struct cpuid_bit *cb; static const struct cpuid_bit cpuid_bits[] = { - { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 }, - { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, - { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, - { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, - { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, - { X86_FEATURE_HWP, CR_EAX, 7, 0x00000006, 0 }, - { X86_FEATURE_HWP_NOTIFY, CR_EAX, 8, 0x00000006, 0 }, - { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 }, - { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 }, - { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 }, { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 }, - { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, - { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, - { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, - { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, - { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 }, - { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 }, - { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 }, - { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 }, - { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 }, - { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 3fa0e5ad86b4..252da7aceca6 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -12,7 +12,7 @@ static void early_init_transmeta(struct cpuinfo_x86 *c) xlvl = cpuid_eax(0x80860000); if ((xlvl & 0xffff0000) == 0x80860000) { if (xlvl >= 0x80860001) - c->x86_capability[2] = cpuid_edx(0x80860001); + c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001); } } @@ -82,7 +82,7 @@ static void init_transmeta(struct cpuinfo_x86 *c) /* Unhide possibly hidden capability flags */ rdmsr(0x80860004, cap_mask, uk); wrmsr(0x80860004, ~0, uk); - c->x86_capability[0] = cpuid_edx(0x00000001); + c->x86_capability[CPUID_1_EDX] = cpuid_edx(0x00000001); wrmsr(0x80860004, cap_mask, uk); /* All Transmeta CPUs have a constant TSC */ diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index bd3507da39f0..2836de390f95 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -58,28 +58,6 @@ static void cpuid_smp_cpuid(void *cmd_block) &cmd->eax, &cmd->ebx, &cmd->ecx, &cmd->edx); } -static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) -{ - loff_t ret; - struct inode *inode = file->f_mapping->host; - - mutex_lock(&inode->i_mutex); - switch (orig) { - case 0: - file->f_pos = offset; - ret = file->f_pos; - break; - case 1: - file->f_pos += offset; - ret = file->f_pos; - break; - default: - ret = -EINVAL; - } - mutex_unlock(&inode->i_mutex); - return ret; -} - static ssize_t cpuid_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -132,7 +110,7 @@ static int cpuid_open(struct inode *inode, struct file *file) */ static const struct file_operations cpuid_fops = { .owner = THIS_MODULE, - .llseek = cpuid_seek, + .llseek = no_seek_end_llseek, .read = cpuid_read, .open = cpuid_open, }; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 74ca2fe7a0b3..58f34319b29a 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -35,6 +35,7 @@ #include <asm/cpu.h> #include <asm/reboot.h> #include <asm/virtext.h> +#include <asm/intel_pt.h> /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 @@ -75,8 +76,6 @@ struct crash_memmap_data { unsigned int type; }; -int in_crash_kexec; - /* * This is used to VMCLEAR all VMCSs loaded on the * processor. And when loading kvm_intel module, the @@ -127,12 +126,16 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) cpu_emergency_vmxoff(); cpu_emergency_svm_disable(); + /* + * Disable Intel PT to stop its logging + */ + cpu_emergency_stop_pt(); + disable_local_APIC(); } static void kdump_nmi_shootdown_cpus(void) { - in_crash_kexec = 1; nmi_shootdown_cpus(kdump_nmi_callback); disable_local_APIC(); @@ -172,6 +175,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs) cpu_emergency_vmxoff(); cpu_emergency_svm_disable(); + /* + * Disable Intel PT to stop its logging + */ + cpu_emergency_stop_pt(); + #ifdef CONFIG_X86_IO_APIC /* Prevent crash_kexec() from deadlocking on ioapic_lock. */ ioapic_zap_locks(); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a102564d08eb..569c1e4f96fe 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -911,7 +911,7 @@ void __init finish_e820_parsing(void) } } -static inline const char *e820_type_to_string(int e820_type) +static const char *e820_type_to_string(int e820_type) { switch (e820_type) { case E820_RESERVED_KERN: diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 9f9cc682e561..bca14c899137 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -547,6 +547,7 @@ static const struct pci_device_id intel_stolen_ids[] __initconst = { INTEL_CHV_IDS(&chv_stolen_funcs), INTEL_SKL_IDS(&gen9_stolen_funcs), INTEL_BXT_IDS(&gen9_stolen_funcs), + INTEL_KBL_IDS(&gen9_stolen_funcs), }; static void __init intel_graphics_stolen(int num, int slot, int func) @@ -584,7 +585,7 @@ static void __init intel_graphics_stolen(int num, int slot, int func) static void __init force_disable_hpet(int num, int slot, int func) { #ifdef CONFIG_HPET_TIMER - boot_hpet_disable = 1; + boot_hpet_disable = true; pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n"); #endif } diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index eec40f595ab9..21bf92490a7b 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -195,14 +195,14 @@ static __init void early_serial_init(char *s) #ifdef CONFIG_PCI static void mem32_serial_out(unsigned long addr, int offset, int value) { - u32 *vaddr = (u32 *)addr; + u32 __iomem *vaddr = (u32 __iomem *)addr; /* shift implied by pointer type */ writel(value, vaddr + offset); } static unsigned int mem32_serial_in(unsigned long addr, int offset) { - u32 *vaddr = (u32 *)addr; + u32 __iomem *vaddr = (u32 __iomem *)addr; /* shift implied by pointer type */ return readl(vaddr + offset); } @@ -316,7 +316,7 @@ static struct console early_serial_console = { .index = -1, }; -static inline void early_console_register(struct console *con, int keep_early) +static void early_console_register(struct console *con, int keep_early) { if (con->index != -1) { printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index d14e9ac3235a..6d9f0a7ef4c8 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -3,8 +3,11 @@ */ #include <asm/fpu/internal.h> #include <asm/tlbflush.h> +#include <asm/setup.h> +#include <asm/cmdline.h> #include <linux/sched.h> +#include <linux/init.h> /* * Initialize the TS bit in CR0 according to the style of context-switches @@ -12,7 +15,7 @@ */ static void fpu__init_cpu_ctx_switch(void) { - if (!cpu_has_eager_fpu) + if (!boot_cpu_has(X86_FEATURE_EAGER_FPU)) stts(); else clts(); @@ -143,9 +146,18 @@ static void __init fpu__init_system_generic(void) unsigned int xstate_size; EXPORT_SYMBOL_GPL(xstate_size); -/* Enforce that 'MEMBER' is the last field of 'TYPE': */ +/* Get alignment of the TYPE. */ +#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) + +/* + * Enforce that 'MEMBER' is the last field of 'TYPE'. + * + * Align the computed size with alignment of the TYPE, + * because that's how C aligns structs. + */ #define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ - BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER)) + BUILD_BUG_ON(sizeof(TYPE) != ALIGN(offsetofend(TYPE, MEMBER), \ + TYPE_ALIGN(TYPE))) /* * We append the 'struct fpu' to the task_struct: @@ -188,7 +200,7 @@ static void __init fpu__init_task_struct_size(void) */ static void __init fpu__init_system_xstate_size_legacy(void) { - static int on_boot_cpu = 1; + static int on_boot_cpu __initdata = 1; WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -261,24 +273,56 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; -static int __init eager_fpu_setup(char *s) +/* + * Find supported xfeatures based on cpu features and command-line input. + * This must be called after fpu__init_parse_early_param() is called and + * xfeatures_mask is enumerated. + */ +u64 __init fpu__get_supported_xfeatures_mask(void) { - if (!strcmp(s, "on")) - eagerfpu = ENABLE; - else if (!strcmp(s, "off")) - eagerfpu = DISABLE; - else if (!strcmp(s, "auto")) - eagerfpu = AUTO; - return 1; + /* Support all xfeatures known to us */ + if (eagerfpu != DISABLE) + return XCNTXT_MASK; + + /* Warning of xfeatures being disabled for no eagerfpu mode */ + if (xfeatures_mask & XFEATURE_MASK_EAGER) { + pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n", + xfeatures_mask & XFEATURE_MASK_EAGER); + } + + /* Return a mask that masks out all features requiring eagerfpu mode */ + return ~XFEATURE_MASK_EAGER; +} + +/* + * Disable features dependent on eagerfpu. + */ +static void __init fpu__clear_eager_fpu_features(void) +{ + setup_clear_cpu_cap(X86_FEATURE_MPX); + setup_clear_cpu_cap(X86_FEATURE_AVX); + setup_clear_cpu_cap(X86_FEATURE_AVX2); + setup_clear_cpu_cap(X86_FEATURE_AVX512F); + setup_clear_cpu_cap(X86_FEATURE_AVX512PF); + setup_clear_cpu_cap(X86_FEATURE_AVX512ER); + setup_clear_cpu_cap(X86_FEATURE_AVX512CD); } -__setup("eagerfpu=", eager_fpu_setup); /* * Pick the FPU context switching strategy: + * + * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of + * the following is true: + * + * (1) the cpu has xsaveopt, as it has the optimization and doing eager + * FPU switching has a relatively low cost compared to a plain xsave; + * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU + * switching. Should the kernel boot with noxsaveopt, we support MPX + * with eager FPU switching at a higher cost. */ static void __init fpu__init_system_ctx_switch(void) { - static bool on_boot_cpu = 1; + static bool on_boot_cpu __initdata = 1; WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -286,19 +330,11 @@ static void __init fpu__init_system_ctx_switch(void) WARN_ON_FPU(current->thread.fpu.fpstate_active); current_thread_info()->status = 0; - /* Auto enable eagerfpu for xsaveopt */ - if (cpu_has_xsaveopt && eagerfpu != DISABLE) + if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE) eagerfpu = ENABLE; - if (xfeatures_mask & XSTATE_EAGER) { - if (eagerfpu == DISABLE) { - pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n", - xfeatures_mask & XSTATE_EAGER); - xfeatures_mask &= ~XSTATE_EAGER; - } else { - eagerfpu = ENABLE; - } - } + if (xfeatures_mask & XFEATURE_MASK_EAGER) + eagerfpu = ENABLE; if (eagerfpu == ENABLE) setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); @@ -307,11 +343,48 @@ static void __init fpu__init_system_ctx_switch(void) } /* + * We parse fpu parameters early because fpu__init_system() is executed + * before parse_early_param(). + */ +static void __init fpu__init_parse_early_param(void) +{ + /* + * No need to check "eagerfpu=auto" again, since it is the + * initial default. + */ + if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { + eagerfpu = DISABLE; + fpu__clear_eager_fpu_features(); + } else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) { + eagerfpu = ENABLE; + } + + if (cmdline_find_option_bool(boot_command_line, "no387")) + setup_clear_cpu_cap(X86_FEATURE_FPU); + + if (cmdline_find_option_bool(boot_command_line, "nofxsr")) { + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT); + setup_clear_cpu_cap(X86_FEATURE_XMM); + } + + if (cmdline_find_option_bool(boot_command_line, "noxsave")) + fpu__xstate_clear_all_cpu_caps(); + + if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + + if (cmdline_find_option_bool(boot_command_line, "noxsaves")) + setup_clear_cpu_cap(X86_FEATURE_XSAVES); +} + +/* * Called on the boot CPU once per system bootup, to set up the initial * FPU state that is later cloned into all processes: */ void __init fpu__init_system(struct cpuinfo_x86 *c) { + fpu__init_parse_early_param(); fpu__init_system_early_generic(c); /* @@ -335,72 +408,3 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_ctx_switch(); } - -/* - * Boot parameter to turn off FPU support and fall back to math-emu: - */ -static int __init no_387(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_FPU); - return 1; -} -__setup("no387", no_387); - -/* - * Disable all xstate CPU features: - */ -static int __init x86_noxsave_setup(char *s) -{ - if (strlen(s)) - return 0; - - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - setup_clear_cpu_cap(X86_FEATURE_XSAVEC); - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - setup_clear_cpu_cap(X86_FEATURE_AVX); - setup_clear_cpu_cap(X86_FEATURE_AVX2); - setup_clear_cpu_cap(X86_FEATURE_AVX512F); - setup_clear_cpu_cap(X86_FEATURE_AVX512PF); - setup_clear_cpu_cap(X86_FEATURE_AVX512ER); - setup_clear_cpu_cap(X86_FEATURE_AVX512CD); - setup_clear_cpu_cap(X86_FEATURE_MPX); - - return 1; -} -__setup("noxsave", x86_noxsave_setup); - -/* - * Disable the XSAVEOPT instruction specifically: - */ -static int __init x86_noxsaveopt_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - - return 1; -} -__setup("noxsaveopt", x86_noxsaveopt_setup); - -/* - * Disable the XSAVES instruction: - */ -static int __init x86_noxsaves_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - - return 1; -} -__setup("noxsaves", x86_noxsaves_setup); - -/* - * Disable FX save/restore and SSE support: - */ -static int __init x86_nofxsr_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_FXSR); - setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT); - setup_clear_cpu_cap(X86_FEATURE_XMM); - - return 1; -} -__setup("nofxsr", x86_nofxsr_setup); diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index dc60810c1c74..0bc3490420c5 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -66,7 +66,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, * presence of FP and SSE state. */ if (cpu_has_xsave) - fpu->state.xsave.header.xfeatures |= XSTATE_FPSSE; + fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; return ret; } @@ -326,7 +326,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, * presence of FP. */ if (cpu_has_xsave) - fpu->state.xsave.header.xfeatures |= XSTATE_FP; + fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP; return ret; } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 50ec9af1bd51..31c6a60505e6 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -56,7 +56,7 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) if (use_fxsr()) { struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; struct user_i387_ia32_struct env; - struct _fpstate_ia32 __user *fp = buf; + struct _fpstate_32 __user *fp = buf; convert_from_fxsr(&env, tsk); @@ -107,7 +107,7 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) * header as well as change any contents in the memory layout. * xrestore as part of sigreturn will capture all the changes. */ - xfeatures |= XSTATE_FPSSE; + xfeatures |= XFEATURE_MASK_FPSSE; err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures); @@ -165,7 +165,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(current, NULL, 0, sizeof(struct user_i387_ia32_struct), NULL, - (struct _fpstate_ia32 __user *) buf) ? -1 : 1; + (struct _fpstate_32 __user *) buf) ? -1 : 1; if (fpregs_active()) { /* Save the live register state to the user directly. */ @@ -207,7 +207,7 @@ sanitize_restored_xstate(struct task_struct *tsk, * layout and not enabled by the OS. */ if (fx_only) - header->xfeatures = XSTATE_FPSSE; + header->xfeatures = XFEATURE_MASK_FPSSE; else header->xfeatures &= (xfeatures_mask & xfeatures); } @@ -230,7 +230,7 @@ static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_ { if (use_xsave()) { if ((unsigned long)buf % 64 || fx_only) { - u64 init_bv = xfeatures_mask & ~XSTATE_FPSSE; + u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); return copy_user_to_fxregs(buf); } else { @@ -385,20 +385,19 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, */ void fpu__init_prepare_fx_sw_frame(void) { - int fsave_header_size = sizeof(struct fregs_state); int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; - if (config_enabled(CONFIG_X86_32)) - size += fsave_header_size; - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; fx_sw_reserved.xfeatures = xfeatures_mask; fx_sw_reserved.xstate_size = xstate_size; - if (config_enabled(CONFIG_IA32_EMULATION)) { + if (config_enabled(CONFIG_IA32_EMULATION) || + config_enabled(CONFIG_X86_32)) { + int fsave_header_size = sizeof(struct fregs_state); + fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size += fsave_header_size; + fx_sw_reserved_ia32.extended_size = size + fsave_header_size; } } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 62fc001c7846..d425cda5ae6d 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -31,12 +31,29 @@ static const char *xfeature_names[] = */ u64 xfeatures_mask __read_mostly; -static unsigned int xstate_offsets[XFEATURES_NR_MAX] = { [ 0 ... XFEATURES_NR_MAX - 1] = -1}; -static unsigned int xstate_sizes[XFEATURES_NR_MAX] = { [ 0 ... XFEATURES_NR_MAX - 1] = -1}; +static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; -/* The number of supported xfeatures in xfeatures_mask: */ -static unsigned int xfeatures_nr; +/* + * Clear all of the X86_FEATURE_* bits that are unavailable + * when the CPU has no XSAVE support. + */ +void fpu__xstate_clear_all_cpu_caps(void) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + setup_clear_cpu_cap(X86_FEATURE_XSAVEC); + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + setup_clear_cpu_cap(X86_FEATURE_AVX); + setup_clear_cpu_cap(X86_FEATURE_AVX2); + setup_clear_cpu_cap(X86_FEATURE_AVX512F); + setup_clear_cpu_cap(X86_FEATURE_AVX512PF); + setup_clear_cpu_cap(X86_FEATURE_AVX512ER); + setup_clear_cpu_cap(X86_FEATURE_AVX512CD); + setup_clear_cpu_cap(X86_FEATURE_MPX); + setup_clear_cpu_cap(X86_FEATURE_XGETBV1); +} /* * Return whether the system supports a given xfeature. @@ -53,7 +70,7 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) /* * So we use FLS here to be able to print the most advanced * feature that was requested but is missing. So if a driver - * asks about "XSTATE_SSE | XSTATE_YMM" we'll print the + * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the * missing AVX feature - this is the most informative message * to users: */ @@ -112,7 +129,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) /* * FP is in init state */ - if (!(xfeatures & XSTATE_FP)) { + if (!(xfeatures & XFEATURE_MASK_FP)) { fx->cwd = 0x37f; fx->swd = 0; fx->twd = 0; @@ -125,7 +142,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) /* * SSE is in init state */ - if (!(xfeatures & XSTATE_SSE)) + if (!(xfeatures & XFEATURE_MASK_SSE)) memset(&fx->xmm_space[0], 0, 256); /* @@ -169,25 +186,43 @@ void fpu__init_cpu_xstate(void) } /* + * Note that in the future we will likely need a pair of + * functions here: one for user xstates and the other for + * system xstates. For now, they are the same. + */ +static int xfeature_enabled(enum xfeature xfeature) +{ + return !!(xfeatures_mask & (1UL << xfeature)); +} + +/* * Record the offsets and sizes of various xstates contained * in the XSAVE state memory layout. - * - * ( Note that certain features might be non-present, for them - * we'll have 0 offset and 0 size. ) */ static void __init setup_xstate_features(void) { - u32 eax, ebx, ecx, edx, leaf; - - xfeatures_nr = fls64(xfeatures_mask); - - for (leaf = 2; leaf < xfeatures_nr; leaf++) { - cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); - - xstate_offsets[leaf] = ebx; - xstate_sizes[leaf] = eax; + u32 eax, ebx, ecx, edx, i; + /* start at the beginnning of the "extended state" */ + unsigned int last_good_offset = offsetof(struct xregs_state, + extended_state_area); + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i)) + continue; + + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + xstate_offsets[i] = ebx; + xstate_sizes[i] = eax; + /* + * In our xstate size checks, we assume that the + * highest-numbered xstate feature has the + * highest offset in the buffer. Ensure it does. + */ + WARN_ONCE(last_good_offset > xstate_offsets[i], + "x86/fpu: misordered xstate at %d\n", last_good_offset); + last_good_offset = xstate_offsets[i]; - printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %04x, xstate_sizes[%d]: %04x\n", leaf, ebx, leaf, eax); + printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, ebx, i, eax); } } @@ -204,14 +239,14 @@ static void __init print_xstate_feature(u64 xstate_mask) */ static void __init print_xstate_features(void) { - print_xstate_feature(XSTATE_FP); - print_xstate_feature(XSTATE_SSE); - print_xstate_feature(XSTATE_YMM); - print_xstate_feature(XSTATE_BNDREGS); - print_xstate_feature(XSTATE_BNDCSR); - print_xstate_feature(XSTATE_OPMASK); - print_xstate_feature(XSTATE_ZMM_Hi256); - print_xstate_feature(XSTATE_Hi16_ZMM); + print_xstate_feature(XFEATURE_MASK_FP); + print_xstate_feature(XFEATURE_MASK_SSE); + print_xstate_feature(XFEATURE_MASK_YMM); + print_xstate_feature(XFEATURE_MASK_BNDREGS); + print_xstate_feature(XFEATURE_MASK_BNDCSR); + print_xstate_feature(XFEATURE_MASK_OPMASK); + print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); + print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); } /* @@ -233,8 +268,8 @@ static void __init setup_xstate_comp(void) xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space); if (!cpu_has_xsaves) { - for (i = 2; i < xfeatures_nr; i++) { - if (test_bit(i, (unsigned long *)&xfeatures_mask)) { + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (xfeature_enabled(i)) { xstate_comp_offsets[i] = xstate_offsets[i]; xstate_comp_sizes[i] = xstate_sizes[i]; } @@ -242,15 +277,16 @@ static void __init setup_xstate_comp(void) return; } - xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE; + xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] = + FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = 2; i < xfeatures_nr; i++) { - if (test_bit(i, (unsigned long *)&xfeatures_mask)) + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (xfeature_enabled(i)) xstate_comp_sizes[i] = xstate_sizes[i]; else xstate_comp_sizes[i] = 0; - if (i > 2) + if (i > FIRST_EXTENDED_XFEATURE) xstate_comp_offsets[i] = xstate_comp_offsets[i-1] + xstate_comp_sizes[i-1]; @@ -262,7 +298,7 @@ static void __init setup_xstate_comp(void) */ static void __init setup_init_fpu_buf(void) { - static int on_boot_cpu = 1; + static int on_boot_cpu __initdata = 1; WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -290,27 +326,280 @@ static void __init setup_init_fpu_buf(void) copy_xregs_to_kernel_booting(&init_fpstate.xsave); } +static int xfeature_is_supervisor(int xfeature_nr) +{ + /* + * We currently do not support supervisor states, but if + * we did, we could find out like this. + * + * SDM says: If state component i is a user state component, + * ECX[0] return 0; if state component i is a supervisor + * state component, ECX[0] returns 1. + u32 eax, ebx, ecx, edx; + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx; + return !!(ecx & 1); + */ + return 0; +} +/* +static int xfeature_is_user(int xfeature_nr) +{ + return !xfeature_is_supervisor(xfeature_nr); +} +*/ + +/* + * This check is important because it is easy to get XSTATE_* + * confused with XSTATE_BIT_*. + */ +#define CHECK_XFEATURE(nr) do { \ + WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ + WARN_ON(nr >= XFEATURE_MAX); \ +} while (0) + +/* + * We could cache this like xstate_size[], but we only use + * it here, so it would be a waste of space. + */ +static int xfeature_is_aligned(int xfeature_nr) +{ + u32 eax, ebx, ecx, edx; + + CHECK_XFEATURE(xfeature_nr); + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + /* + * The value returned by ECX[1] indicates the alignment + * of state component i when the compacted format + * of the extended region of an XSAVE area is used + */ + return !!(ecx & 2); +} + +static int xfeature_uncompacted_offset(int xfeature_nr) +{ + u32 eax, ebx, ecx, edx; + + CHECK_XFEATURE(xfeature_nr); + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + return ebx; +} + +static int xfeature_size(int xfeature_nr) +{ + u32 eax, ebx, ecx, edx; + + CHECK_XFEATURE(xfeature_nr); + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + return eax; +} + +/* + * 'XSAVES' implies two different things: + * 1. saving of supervisor/system state + * 2. using the compacted format + * + * Use this function when dealing with the compacted format so + * that it is obvious which aspect of 'XSAVES' is being handled + * by the calling code. + */ +static int using_compacted_format(void) +{ + return cpu_has_xsaves; +} + +static void __xstate_dump_leaves(void) +{ + int i; + u32 eax, ebx, ecx, edx; + static int should_dump = 1; + + if (!should_dump) + return; + should_dump = 0; + /* + * Dump out a few leaves past the ones that we support + * just in case there are some goodies up there + */ + for (i = 0; i < XFEATURE_MAX + 10; i++) { + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", + XSTATE_CPUID, i, eax, ebx, ecx, edx); + } +} + +#define XSTATE_WARN_ON(x) do { \ + if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) { \ + __xstate_dump_leaves(); \ + } \ +} while (0) + +#define XCHECK_SZ(sz, nr, nr_macro, __struct) do { \ + if ((nr == nr_macro) && \ + WARN_ONCE(sz != sizeof(__struct), \ + "%s: struct is %zu bytes, cpu state %d bytes\n", \ + __stringify(nr_macro), sizeof(__struct), sz)) { \ + __xstate_dump_leaves(); \ + } \ +} while (0) + +/* + * We have a C struct for each 'xstate'. We need to ensure + * that our software representation matches what the CPU + * tells us about the state's size. + */ +static void check_xstate_against_struct(int nr) +{ + /* + * Ask the CPU for the size of the state. + */ + int sz = xfeature_size(nr); + /* + * Match each CPU state with the corresponding software + * structure. + */ + XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct); + XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state); + XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state); + XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state); + XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); + XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); + + /* + * Make *SURE* to add any feature numbers in below if + * there are "holes" in the xsave state component + * numbers. + */ + if ((nr < XFEATURE_YMM) || + (nr >= XFEATURE_MAX)) { + WARN_ONCE(1, "no structure for xstate: %d\n", nr); + XSTATE_WARN_ON(1); + } +} + +/* + * This essentially double-checks what the cpu told us about + * how large the XSAVE buffer needs to be. We are recalculating + * it to be safe. + */ +static void do_extra_xstate_size_checks(void) +{ + int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + int i; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i)) + continue; + + check_xstate_against_struct(i); + /* + * Supervisor state components can be managed only by + * XSAVES, which is compacted-format only. + */ + if (!using_compacted_format()) + XSTATE_WARN_ON(xfeature_is_supervisor(i)); + + /* Align from the end of the previous feature */ + if (xfeature_is_aligned(i)) + paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64); + /* + * The offset of a given state in the non-compacted + * format is given to us in a CPUID leaf. We check + * them for being ordered (increasing offsets) in + * setup_xstate_features(). + */ + if (!using_compacted_format()) + paranoid_xstate_size = xfeature_uncompacted_offset(i); + /* + * The compacted-format offset always depends on where + * the previous state ended. + */ + paranoid_xstate_size += xfeature_size(i); + } + XSTATE_WARN_ON(paranoid_xstate_size != xstate_size); +} + /* * Calculate total size of enabled xstates in XCR0/xfeatures_mask. + * + * Note the SDM's wording here. "sub-function 0" only enumerates + * the size of the *user* states. If we use it to size a buffer + * that we use 'XSAVES' on, we could potentially overflow the + * buffer because 'XSAVES' saves system states too. + * + * Note that we do not currently set any bits on IA32_XSS so + * 'XCR0 | IA32_XSS == XCR0' for now. */ -static void __init init_xstate_size(void) +static unsigned int __init calculate_xstate_size(void) { unsigned int eax, ebx, ecx, edx; - int i; + unsigned int calculated_xstate_size; if (!cpu_has_xsaves) { + /* + * - CPUID function 0DH, sub-function 0: + * EBX enumerates the size (in bytes) required by + * the XSAVE instruction for an XSAVE area + * containing all the *user* state components + * corresponding to bits currently set in XCR0. + */ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xstate_size = ebx; - return; + calculated_xstate_size = ebx; + } else { + /* + * - CPUID function 0DH, sub-function 1: + * EBX enumerates the size (in bytes) required by + * the XSAVES instruction for an XSAVE area + * containing all the state components + * corresponding to bits currently set in + * XCR0 | IA32_XSS. + */ + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + calculated_xstate_size = ebx; } + return calculated_xstate_size; +} - xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = 2; i < 64; i++) { - if (test_bit(i, (unsigned long *)&xfeatures_mask)) { - cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); - xstate_size += eax; - } - } +/* + * Will the runtime-enumerated 'xstate_size' fit in the init + * task's statically-allocated buffer? + */ +static bool is_supported_xstate_size(unsigned int test_xstate_size) +{ + if (test_xstate_size <= sizeof(union fpregs_state)) + return true; + + pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", + sizeof(union fpregs_state), test_xstate_size); + return false; +} + +static int init_xstate_size(void) +{ + /* Recompute the context size for enabled features: */ + unsigned int possible_xstate_size = calculate_xstate_size(); + + /* Ensure we have the space to store all enabled: */ + if (!is_supported_xstate_size(possible_xstate_size)) + return -EINVAL; + + /* + * The size is OK, we are definitely going to use xsave, + * make it known to the world that we need more space. + */ + xstate_size = possible_xstate_size; + do_extra_xstate_size_checks(); + return 0; +} + +/* + * We enabled the XSAVE hardware, but something went wrong and + * we can not use it. Disable it. + */ +static void fpu__init_disable_system_xstate(void) +{ + xfeatures_mask = 0; + cr4_clear_bits(X86_CR4_OSXSAVE); + fpu__xstate_clear_all_cpu_caps(); } /* @@ -320,7 +609,8 @@ static void __init init_xstate_size(void) void __init fpu__init_system_xstate(void) { unsigned int eax, ebx, ecx, edx; - static int on_boot_cpu = 1; + static int on_boot_cpu __initdata = 1; + int err; WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -338,26 +628,28 @@ void __init fpu__init_system_xstate(void) cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); xfeatures_mask = eax + ((u64)edx << 32); - if ((xfeatures_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { + if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); BUG(); } - /* Support only the state known to the OS: */ - xfeatures_mask = xfeatures_mask & XCNTXT_MASK; + xfeatures_mask &= fpu__get_supported_xfeatures_mask(); /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); - - /* Recompute the context size for enabled features: */ - init_xstate_size(); + err = init_xstate_size(); + if (err) { + /* something went wrong, boot without any XSAVE support */ + fpu__init_disable_system_xstate(); + return; + } update_regset_xstate_info(xstate_size, xfeatures_mask); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp(); - pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is 0x%x bytes, using '%s' format.\n", + pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", xfeatures_mask, xstate_size, cpu_has_xsaves ? "compacted" : "standard"); @@ -388,7 +680,7 @@ void fpu__resume_cpu(void) * Inputs: * xstate: the thread's storage area for all FPU data * xstate_feature: state which is defined in xsave.h (e.g. - * XSTATE_FP, XSTATE_SSE, etc...) + * XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...) * Output: * address of the state in the xsave area, or NULL if the * field is not present in the xsave buffer. @@ -402,7 +694,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) if (!boot_cpu_has(X86_FEATURE_XSAVE)) return NULL; - xsave = ¤t->thread.fpu.state.xsave; /* * We should not ever be requesting features that we * have not enabled. Remember that pcntxt_mask is @@ -439,8 +730,8 @@ EXPORT_SYMBOL_GPL(get_xsave_addr); * Note that this only works on the current task. * * Inputs: - * @xsave_state: state which is defined in xsave.h (e.g. XSTATE_FP, - * XSTATE_SSE, etc...) + * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP, + * XFEATURE_MASK_SSE, etc...) * Output: * address of the state in the xsave area or NULL if the state * is not present or is in its 'init state'. diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8b7b0a51e742..29408d6d6626 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -105,14 +105,14 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, { unsigned char replaced[MCOUNT_INSN_SIZE]; + ftrace_expected = old_code; + /* - * Note: Due to modules and __init, code can - * disappear and change, we need to protect against faulting - * as well as code changing. We do this by using the - * probe_kernel_* functions. - * - * No real locking needed, this code is run through - * kstop_machine, or before SMP starts. + * Note: + * We are paranoid about modifying text, as if a bug was to happen, it + * could cause us to read or write to someplace that could cause harm. + * Carefully read and modify the code with probe_kernel_*(), and make + * sure what we read is what we expected it to be before modifying it. */ /* read the text we want to modify */ @@ -154,6 +154,8 @@ int ftrace_make_nop(struct module *mod, if (addr == MCOUNT_ADDR) return ftrace_modify_code_direct(rec->ip, old, new); + ftrace_expected = NULL; + /* Normal cases use add_brk_on_nop */ WARN_ONCE(1, "invalid use of ftrace_make_nop"); return -EINVAL; @@ -220,6 +222,7 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { WARN_ON(1); + ftrace_expected = NULL; return -EINVAL; } @@ -314,6 +317,8 @@ static int add_break(unsigned long ip, const char *old) if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) return -EFAULT; + ftrace_expected = old; + /* Make sure it is what we expect it to be */ if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) return -EINVAL; @@ -413,6 +418,8 @@ static int remove_breakpoint(struct dyn_ftrace *rec) ftrace_addr = ftrace_get_addr_curr(rec); nop = ftrace_call_replace(ip, ftrace_addr); + ftrace_expected = nop; + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) return -EINVAL; } @@ -556,6 +563,7 @@ void ftrace_replace_code(int enable) run_sync(); report = "updating code"; + count = 0; for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); @@ -563,11 +571,13 @@ void ftrace_replace_code(int enable) ret = add_update(rec, enable); if (ret) goto remove_breakpoints; + count++; } run_sync(); report = "removing breakpoints"; + count = 0; for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); @@ -575,6 +585,7 @@ void ftrace_replace_code(int enable) ret = finish_update(rec, enable); if (ret) goto remove_breakpoints; + count++; } run_sync(); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index f129a9af6357..2c0f3407bd1f 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -192,5 +192,13 @@ void __init x86_64_start_reservations(char *real_mode_data) reserve_ebda_region(); + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_INTEL_MID: + x86_intel_mid_early_setup(); + break; + default: + break; + } + start_kernel(); } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 0e2d96ffd158..6bc9ae24b6d2 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -152,7 +152,7 @@ ENTRY(startup_32) movl %eax, pa(olpc_ofw_pgd) #endif -#ifdef CONFIG_MICROCODE_EARLY +#ifdef CONFIG_MICROCODE /* Early load ucode on BSP. */ call load_ucode_bsp #endif @@ -311,12 +311,11 @@ ENTRY(startup_32_smp) movl %eax,%ss leal -__PAGE_OFFSET(%ecx),%esp -#ifdef CONFIG_MICROCODE_EARLY +#ifdef CONFIG_MICROCODE /* Early load ucode on AP. */ call load_ucode_ap #endif - default_entry: #define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 1d40ca8a73f2..ffdc0e860390 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -65,6 +65,9 @@ startup_64: * tables and then reload them. */ + /* Sanitize CPU configuration */ + call verify_cpu + /* * Compute the delta between the address I am compiled to run at and the * address I am actually running at. @@ -174,6 +177,9 @@ ENTRY(secondary_startup_64) * after the boot processor executes this code. */ + /* Sanitize CPU configuration */ + call verify_cpu + movq $(init_level4_pgt - __START_KERNEL_map), %rax 1: @@ -288,6 +294,8 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq +#include "verify_cpu.S" + #ifdef CONFIG_HOTPLUG_CPU /* * Boot CPU0 entry point. It's called from play_dead(). Everything has been set diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 88b4da373081..b8e6ff5cd5d0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -37,10 +37,10 @@ */ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ -u8 hpet_msi_disable; +bool hpet_msi_disable; #ifdef CONFIG_PCI_MSI -static unsigned long hpet_num_timers; +static unsigned int hpet_num_timers; #endif static void __iomem *hpet_virt_address; @@ -86,9 +86,9 @@ static inline void hpet_clear_mapping(void) /* * HPET command line enable / disable */ -int boot_hpet_disable; -int hpet_force_user; -static int hpet_verbose; +bool boot_hpet_disable; +bool hpet_force_user; +static bool hpet_verbose; static int __init hpet_setup(char *str) { @@ -98,11 +98,11 @@ static int __init hpet_setup(char *str) if (next) *next++ = 0; if (!strncmp("disable", str, 7)) - boot_hpet_disable = 1; + boot_hpet_disable = true; if (!strncmp("force", str, 5)) - hpet_force_user = 1; + hpet_force_user = true; if (!strncmp("verbose", str, 7)) - hpet_verbose = 1; + hpet_verbose = true; str = next; } return 1; @@ -111,7 +111,7 @@ __setup("hpet=", hpet_setup); static int __init disable_hpet(char *str) { - boot_hpet_disable = 1; + boot_hpet_disable = true; return 1; } __setup("nohpet", disable_hpet); @@ -124,7 +124,7 @@ static inline int is_hpet_capable(void) /* * HPET timer interrupt enable / disable */ -static int hpet_legacy_int_enabled; +static bool hpet_legacy_int_enabled; /** * is_hpet_enabled - check whether the hpet timer interrupt is enabled @@ -230,7 +230,7 @@ static struct clock_event_device hpet_clockevent; static void hpet_stop_counter(void) { - unsigned long cfg = hpet_readl(HPET_CFG); + u32 cfg = hpet_readl(HPET_CFG); cfg &= ~HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); } @@ -272,7 +272,7 @@ static void hpet_enable_legacy_int(void) cfg |= HPET_CFG_LEGACY; hpet_writel(cfg, HPET_CFG); - hpet_legacy_int_enabled = 1; + hpet_legacy_int_enabled = true; } static void hpet_legacy_clockevent_register(void) @@ -983,7 +983,7 @@ void hpet_disable(void) cfg = *hpet_boot_cfg; else if (hpet_legacy_int_enabled) { cfg &= ~HPET_CFG_LEGACY; - hpet_legacy_int_enabled = 0; + hpet_legacy_int_enabled = false; } cfg &= ~HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); @@ -1121,8 +1121,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); static void hpet_disable_rtc_channel(void) { - unsigned long cfg; - cfg = hpet_readl(HPET_T1_CFG); + u32 cfg = hpet_readl(HPET_T1_CFG); cfg &= ~HPET_TN_ENABLE; hpet_writel(cfg, HPET_T1_CFG); } diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 50a3fad5b89f..2bcfb5f2bc44 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -300,6 +300,10 @@ static int arch_build_bp_info(struct perf_event *bp) return -EINVAL; if (bp->attr.bp_addr & (bp->attr.bp_len - 1)) return -EINVAL; + + if (!boot_cpu_has(X86_FEATURE_BPEXT)) + return -EOPNOTSUPP; + /* * It's impossible to use a range breakpoint to fake out * user vs kernel detection because bp_len - 1 can't @@ -307,8 +311,6 @@ static int arch_build_bp_info(struct perf_event *bp) * breakpoints, then we'll have to check for kprobe-blacklisted * addresses anywhere in the range. */ - if (!cpu_has_bpext) - return -EOPNOTSUPP; info->mask = bp->attr.bp_len - 1; info->len = X86_BREAKPOINT_LEN_1; } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 16cb827a5b27..be22f5a2192e 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -295,16 +295,11 @@ static void unmask_8259A(void) raw_spin_unlock_irqrestore(&i8259A_lock, flags); } -static void init_8259A(int auto_eoi) +static int probe_8259A(void) { unsigned long flags; unsigned char probe_val = ~(1 << PIC_CASCADE_IR); unsigned char new_val; - - i8259A_auto_eoi = auto_eoi; - - raw_spin_lock_irqsave(&i8259A_lock, flags); - /* * Check to see if we have a PIC. * Mask all except the cascade and read @@ -312,16 +307,28 @@ static void init_8259A(int auto_eoi) * have a PIC, we will read 0xff as opposed to the * value we wrote. */ + raw_spin_lock_irqsave(&i8259A_lock, flags); + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ outb(probe_val, PIC_MASTER_IMR); new_val = inb(PIC_MASTER_IMR); if (new_val != probe_val) { printk(KERN_INFO "Using NULL legacy PIC\n"); legacy_pic = &null_legacy_pic; - raw_spin_unlock_irqrestore(&i8259A_lock, flags); - return; } + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + return nr_legacy_irqs(); +} + +static void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ /* @@ -379,6 +386,10 @@ static int legacy_pic_irq_pending_noop(unsigned int irq) { return 0; } +static int legacy_pic_probe(void) +{ + return 0; +} struct legacy_pic null_legacy_pic = { .nr_legacy_irqs = 0, @@ -388,6 +399,7 @@ struct legacy_pic null_legacy_pic = { .mask_all = legacy_pic_noop, .restore_mask = legacy_pic_noop, .init = legacy_pic_int_noop, + .probe = legacy_pic_probe, .irq_pending = legacy_pic_irq_pending_noop, .make_irq = legacy_pic_uint_noop, }; @@ -400,6 +412,7 @@ struct legacy_pic default_legacy_pic = { .mask_all = mask_8259A, .restore_mask = unmask_8259A, .init = init_8259A, + .probe = probe_8259A, .irq_pending = i8259A_irq_pending, .make_irq = make_8259A_irq, }; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index f8062aaf5df9..61521dc19c10 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -462,7 +462,7 @@ void fixup_irqs(void) * non intr-remapping case, we can't wait till this interrupt * arrives at this cpu before completing the irq move. */ - irq_force_complete_move(irq); + irq_force_complete_move(desc); if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { break_affinity = 1; @@ -470,6 +470,15 @@ void fixup_irqs(void) } chip = irq_data_get_irq_chip(data); + /* + * The interrupt descriptor might have been cleaned up + * already, but it is not yet removed from the radix tree + */ + if (!chip) { + raw_spin_unlock(&desc->lock); + continue; + } + if (!irqd_can_move_in_process_context(data) && chip->irq_mask) chip->irq_mask(data); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index c767cf2bc80a..206d0b90a3ab 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -72,7 +72,7 @@ bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) { stack_overflow_check(regs); - if (unlikely(IS_ERR_OR_NULL(desc))) + if (IS_ERR_OR_NULL(desc)) return false; generic_handle_irq_desc(desc); diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index dc5fa6a1e8d6..3512ba607361 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -1,7 +1,7 @@ /* * x86 specific code for irq_work * - * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra */ #include <linux/kernel.h> diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index d6178d9791db..44256a62702b 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -511,26 +511,31 @@ single_step_cont(struct pt_regs *regs, struct die_args *args) return NOTIFY_STOP; } -static int was_in_debug_nmi[NR_CPUS]; +static DECLARE_BITMAP(was_in_debug_nmi, NR_CPUS); static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs) { + int cpu; + switch (cmd) { case NMI_LOCAL: if (atomic_read(&kgdb_active) != -1) { /* KGDB CPU roundup */ - kgdb_nmicallback(raw_smp_processor_id(), regs); - was_in_debug_nmi[raw_smp_processor_id()] = 1; + cpu = raw_smp_processor_id(); + kgdb_nmicallback(cpu, regs); + set_bit(cpu, was_in_debug_nmi); touch_nmi_watchdog(); + return NMI_HANDLED; } break; case NMI_UNKNOWN: - if (was_in_debug_nmi[raw_smp_processor_id()]) { - was_in_debug_nmi[raw_smp_processor_id()] = 0; + cpu = raw_smp_processor_id(); + + if (__test_and_clear_bit(cpu, was_in_debug_nmi)) return NMI_HANDLED; - } + break; default: /* do nothing */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2c7aafa70702..72cef58693c7 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -32,6 +32,7 @@ static int kvmclock = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; +static cycle_t kvm_sched_clock_offset; static int parse_no_kvmclock(char *arg) { @@ -44,6 +45,11 @@ early_param("no-kvmclock", parse_no_kvmclock); static struct pvclock_vsyscall_time_info *hv_clock; static struct pvclock_wall_clock wall_clock; +struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) +{ + return hv_clock; +} + /* * The wallclock is the time of day when we booted. Since then, some time may * have elapsed since the hypervisor wrote the data. So we try to account for @@ -92,6 +98,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs) return kvm_clock_read(); } +static cycle_t kvm_sched_clock_read(void) +{ + return kvm_clock_read() - kvm_sched_clock_offset; +} + +static inline void kvm_sched_clock_init(bool stable) +{ + if (!stable) { + pv_time_ops.sched_clock = kvm_clock_read; + return; + } + + kvm_sched_clock_offset = kvm_clock_read(); + pv_time_ops.sched_clock = kvm_sched_clock_read; + set_sched_clock_stable(); + + printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", + kvm_sched_clock_offset); + + BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); +} + /* * If we don't do that, there is the possibility that the guest * will calibrate under heavy load - thus, getting a lower lpj - @@ -248,7 +277,17 @@ void __init kvmclock_init(void) memblock_free(mem, size); return; } - pv_time_ops.sched_clock = kvm_clock_read; + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + + cpu = get_cpu(); + vcpu_time = &hv_clock[cpu].pvti; + flags = pvclock_read_flags(vcpu_time); + + kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); + put_cpu(); + x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; @@ -265,23 +304,12 @@ void __init kvmclock_init(void) kvm_get_preset_lpj(); clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; - - if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) - pvclock_set_flags(~0); - - cpu = get_cpu(); - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - if (flags & PVCLOCK_COUNTS_FROM_ZERO) - set_sched_clock_stable(); - put_cpu(); } int __init kvm_setup_vsyscall_timeinfo(void) { #ifdef CONFIG_X86_64 int cpu; - int ret; u8 flags; struct pvclock_vcpu_time_info *vcpu_time; unsigned int size; @@ -301,11 +329,6 @@ int __init kvm_setup_vsyscall_timeinfo(void) return 1; } - if ((ret = pvclock_init_vsyscall(hv_clock, size))) { - put_cpu(); - return ret; - } - put_cpu(); kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c index ff3c3101d003..92fc1a51f994 100644 --- a/arch/x86/kernel/livepatch.c +++ b/arch/x86/kernel/livepatch.c @@ -20,8 +20,6 @@ #include <linux/module.h> #include <linux/uaccess.h> -#include <asm/cacheflush.h> -#include <asm/page_types.h> #include <asm/elf.h> #include <asm/livepatch.h> @@ -38,12 +36,10 @@ int klp_write_module_reloc(struct module *mod, unsigned long type, unsigned long loc, unsigned long value) { - int ret, numpages, size = 4; - bool readonly; + size_t size = 4; unsigned long val; - unsigned long core = (unsigned long)mod->module_core; - unsigned long core_ro_size = mod->core_ro_size; - unsigned long core_size = mod->core_size; + unsigned long core = (unsigned long)mod->core_layout.base; + unsigned long core_size = mod->core_layout.size; switch (type) { case R_X86_64_NONE: @@ -70,21 +66,5 @@ int klp_write_module_reloc(struct module *mod, unsigned long type, /* loc does not point to any symbol inside the module */ return -EINVAL; - if (loc < core + core_ro_size) - readonly = true; - else - readonly = false; - - /* determine if the relocation spans a page boundary */ - numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2; - - if (readonly) - set_memory_rw(loc & PAGE_MASK, numpages); - - ret = probe_kernel_write((void *)loc, &val, size); - - if (readonly) - set_memory_ro(loc & PAGE_MASK, numpages); - - return ret; + return probe_kernel_write((void *)loc, &val, size); } diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 819ab3f9c9c7..ba7fbba9831b 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -385,6 +385,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) return image->fops->cleanup(image->image_loader_data); } +#ifdef CONFIG_KEXEC_VERIFY_SIG int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, unsigned long kernel_len) { @@ -395,6 +396,7 @@ int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, return image->fops->verify_sig(kernel, kernel_len); } +#endif /* * Apply purgatory relocations. diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index 94ea120fa21f..87e1762e2bca 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -278,6 +278,12 @@ trace: /* save_mcount_regs fills in first two parameters */ save_mcount_regs + /* + * When DYNAMIC_FTRACE is not defined, ARCH_SUPPORTS_FTRACE_OPS is not + * set (see include/asm/ftrace.h and include/linux/ftrace.h). Only the + * ip and parent ip are used and the list function is called when + * function tracing is enabled. + */ call *ftrace_trace_function restore_mcount_regs diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 113e70784854..64f9616f93f1 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -45,28 +45,6 @@ static struct class *msr_class; -static loff_t msr_seek(struct file *file, loff_t offset, int orig) -{ - loff_t ret; - struct inode *inode = file_inode(file); - - mutex_lock(&inode->i_mutex); - switch (orig) { - case SEEK_SET: - file->f_pos = offset; - ret = file->f_pos; - break; - case SEEK_CUR: - file->f_pos += offset; - ret = file->f_pos; - break; - default: - ret = -EINVAL; - } - mutex_unlock(&inode->i_mutex); - return ret; -} - static ssize_t msr_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -194,7 +172,7 @@ static int msr_open(struct inode *inode, struct file *file) */ static const struct file_operations msr_fops = { .owner = THIS_MODULE, - .llseek = msr_seek, + .llseek = no_seek_end_llseek, .read = msr_read, .write = msr_write, .open = msr_open, diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 697f90db0e37..8a2cdd736fa4 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -29,6 +29,7 @@ #include <asm/mach_traps.h> #include <asm/nmi.h> #include <asm/x86_init.h> +#include <asm/reboot.h> #define CREATE_TRACE_POINTS #include <trace/events/nmi.h> @@ -231,7 +232,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) #endif if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); + nmi_panic(regs, "NMI: Not continuing"); pr_emerg("Dazed and confused, but trying to continue\n"); @@ -255,8 +256,16 @@ io_check_error(unsigned char reason, struct pt_regs *regs) reason, smp_processor_id()); show_regs(regs); - if (panic_on_io_nmi) - panic("NMI IOCK error: Not continuing"); + if (panic_on_io_nmi) { + nmi_panic(regs, "NMI IOCK error: Not continuing"); + + /* + * If we end up here, it means we have received an NMI while + * processing panic(). Simply return without delaying and + * re-enabling NMIs. + */ + return; + } /* Re-enable the IOCK line, wait for a few seconds */ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK; @@ -297,7 +306,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) pr_emerg("Do you have a strange power saving mode enabled?\n"); if (unknown_nmi_panic || panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); + nmi_panic(regs, "NMI: Not continuing"); pr_emerg("Dazed and confused, but trying to continue\n"); } @@ -348,8 +357,19 @@ static void default_do_nmi(struct pt_regs *regs) return; } - /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ - raw_spin_lock(&nmi_reason_lock); + /* + * Non-CPU-specific NMI: NMI sources can be processed on any CPU. + * + * Another CPU may be processing panic routines while holding + * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping, + * and if so, call its callback directly. If there is no CPU preparing + * crash dump, we simply loop here. + */ + while (!raw_spin_trylock(&nmi_reason_lock)) { + run_crash_ipi_callback(regs); + cpu_relax(); + } + reason = x86_platform.get_nmi_reason(); if (reason & NMI_REASON_MASK) { diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c2130aef3f9d..f08ac28b8136 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -74,16 +74,6 @@ void __init default_banner(void) /* Undefined instruction for dealing with missing ops pointers. */ static const unsigned char ud2a[] = { 0x0f, 0x0b }; -unsigned paravirt_patch_nop(void) -{ - return 0; -} - -unsigned paravirt_patch_ignore(unsigned len) -{ - return len; -} - struct branch { unsigned char opcode; u32 delta; @@ -133,7 +123,6 @@ static void *get_call_destination(u8 type) .pv_time_ops = pv_time_ops, .pv_cpu_ops = pv_cpu_ops, .pv_irq_ops = pv_irq_ops, - .pv_apic_ops = pv_apic_ops, .pv_mmu_ops = pv_mmu_ops, #ifdef CONFIG_PARAVIRT_SPINLOCKS .pv_lock_ops = pv_lock_ops, @@ -152,8 +141,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, /* If there's no function, patch it with a ud2a (BUG) */ ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); else if (opfunc == _paravirt_nop) - /* If the operation is a nop, then nop the callsite */ - ret = paravirt_patch_nop(); + ret = 0; /* identity functions just return their single argument */ else if (opfunc == _paravirt_ident_32) @@ -162,10 +150,6 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, ret = paravirt_patch_ident_64(insnbuf, len); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || -#ifdef CONFIG_X86_32 - type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || -#endif - type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) /* If operation requires a jmp, then jmp */ ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); @@ -220,8 +204,6 @@ static u64 native_steal_clock(int cpu) /* These are in entry.S */ extern void native_iret(void); -extern void native_irq_enable_sysexit(void); -extern void native_usergs_sysret32(void); extern void native_usergs_sysret64(void); static struct resource reserve_ioports = { @@ -379,13 +361,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .load_sp0 = native_load_sp0, -#if defined(CONFIG_X86_32) - .irq_enable_sysexit = native_irq_enable_sysexit, -#endif #ifdef CONFIG_X86_64 -#ifdef CONFIG_IA32_EMULATION - .usergs_sysret32 = native_usergs_sysret32, -#endif .usergs_sysret64 = native_usergs_sysret64, #endif .iret = native_iret, @@ -403,12 +379,6 @@ NOKPROBE_SYMBOL(native_get_debugreg); NOKPROBE_SYMBOL(native_set_debugreg); NOKPROBE_SYMBOL(native_load_idt); -struct pv_apic_ops pv_apic_ops = { -#ifdef CONFIG_X86_LOCAL_APIC - .startup_ipi_hook = paravirt_nop, -#endif -}; - #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE) /* 32-bit pagetable entries */ #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32) @@ -444,9 +414,6 @@ struct pv_mmu_ops pv_mmu_ops = { .set_pmd = native_set_pmd, .set_pmd_at = native_set_pmd_at, .pte_update = paravirt_nop, - .pte_update_defer = paravirt_nop, - .pmd_update = paravirt_nop, - .pmd_update_defer = paravirt_nop, .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, @@ -492,6 +459,5 @@ struct pv_mmu_ops pv_mmu_ops = { EXPORT_SYMBOL_GPL(pv_time_ops); EXPORT_SYMBOL (pv_cpu_ops); EXPORT_SYMBOL (pv_mmu_ops); -EXPORT_SYMBOL_GPL(pv_apic_ops); EXPORT_SYMBOL_GPL(pv_info); EXPORT_SYMBOL (pv_irq_ops); diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index c89f50a76e97..158dc0650d5d 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -5,7 +5,6 @@ DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); DEF_NATIVE(pv_cpu_ops, iret, "iret"); -DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit"); DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); @@ -46,7 +45,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, restore_fl); PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_cpu_ops, iret); - PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 8aa05583bc42..e70087a04cc8 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -13,9 +13,7 @@ DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); -DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit"); DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); -DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl"); DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); DEF_NATIVE(, mov32, "mov %edi, %eax"); @@ -55,7 +53,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); - PATCH_SITE(pv_cpu_ops, usergs_sysret32); PATCH_SITE(pv_cpu_ops, usergs_sysret64); PATCH_SITE(pv_cpu_ops, swapgs); PATCH_SITE(pv_mmu_ops, read_cr2); diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 0497f719977d..833b1d329c47 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -180,13 +180,13 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl); static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl); static void get_tce_space_from_tar(void); -static struct cal_chipset_ops calgary_chip_ops = { +static const struct cal_chipset_ops calgary_chip_ops = { .handle_quirks = calgary_handle_quirks, .tce_cache_blast = calgary_tce_cache_blast, .dump_error_regs = calgary_dump_error_regs }; -static struct cal_chipset_ops calioc2_chip_ops = { +static const struct cal_chipset_ops calioc2_chip_ops = { .handle_quirks = calioc2_handle_quirks, .tce_cache_blast = calioc2_tce_cache_blast, .dump_error_regs = calioc2_dump_error_regs diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index cd99433b8ba1..6ba014c61d62 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -90,7 +90,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, again: page = NULL; /* CMA can be used only in the context which permits sleeping */ - if (flag & __GFP_WAIT) { + if (gfpflags_allow_blocking(flag)) { page = dma_alloc_from_contiguous(dev, count, get_order(size)); if (page && page_to_phys(page) + size > dma_mask) { dma_release_from_contiguous(dev, page, count); diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index adf0392d549a..7c577a178859 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -88,7 +88,7 @@ int __init pci_swiotlb_detect_4gb(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if (!no_iommu && max_pfn > MAX_DMA32_PFN) + if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif return swiotlb; diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 4f00b63d7ff3..14415aff1813 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -4,10 +4,22 @@ */ #include <linux/platform_device.h> #include <linux/module.h> +#include <linux/ioport.h> + +static int found(u64 start, u64 end, void *data) +{ + return 1; +} static __init int register_e820_pmem(void) { + char *pmem = "Persistent Memory (legacy)"; struct platform_device *pdev; + int rc; + + rc = walk_iomem_res(pmem, IORESOURCE_MEM, 0, -1, NULL, found); + if (rc <= 0) + return 0; /* * See drivers/nvdimm/e820.c for the implementation, this is diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 737527b40e5b..9f950917528b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -280,14 +280,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) set_iopl_mask(next->iopl); /* - * If it were not for PREEMPT_ACTIVE we could guarantee that the - * preempt_count of all tasks was equal here and this would not be - * needed. - */ - task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); - this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); - - /* * Now maybe handle debug registers and/or IO bitmaps */ if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b35921a670b2..b9d99e0f82c4 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -125,7 +125,7 @@ void release_thread(struct task_struct *dead_task) if (dead_task->mm->context.ldt) { pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", dead_task->comm, - dead_task->mm->context.ldt, + dead_task->mm->context.ldt->entries, dead_task->mm->context.ldt->size); BUG(); } @@ -332,7 +332,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch FS and GS. * - * These are even more complicated than FS and GS: they have + * These are even more complicated than DS and ES: they have * 64-bit bases are that controlled by arch_prctl. Those bases * only differ from the values in the GDT or LDT if the selector * is 0. @@ -401,14 +401,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ this_cpu_write(current_task, next_p); - /* - * If it were not for PREEMPT_ACTIVE we could guarantee that the - * preempt_count of all tasks was equal here and this would not be - * needed. - */ - task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); - this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); - /* Reload esp0 and ss1. This changes current_thread_info(). */ load_sp0(tss, next); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 558f50edebca..32e9d9cbb884 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -124,21 +124,6 @@ const char *regs_query_register_name(unsigned int offset) return NULL; } -static const int arg_offs_table[] = { -#ifdef CONFIG_X86_32 - [0] = offsetof(struct pt_regs, ax), - [1] = offsetof(struct pt_regs, dx), - [2] = offsetof(struct pt_regs, cx) -#else /* CONFIG_X86_64 */ - [0] = offsetof(struct pt_regs, di), - [1] = offsetof(struct pt_regs, si), - [2] = offsetof(struct pt_regs, dx), - [3] = offsetof(struct pt_regs, cx), - [4] = offsetof(struct pt_regs, r8), - [5] = offsetof(struct pt_regs, r9) -#endif -}; - /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 2f355d229a58..99bfc025111d 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -140,27 +140,3 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } - -#ifdef CONFIG_X86_64 -/* - * Initialize the generic pvclock vsyscall state. This will allocate - * a/some page(s) for the per-vcpu pvclock information, set up a - * fixmap mapping for the page(s) - */ - -int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, - int size) -{ - int idx; - - WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); - - for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { - __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, - __pa(i) + (idx*PAGE_SIZE), - PAGE_KERNEL_VVAR); - } - - return 0; -} -#endif diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 176a0f99d4da..cc457ff818ad 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -524,7 +524,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E6XX_CU, */ static void force_disable_hpet_msi(struct pci_dev *unused) { - hpet_msi_disable = 1; + hpet_msi_disable = true; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 02693dd9a079..ab0adc0fa5db 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -182,6 +182,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), }, }, + { /* Handle problems with rebooting on the iMac10,1. */ + .callback = set_pci_reboot, + .ident = "Apple iMac10,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"), + }, + }, /* ASRock */ { /* Handle problems with rebooting on ASRock Q1900DC-ITX */ @@ -718,6 +726,7 @@ static int crashing_cpu; static nmi_shootdown_cb shootdown_callback; static atomic_t waiting_for_crash_ipi; +static int crash_ipi_issued; static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) { @@ -780,6 +789,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) smp_send_nmi_allbutself(); + /* Kick CPUs looping in NMI context. */ + WRITE_ONCE(crash_ipi_issued, 1); + msecs = 1000; /* Wait at most a second for the other cpus to stop */ while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { mdelay(1); @@ -788,9 +800,35 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) /* Leave the nmi callback set */ } + +/* + * Check if the crash dumping IPI got issued and if so, call its callback + * directly. This function is used when we have already been in NMI handler. + * It doesn't return. + */ +void run_crash_ipi_callback(struct pt_regs *regs) +{ + if (crash_ipi_issued) + crash_nmi_callback(0, regs); +} + +/* Override the weak function in kernel/panic.c */ +void nmi_panic_self_stop(struct pt_regs *regs) +{ + while (1) { + /* If no CPU is preparing crash dump, we simply loop here. */ + run_crash_ipi_callback(regs); + cpu_relax(); + } +} + #else /* !CONFIG_SMP */ void nmi_shootdown_cpus(nmi_shootdown_cb callback) { /* No other CPUs to shoot down */ } + +void run_crash_ipi_callback(struct pt_regs *regs) +{ +} #endif diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index cd9685235df9..4af8d063fb36 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -200,6 +200,9 @@ static __init int add_rtc_cmos(void) } #endif + if (paravirt_enabled() && !paravirt_has(RTC)) + return -ENODEV; + platform_device_register(&rtc_device); dev_info(&rtc_device.dev, "registered platform RTC device (no PNP device found)\n"); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a3cccbfc5f77..d3d80e6d42a2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -111,6 +111,7 @@ #include <asm/mce.h> #include <asm/alternative.h> #include <asm/prom.h> +#include <asm/microcode.h> /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -480,34 +481,34 @@ static void __init memblock_x86_reserve_range_setup_data(void) #ifdef CONFIG_KEXEC_CORE +/* 16M alignment for crash kernel regions */ +#define CRASH_ALIGN (16 << 20) + /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. * On 64bit, old kexec-tools need to under 896MiB. */ #ifdef CONFIG_X86_32 -# define CRASH_KERNEL_ADDR_LOW_MAX (512 << 20) -# define CRASH_KERNEL_ADDR_HIGH_MAX (512 << 20) +# define CRASH_ADDR_LOW_MAX (512 << 20) +# define CRASH_ADDR_HIGH_MAX (512 << 20) #else -# define CRASH_KERNEL_ADDR_LOW_MAX (896UL<<20) -# define CRASH_KERNEL_ADDR_HIGH_MAX MAXMEM +# define CRASH_ADDR_LOW_MAX (896UL << 20) +# define CRASH_ADDR_HIGH_MAX MAXMEM #endif -static void __init reserve_crashkernel_low(void) +static int __init reserve_crashkernel_low(void) { #ifdef CONFIG_X86_64 - const unsigned long long alignment = 16<<20; /* 16M */ - unsigned long long low_base = 0, low_size = 0; + unsigned long long base, low_base = 0, low_size = 0; unsigned long total_low_mem; - unsigned long long base; - bool auto_set = false; int ret; - total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); + total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); + /* crashkernel=Y,low */ - ret = parse_crashkernel_low(boot_command_line, total_low_mem, - &low_size, &base); - if (ret != 0) { + ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); + if (ret) { /* * two parts from lib/swiotlb.c: * -swiotlb size: user-specified with swiotlb= or default. @@ -517,52 +518,52 @@ static void __init reserve_crashkernel_low(void) * make sure we allocate enough extra low memory so that we * don't run out of DMA buffers for 32-bit devices. */ - low_size = max(swiotlb_size_or_default() + (8UL<<20), 256UL<<20); - auto_set = true; + low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); } else { /* passed with crashkernel=0,low ? */ if (!low_size) - return; + return 0; } - low_base = memblock_find_in_range(low_size, (1ULL<<32), - low_size, alignment); - + low_base = memblock_find_in_range(low_size, 1ULL << 32, low_size, CRASH_ALIGN); if (!low_base) { - if (!auto_set) - pr_info("crashkernel low reservation failed - No suitable area found.\n"); + pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", + (unsigned long)(low_size >> 20)); + return -ENOMEM; + } - return; + ret = memblock_reserve(low_base, low_size); + if (ret) { + pr_err("%s: Error reserving crashkernel low memblock.\n", __func__); + return ret; } - memblock_reserve(low_base, low_size); pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", - (unsigned long)(low_size >> 20), - (unsigned long)(low_base >> 20), - (unsigned long)(total_low_mem >> 20)); + (unsigned long)(low_size >> 20), + (unsigned long)(low_base >> 20), + (unsigned long)(total_low_mem >> 20)); + crashk_low_res.start = low_base; crashk_low_res.end = low_base + low_size - 1; insert_resource(&iomem_resource, &crashk_low_res); #endif + return 0; } static void __init reserve_crashkernel(void) { - const unsigned long long alignment = 16<<20; /* 16M */ - unsigned long long total_mem; - unsigned long long crash_size, crash_base; + unsigned long long crash_size, crash_base, total_mem; bool high = false; int ret; total_mem = memblock_phys_mem_size(); /* crashkernel=XM */ - ret = parse_crashkernel(boot_command_line, total_mem, - &crash_size, &crash_base); + ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); if (ret != 0 || crash_size <= 0) { /* crashkernel=X,high */ ret = parse_crashkernel_high(boot_command_line, total_mem, - &crash_size, &crash_base); + &crash_size, &crash_base); if (ret != 0 || crash_size <= 0) return; high = true; @@ -573,11 +574,10 @@ static void __init reserve_crashkernel(void) /* * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX */ - crash_base = memblock_find_in_range(alignment, - high ? CRASH_KERNEL_ADDR_HIGH_MAX : - CRASH_KERNEL_ADDR_LOW_MAX, - crash_size, alignment); - + crash_base = memblock_find_in_range(CRASH_ALIGN, + high ? CRASH_ADDR_HIGH_MAX + : CRASH_ADDR_LOW_MAX, + crash_size, CRASH_ALIGN); if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; @@ -587,26 +587,32 @@ static void __init reserve_crashkernel(void) unsigned long long start; start = memblock_find_in_range(crash_base, - crash_base + crash_size, crash_size, 1<<20); + crash_base + crash_size, + crash_size, 1 << 20); if (start != crash_base) { pr_info("crashkernel reservation failed - memory is in use.\n"); return; } } - memblock_reserve(crash_base, crash_size); + ret = memblock_reserve(crash_base, crash_size); + if (ret) { + pr_err("%s: Error reserving crashkernel memblock.\n", __func__); + return; + } - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " - "for crashkernel (System RAM: %ldMB)\n", - (unsigned long)(crash_size >> 20), - (unsigned long)(crash_base >> 20), - (unsigned long)(total_mem >> 20)); + if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { + memblock_free(crash_base, crash_size); + return; + } + + pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(total_mem >> 20)); crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; insert_resource(&iomem_resource, &crashk_res); - - if (crash_base >= (1ULL<<32)) - reserve_crashkernel_low(); } #else static void __init reserve_crashkernel(void) @@ -1042,6 +1048,8 @@ void __init setup_arch(char **cmdline_p) if (mtrr_trim_uncached_memory(max_pfn)) max_pfn = e820_end_of_ram_pfn(); + max_possible_pfn = max_pfn; + #ifdef CONFIG_X86_32 /* max_low_pfn get updated here */ find_low_pfn_range(); @@ -1079,8 +1087,10 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); memblock_x86_fill(); - if (efi_enabled(EFI_BOOT)) + if (efi_enabled(EFI_BOOT)) { + efi_fake_memmap(); efi_find_mirror(); + } /* * The EFI specification says that boot service code won't be called @@ -1180,7 +1190,7 @@ void __init setup_arch(char **cmdline_p) */ clone_pgd_range(initial_page_table, swapper_pg_dir + KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); + min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); #endif tboot_probe(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index da52e6bb5c7f..cb6282c3638f 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -63,6 +63,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) { + unsigned long buf_val; void __user *buf; unsigned int tmpflags; unsigned int err = 0; @@ -107,7 +108,8 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); regs->orig_ax = -1; /* disable syscall checks */ - get_user_ex(buf, &sc->fpstate); + get_user_ex(buf_val, &sc->fpstate); + buf = (void __user *)buf_val; } get_user_catch(err); err |= fpu__restore_sig(buf, config_enabled(CONFIG_X86_32)); @@ -196,7 +198,7 @@ static unsigned long align_sigframe(unsigned long sp) return sp; } -static inline void __user * +static void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { @@ -299,7 +301,7 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, if (current->mm->context.vdso) restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_sigreturn; + vdso_image_32.sym___kernel_sigreturn; else restorer = &frame->retcode; if (ksig->ka.sa.sa_flags & SA_RESTORER) @@ -363,7 +365,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, /* Set up to return from userspace. */ restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_rt_sigreturn; + vdso_image_32.sym___kernel_rt_sigreturn; if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; put_user_ex(restorer, &frame->pretcode); @@ -688,12 +690,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) signal_setup_done(failed, ksig, stepping); } -#ifdef CONFIG_X86_32 -#define NR_restart_syscall __NR_restart_syscall -#else /* !CONFIG_X86_32 */ -#define NR_restart_syscall \ - test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall -#endif /* CONFIG_X86_32 */ +static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) +{ +#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64) + return __NR_restart_syscall; +#else /* !CONFIG_X86_32 && CONFIG_X86_64 */ + return test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : + __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); +#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */ +} /* * Note that 'init' is a special process: it doesn't get signals it doesn't @@ -722,7 +727,7 @@ void do_signal(struct pt_regs *regs) break; case -ERESTART_RESTARTBLOCK: - regs->ax = NR_restart_syscall; + regs->ax = get_nr_restart_syscall(regs); regs->ip -= 2; break; } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 12c8286206ce..658777cf3851 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -125,12 +125,12 @@ static void native_smp_send_reschedule(int cpu) WARN_ON(1); return; } - apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); + apic->send_IPI(cpu, RESCHEDULE_VECTOR); } void native_send_call_func_single_ipi(int cpu) { - apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); + apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR); } void native_send_call_func_ipi(const struct cpumask *mask) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 892ee2e5ecbc..24d57f77b3c1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -304,7 +304,7 @@ do { \ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (cpu_has_topoext) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; if (c->phys_proc_id == o->phys_proc_id && @@ -509,7 +509,7 @@ void __inquire_remote_apic(int apicid) */ #define UDELAY_10MS_DEFAULT 10000 -static unsigned int init_udelay = INT_MAX; +static unsigned int init_udelay = UINT_MAX; static int __init cpu_init_udelay(char *str) { @@ -522,14 +522,15 @@ early_param("cpu_init_udelay", cpu_init_udelay); static void __init smp_quirk_init_udelay(void) { /* if cmdline changed it from default, leave it alone */ - if (init_udelay != INT_MAX) + if (init_udelay != UINT_MAX) return; /* if modern processor, use no delay */ if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || - ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) + ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) { init_udelay = 0; - + return; + } /* else, use legacy delay */ init_udelay = UDELAY_10MS_DEFAULT; } @@ -629,13 +630,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) num_starts = 0; /* - * Paravirt / VMI wants a startup IPI hook here to set up the - * target processor state. - */ - startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, - stack_start); - - /* * Run STARTUP IPI loop. */ pr_debug("#startup loops: %d\n", num_starts); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 346eec73f7db..ade185a46b1d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -361,7 +361,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) { - const struct bndcsr *bndcsr; + const struct mpx_bndcsr *bndcsr; siginfo_t *info; RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); @@ -384,7 +384,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) * which is all zeros which indicates MPX was not * responsible for the exception. */ - bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); if (!bndcsr) goto exit_trap; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c3f7602cd038..3d743da828d3 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -168,21 +168,20 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) * ns = cycles * cyc2ns_scale / SC * * And since SC is a constant power of two, we can convert the div - * into a shift. + * into a shift. The larger SC is, the more accurate the conversion, but + * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication + * (64-bit result) can be used. * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * We can use khz divisor instead of mhz to keep a better precision. * (mathieu.desnoyers@polymtl.ca) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - static void cyc2ns_data_init(struct cyc2ns_data *data) { data->cyc2ns_mul = 0; - data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_shift = 0; data->cyc2ns_offset = 0; data->__count = 0; } @@ -216,14 +215,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) if (likely(data == tail)) { ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); } else { data->__count++; barrier(); ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); barrier(); @@ -257,12 +256,22 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) * time function is continuous; see the comment near struct * cyc2ns_data. */ - data->cyc2ns_mul = - DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, - cpu_khz); - data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz, + NSEC_PER_MSEC, 0); + + /* + * cyc2ns_shift is exported via arch_perf_update_userpage() where it is + * not expected to be greater than 31 due to the original published + * conversion algorithm shifting a 32-bit value (now specifies a 64-bit + * value) - refer perf_event_mmap_page documentation in perf_event.h. + */ + if (data->cyc2ns_shift == 32) { + data->cyc2ns_shift = 31; + data->cyc2ns_mul >>= 1; + } + data->cyc2ns_offset = ns_now - - mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift); cyc2ns_write_end(cpu, data); @@ -1176,8 +1185,6 @@ void __init tsc_init(void) u64 lpj; int cpu; - x86_init.timers.tsc_pre_init(); - if (!cpu_has_tsc) { setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index b9242bacbe59..07efb35ee4bc 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -34,10 +34,11 @@ #include <asm/msr-index.h> verify_cpu: - pushfl # Save caller passed flags - pushl $0 # Kill any dangerous flags - popfl + pushf # Save caller passed flags + push $0 # Kill any dangerous flags + popf +#ifndef __x86_64__ pushfl # standard way to check for cpuid popl %eax movl %eax,%ebx @@ -47,30 +48,31 @@ verify_cpu: pushfl popl %eax cmpl %eax,%ebx - jz verify_cpu_no_longmode # cpu has no cpuid + jz .Lverify_cpu_no_longmode # cpu has no cpuid +#endif movl $0x0,%eax # See if cpuid 1 is implemented cpuid cmpl $0x1,%eax - jb verify_cpu_no_longmode # no cpuid 1 + jb .Lverify_cpu_no_longmode # no cpuid 1 xor %di,%di cmpl $0x68747541,%ebx # AuthenticAMD - jnz verify_cpu_noamd + jnz .Lverify_cpu_noamd cmpl $0x69746e65,%edx - jnz verify_cpu_noamd + jnz .Lverify_cpu_noamd cmpl $0x444d4163,%ecx - jnz verify_cpu_noamd + jnz .Lverify_cpu_noamd mov $1,%di # cpu is from AMD - jmp verify_cpu_check + jmp .Lverify_cpu_check -verify_cpu_noamd: +.Lverify_cpu_noamd: cmpl $0x756e6547,%ebx # GenuineIntel? - jnz verify_cpu_check + jnz .Lverify_cpu_check cmpl $0x49656e69,%edx - jnz verify_cpu_check + jnz .Lverify_cpu_check cmpl $0x6c65746e,%ecx - jnz verify_cpu_check + jnz .Lverify_cpu_check # only call IA32_MISC_ENABLE when: # family > 6 || (family == 6 && model >= 0xd) @@ -81,59 +83,59 @@ verify_cpu_noamd: andl $0x0ff00f00, %eax # mask family and extended family shrl $8, %eax cmpl $6, %eax - ja verify_cpu_clear_xd # family > 6, ok - jb verify_cpu_check # family < 6, skip + ja .Lverify_cpu_clear_xd # family > 6, ok + jb .Lverify_cpu_check # family < 6, skip andl $0x000f00f0, %ecx # mask model and extended model shrl $4, %ecx cmpl $0xd, %ecx - jb verify_cpu_check # family == 6, model < 0xd, skip + jb .Lverify_cpu_check # family == 6, model < 0xd, skip -verify_cpu_clear_xd: +.Lverify_cpu_clear_xd: movl $MSR_IA32_MISC_ENABLE, %ecx rdmsr btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE - jnc verify_cpu_check # only write MSR if bit was changed + jnc .Lverify_cpu_check # only write MSR if bit was changed wrmsr -verify_cpu_check: +.Lverify_cpu_check: movl $0x1,%eax # Does the cpu have what it takes cpuid andl $REQUIRED_MASK0,%edx xorl $REQUIRED_MASK0,%edx - jnz verify_cpu_no_longmode + jnz .Lverify_cpu_no_longmode movl $0x80000000,%eax # See if extended cpuid is implemented cpuid cmpl $0x80000001,%eax - jb verify_cpu_no_longmode # no extended cpuid + jb .Lverify_cpu_no_longmode # no extended cpuid movl $0x80000001,%eax # Does the cpu have what it takes cpuid andl $REQUIRED_MASK1,%edx xorl $REQUIRED_MASK1,%edx - jnz verify_cpu_no_longmode + jnz .Lverify_cpu_no_longmode -verify_cpu_sse_test: +.Lverify_cpu_sse_test: movl $1,%eax cpuid andl $SSE_MASK,%edx cmpl $SSE_MASK,%edx - je verify_cpu_sse_ok + je .Lverify_cpu_sse_ok test %di,%di - jz verify_cpu_no_longmode # only try to force SSE on AMD + jz .Lverify_cpu_no_longmode # only try to force SSE on AMD movl $MSR_K7_HWCR,%ecx rdmsr btr $15,%eax # enable SSE wrmsr xor %di,%di # don't loop - jmp verify_cpu_sse_test # try again + jmp .Lverify_cpu_sse_test # try again -verify_cpu_no_longmode: - popfl # Restore caller passed flags +.Lverify_cpu_no_longmode: + popf # Restore caller passed flags movl $1,%eax ret -verify_cpu_sse_ok: - popfl # Restore caller passed flags +.Lverify_cpu_sse_ok: + popf # Restore caller passed flags xorl %eax, %eax ret diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 524619351961..e574b8546518 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -175,7 +175,11 @@ static void mark_screen_rdonly(struct mm_struct *mm) if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); - split_huge_page_pmd_mm(mm, 0xA0000, pmd); + + if (pmd_trans_huge(*pmd)) { + struct vm_area_struct *vma = find_vma(mm, 0xA0000); + split_huge_pmd(vma, pmd, 0xA0000); + } if (pmd_none_or_clear_bad(pmd)) goto out; pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); @@ -357,8 +361,10 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) tss = &per_cpu(cpu_tss, get_cpu()); /* make room for real-mode segments */ tsk->thread.sp0 += 16; - if (cpu_has_sep) + + if (static_cpu_has_safe(X86_FEATURE_SEP)) tsk->thread.sysenter_cs = 0; + load_sp0(tss, &tsk->thread); put_cpu(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 3839628d962e..dad5fe9633a3 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -68,7 +68,6 @@ struct x86_init_ops x86_init __initdata = { .timers = { .setup_percpu_clockev = setup_boot_APIC_clock, - .tsc_pre_init = x86_init_noop, .timer_init = hpet_time_init, .wallclock_init = x86_init_noop, }, diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index d8a1d56276e1..639a6e34500c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -28,6 +28,8 @@ config KVM select ANON_INODES select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD + select IRQ_BYPASS_MANAGER + select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c index d090ecf08809..9dc091acd5fb 100644 --- a/arch/x86/kvm/assigned-dev.c +++ b/arch/x86/kvm/assigned-dev.c @@ -21,6 +21,7 @@ #include <linux/fs.h> #include "irq.h" #include "assigned-dev.h" +#include "trace/events/kvm.h" struct kvm_assigned_dev_kernel { struct kvm_irq_ack_notifier ack_notifier; @@ -131,7 +132,42 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) return IRQ_HANDLED; } -#ifdef __KVM_HAVE_MSI +/* + * Deliver an IRQ in an atomic context if we can, or return a failure, + * user can retry in a process context. + * Return value: + * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. + * Other values - No need to retry. + */ +static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, + int level) +{ + struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; + struct kvm_kernel_irq_routing_entry *e; + int ret = -EINVAL; + int idx; + + trace_kvm_set_irq(irq, level, irq_source_id); + + /* + * Injection into either PIC or IOAPIC might need to scan all CPUs, + * which would need to be retried from thread context; when same GSI + * is connected to both PIC and IOAPIC, we'd have to report a + * partial failure here. + * Since there's no easy way to do this, we only support injecting MSI + * which is limited to 1:1 GSI mapping. + */ + idx = srcu_read_lock(&kvm->irq_srcu); + if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { + e = &entries[0]; + ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, + irq, level); + } + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + + static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; @@ -150,9 +186,7 @@ static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) return IRQ_HANDLED; } -#endif -#ifdef __KVM_HAVE_MSIX static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; @@ -183,7 +217,6 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) return IRQ_HANDLED; } -#endif /* Ack the irq line for an assigned device */ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) @@ -386,7 +419,6 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, return 0; } -#ifdef __KVM_HAVE_MSI static int assigned_device_enable_host_msi(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { @@ -408,9 +440,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, return 0; } -#endif -#ifdef __KVM_HAVE_MSIX static int assigned_device_enable_host_msix(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { @@ -443,8 +473,6 @@ err: return r; } -#endif - static int assigned_device_enable_guest_intx(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -454,7 +482,6 @@ static int assigned_device_enable_guest_intx(struct kvm *kvm, return 0; } -#ifdef __KVM_HAVE_MSI static int assigned_device_enable_guest_msi(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -463,9 +490,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm, dev->ack_notifier.gsi = -1; return 0; } -#endif -#ifdef __KVM_HAVE_MSIX static int assigned_device_enable_guest_msix(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -474,7 +499,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm, dev->ack_notifier.gsi = -1; return 0; } -#endif static int assign_host_irq(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, @@ -492,16 +516,12 @@ static int assign_host_irq(struct kvm *kvm, case KVM_DEV_IRQ_HOST_INTX: r = assigned_device_enable_host_intx(kvm, dev); break; -#ifdef __KVM_HAVE_MSI case KVM_DEV_IRQ_HOST_MSI: r = assigned_device_enable_host_msi(kvm, dev); break; -#endif -#ifdef __KVM_HAVE_MSIX case KVM_DEV_IRQ_HOST_MSIX: r = assigned_device_enable_host_msix(kvm, dev); break; -#endif default: r = -EINVAL; } @@ -534,16 +554,12 @@ static int assign_guest_irq(struct kvm *kvm, case KVM_DEV_IRQ_GUEST_INTX: r = assigned_device_enable_guest_intx(kvm, dev, irq); break; -#ifdef __KVM_HAVE_MSI case KVM_DEV_IRQ_GUEST_MSI: r = assigned_device_enable_guest_msi(kvm, dev, irq); break; -#endif -#ifdef __KVM_HAVE_MSIX case KVM_DEV_IRQ_GUEST_MSIX: r = assigned_device_enable_guest_msix(kvm, dev, irq); break; -#endif default: r = -EINVAL; } @@ -826,7 +842,6 @@ out: } -#ifdef __KVM_HAVE_MSIX static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, struct kvm_assigned_msix_nr *entry_nr) { @@ -906,7 +921,6 @@ msix_entry_out: return r; } -#endif static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, struct kvm_assigned_pci_dev *assigned_dev) @@ -1012,7 +1026,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } -#ifdef __KVM_HAVE_MSIX case KVM_ASSIGN_SET_MSIX_NR: { struct kvm_assigned_msix_nr entry_nr; r = -EFAULT; @@ -1033,7 +1046,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } -#endif case KVM_ASSIGN_SET_INTX_MASK: { struct kvm_assigned_pci_dev assigned_dev; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2fbea2544f24..6525e926f566 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -30,7 +30,7 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; - xstate_bv &= XSTATE_EXTEND_MASK; + xstate_bv &= XFEATURE_MASK_EXTEND; while (xstate_bv) { if (xstate_bv & 0x1) { u32 eax, ebx, ecx, edx, offset; @@ -51,7 +51,7 @@ u64 kvm_supported_xcr0(void) u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0; if (!kvm_x86_ops->mpx_supported()) - xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR); + xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); return xcr0; } @@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD); + F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT); /* cpuid 0xD.1.eax */ const u32 kvm_supported_word10_x86_features = diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index dd05b9cef6ae..c8eda1498121 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -2,6 +2,7 @@ #define ARCH_X86_KVM_CPUID_H #include "x86.h" +#include <asm/cpu.h> int kvm_update_cpuid(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, @@ -38,6 +39,14 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) return best && (best->ecx & bit(X86_FEATURE_XSAVE)); } +static inline bool guest_cpuid_has_mtrr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->edx & bit(X86_FEATURE_MTRR)); +} + static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -133,4 +142,74 @@ static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_MPX)); } + +static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_PCOMMIT)); +} + +static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->edx & bit(X86_FEATURE_RDTSCP)); +} + +/* + * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 + */ +#define BIT_NRIPS 3 + +static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0); + + /* + * NRIPS is a scattered cpuid feature, so we can't use + * X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit + * position 8, not 3). + */ + return best && (best->edx & bit(BIT_NRIPS)); +} +#undef BIT_NRIPS + +static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + if (!best) + return -1; + + return x86_family(best->eax); +} + +static inline int guest_cpuid_model(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + if (!best) + return -1; + + return x86_model(best->eax); +} + +static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + if (!best) + return -1; + + return x86_stepping(best->eax); +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9da95b9daf8d..1505587d06e9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2272,8 +2272,8 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) #define GET_SMSTATE(type, smbase, offset) \ ({ \ type __val; \ - int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \ - sizeof(__val), NULL); \ + int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \ + sizeof(__val)); \ if (r != X86EMUL_CONTINUE) \ return X86EMUL_UNHANDLEABLE; \ __val; \ @@ -2484,17 +2484,36 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) /* * Get back to real mode, to prepare a safe state in which to load - * CR0/CR3/CR4/EFER. Also this will ensure that addresses passed - * to read_std/write_std are not virtual. - * - * CR4.PCIDE must be zero, because it is a 64-bit mode only feature. + * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU + * supports long mode. */ + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (emulator_has_longmode(ctxt)) { + struct desc_struct cs_desc; + + /* Zero CR4.PCIDE before CR0.PG. */ + if (cr4 & X86_CR4_PCIDE) { + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); + cr4 &= ~X86_CR4_PCIDE; + } + + /* A 32-bit code segment is required to clear EFER.LMA. */ + memset(&cs_desc, 0, sizeof(cs_desc)); + cs_desc.type = 0xb; + cs_desc.s = cs_desc.g = cs_desc.p = 1; + ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS); + } + + /* For the 64-bit case, this will clear EFER.LMA. */ cr0 = ctxt->ops->get_cr(ctxt, 0); if (cr0 & X86_CR0_PE) ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); - cr4 = ctxt->ops->get_cr(ctxt, 4); + + /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */ if (cr4 & X86_CR4_PAE) ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); + + /* And finally go back to 32-bit mode. */ efer = 0; ctxt->ops->set_msr(ctxt, MSR_EFER, efer); @@ -4455,7 +4474,7 @@ static const struct opcode twobyte_table[256] = { F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, /* 0xA8 - 0xAF */ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), - II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm), + II(EmulateOnUD | ImplicitOps, em_rsm, rsm), F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a8160d2ae362..c58ba67175ac 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -23,13 +23,665 @@ #include "x86.h" #include "lapic.h" +#include "ioapic.h" #include "hyperv.h" #include <linux/kvm_host.h> +#include <linux/highmem.h> +#include <asm/apicdef.h> #include <trace/events/kvm.h> #include "trace.h" +static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint) +{ + return atomic64_read(&synic->sint[sint]); +} + +static inline int synic_get_sint_vector(u64 sint_value) +{ + if (sint_value & HV_SYNIC_SINT_MASKED) + return -1; + return sint_value & HV_SYNIC_SINT_VECTOR_MASK; +} + +static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic, + int vector) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { + if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) + return true; + } + return false; +} + +static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic, + int vector) +{ + int i; + u64 sint_value; + + for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { + sint_value = synic_read_sint(synic, i); + if (synic_get_sint_vector(sint_value) == vector && + sint_value & HV_SYNIC_SINT_AUTO_EOI) + return true; + } + return false; +} + +static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, + u64 data, bool host) +{ + int vector; + + vector = data & HV_SYNIC_SINT_VECTOR_MASK; + if (vector < 16 && !host) + return 1; + /* + * Guest may configure multiple SINTs to use the same vector, so + * we maintain a bitmap of vectors handled by synic, and a + * bitmap of vectors with auto-eoi behavior. The bitmaps are + * updated here, and atomically queried on fast paths. + */ + + atomic64_set(&synic->sint[sint], data); + + if (synic_has_vector_connected(synic, vector)) + __set_bit(vector, synic->vec_bitmap); + else + __clear_bit(vector, synic->vec_bitmap); + + if (synic_has_vector_auto_eoi(synic, vector)) + __set_bit(vector, synic->auto_eoi_bitmap); + else + __clear_bit(vector, synic->auto_eoi_bitmap); + + /* Load SynIC vectors into EOI exit bitmap */ + kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic)); + return 0; +} + +static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vcpu_id) +{ + struct kvm_vcpu *vcpu; + struct kvm_vcpu_hv_synic *synic; + + if (vcpu_id >= atomic_read(&kvm->online_vcpus)) + return NULL; + vcpu = kvm_get_vcpu(kvm, vcpu_id); + if (!vcpu) + return NULL; + synic = vcpu_to_synic(vcpu); + return (synic->active) ? synic : NULL; +} + +static void synic_clear_sint_msg_pending(struct kvm_vcpu_hv_synic *synic, + u32 sint) +{ + struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct page *page; + gpa_t gpa; + struct hv_message *msg; + struct hv_message_page *msg_page; + + gpa = synic->msg_page & PAGE_MASK; + page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); + if (is_error_page(page)) { + vcpu_err(vcpu, "Hyper-V SynIC can't get msg page, gpa 0x%llx\n", + gpa); + return; + } + msg_page = kmap_atomic(page); + + msg = &msg_page->sint_message[sint]; + msg->header.message_flags.msg_pending = 0; + + kunmap_atomic(msg_page); + kvm_release_page_dirty(page); + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); +} + +static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + struct kvm_vcpu_hv_stimer *stimer; + int gsi, idx, stimers_pending; + + trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint); + + if (synic->msg_page & HV_SYNIC_SIMP_ENABLE) + synic_clear_sint_msg_pending(synic, sint); + + /* Try to deliver pending Hyper-V SynIC timers messages */ + stimers_pending = 0; + for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) { + stimer = &hv_vcpu->stimer[idx]; + if (stimer->msg_pending && + (stimer->config & HV_STIMER_ENABLE) && + HV_STIMER_SINT(stimer->config) == sint) { + set_bit(stimer->index, + hv_vcpu->stimer_pending_bitmap); + stimers_pending++; + } + } + if (stimers_pending) + kvm_make_request(KVM_REQ_HV_STIMER, vcpu); + + idx = srcu_read_lock(&kvm->irq_srcu); + gsi = atomic_read(&synic->sint_to_gsi[sint]); + if (gsi != -1) + kvm_notify_acked_gsi(kvm, gsi); + srcu_read_unlock(&kvm->irq_srcu, idx); +} + +static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr) +{ + struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + + hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC; + hv_vcpu->exit.u.synic.msr = msr; + hv_vcpu->exit.u.synic.control = synic->control; + hv_vcpu->exit.u.synic.evt_page = synic->evt_page; + hv_vcpu->exit.u.synic.msg_page = synic->msg_page; + + kvm_make_request(KVM_REQ_HV_EXIT, vcpu); +} + +static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, + u32 msr, u64 data, bool host) +{ + struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + int ret; + + if (!synic->active) + return 1; + + trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); + + ret = 0; + switch (msr) { + case HV_X64_MSR_SCONTROL: + synic->control = data; + if (!host) + synic_exit(synic, msr); + break; + case HV_X64_MSR_SVERSION: + if (!host) { + ret = 1; + break; + } + synic->version = data; + break; + case HV_X64_MSR_SIEFP: + if (data & HV_SYNIC_SIEFP_ENABLE) + if (kvm_clear_guest(vcpu->kvm, + data & PAGE_MASK, PAGE_SIZE)) { + ret = 1; + break; + } + synic->evt_page = data; + if (!host) + synic_exit(synic, msr); + break; + case HV_X64_MSR_SIMP: + if (data & HV_SYNIC_SIMP_ENABLE) + if (kvm_clear_guest(vcpu->kvm, + data & PAGE_MASK, PAGE_SIZE)) { + ret = 1; + break; + } + synic->msg_page = data; + if (!host) + synic_exit(synic, msr); + break; + case HV_X64_MSR_EOM: { + int i; + + for (i = 0; i < ARRAY_SIZE(synic->sint); i++) + kvm_hv_notify_acked_sint(vcpu, i); + break; + } + case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: + ret = synic_set_sint(synic, msr - HV_X64_MSR_SINT0, data, host); + break; + default: + ret = 1; + break; + } + return ret; +} + +static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata) +{ + int ret; + + if (!synic->active) + return 1; + + ret = 0; + switch (msr) { + case HV_X64_MSR_SCONTROL: + *pdata = synic->control; + break; + case HV_X64_MSR_SVERSION: + *pdata = synic->version; + break; + case HV_X64_MSR_SIEFP: + *pdata = synic->evt_page; + break; + case HV_X64_MSR_SIMP: + *pdata = synic->msg_page; + break; + case HV_X64_MSR_EOM: + *pdata = 0; + break; + case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: + *pdata = atomic64_read(&synic->sint[msr - HV_X64_MSR_SINT0]); + break; + default: + ret = 1; + break; + } + return ret; +} + +int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) +{ + struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct kvm_lapic_irq irq; + int ret, vector; + + if (sint >= ARRAY_SIZE(synic->sint)) + return -EINVAL; + + vector = synic_get_sint_vector(synic_read_sint(synic, sint)); + if (vector < 0) + return -ENOENT; + + memset(&irq, 0, sizeof(irq)); + irq.dest_id = kvm_apic_id(vcpu->arch.apic); + irq.dest_mode = APIC_DEST_PHYSICAL; + irq.delivery_mode = APIC_DM_FIXED; + irq.vector = vector; + irq.level = 1; + + ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL); + trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret); + return ret; +} + +int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint) +{ + struct kvm_vcpu_hv_synic *synic; + + synic = synic_get(kvm, vcpu_id); + if (!synic) + return -EINVAL; + + return synic_set_irq(synic, sint); +} + +void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) +{ + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + int i; + + trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector); + + for (i = 0; i < ARRAY_SIZE(synic->sint); i++) + if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) + kvm_hv_notify_acked_sint(vcpu, i); +} + +static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vcpu_id, u32 sint, int gsi) +{ + struct kvm_vcpu_hv_synic *synic; + + synic = synic_get(kvm, vcpu_id); + if (!synic) + return -EINVAL; + + if (sint >= ARRAY_SIZE(synic->sint_to_gsi)) + return -EINVAL; + + atomic_set(&synic->sint_to_gsi[sint], gsi); + return 0; +} + +void kvm_hv_irq_routing_update(struct kvm *kvm) +{ + struct kvm_irq_routing_table *irq_rt; + struct kvm_kernel_irq_routing_entry *e; + u32 gsi; + + irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu, + lockdep_is_held(&kvm->irq_lock)); + + for (gsi = 0; gsi < irq_rt->nr_rt_entries; gsi++) { + hlist_for_each_entry(e, &irq_rt->map[gsi], link) { + if (e->type == KVM_IRQ_ROUTING_HV_SINT) + kvm_hv_set_sint_gsi(kvm, e->hv_sint.vcpu, + e->hv_sint.sint, gsi); + } + } +} + +static void synic_init(struct kvm_vcpu_hv_synic *synic) +{ + int i; + + memset(synic, 0, sizeof(*synic)); + synic->version = HV_SYNIC_VERSION_1; + for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { + atomic64_set(&synic->sint[i], HV_SYNIC_SINT_MASKED); + atomic_set(&synic->sint_to_gsi[i], -1); + } +} + +static u64 get_time_ref_counter(struct kvm *kvm) +{ + return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); +} + +static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, + bool vcpu_kick) +{ + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + + set_bit(stimer->index, + vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap); + kvm_make_request(KVM_REQ_HV_STIMER, vcpu); + if (vcpu_kick) + kvm_vcpu_kick(vcpu); +} + +static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer) +{ + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + + trace_kvm_hv_stimer_cleanup(stimer_to_vcpu(stimer)->vcpu_id, + stimer->index); + + hrtimer_cancel(&stimer->timer); + clear_bit(stimer->index, + vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap); + stimer->msg_pending = false; + stimer->exp_time = 0; +} + +static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer) +{ + struct kvm_vcpu_hv_stimer *stimer; + + stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer); + trace_kvm_hv_stimer_callback(stimer_to_vcpu(stimer)->vcpu_id, + stimer->index); + stimer_mark_pending(stimer, true); + + return HRTIMER_NORESTART; +} + +/* + * stimer_start() assumptions: + * a) stimer->count is not equal to 0 + * b) stimer->config has HV_STIMER_ENABLE flag + */ +static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) +{ + u64 time_now; + ktime_t ktime_now; + + time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm); + ktime_now = ktime_get(); + + if (stimer->config & HV_STIMER_PERIODIC) { + if (stimer->exp_time) { + if (time_now >= stimer->exp_time) { + u64 remainder; + + div64_u64_rem(time_now - stimer->exp_time, + stimer->count, &remainder); + stimer->exp_time = + time_now + (stimer->count - remainder); + } + } else + stimer->exp_time = time_now + stimer->count; + + trace_kvm_hv_stimer_start_periodic( + stimer_to_vcpu(stimer)->vcpu_id, + stimer->index, + time_now, stimer->exp_time); + + hrtimer_start(&stimer->timer, + ktime_add_ns(ktime_now, + 100 * (stimer->exp_time - time_now)), + HRTIMER_MODE_ABS); + return 0; + } + stimer->exp_time = stimer->count; + if (time_now >= stimer->count) { + /* + * Expire timer according to Hypervisor Top-Level Functional + * specification v4(15.3.1): + * "If a one shot is enabled and the specified count is in + * the past, it will expire immediately." + */ + stimer_mark_pending(stimer, false); + return 0; + } + + trace_kvm_hv_stimer_start_one_shot(stimer_to_vcpu(stimer)->vcpu_id, + stimer->index, + time_now, stimer->count); + + hrtimer_start(&stimer->timer, + ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)), + HRTIMER_MODE_ABS); + return 0; +} + +static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, + bool host) +{ + trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, + stimer->index, config, host); + + stimer_cleanup(stimer); + if ((stimer->config & HV_STIMER_ENABLE) && HV_STIMER_SINT(config) == 0) + config &= ~HV_STIMER_ENABLE; + stimer->config = config; + stimer_mark_pending(stimer, false); + return 0; +} + +static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, + bool host) +{ + trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id, + stimer->index, count, host); + + stimer_cleanup(stimer); + stimer->count = count; + if (stimer->count == 0) + stimer->config &= ~HV_STIMER_ENABLE; + else if (stimer->config & HV_STIMER_AUTOENABLE) + stimer->config |= HV_STIMER_ENABLE; + stimer_mark_pending(stimer, false); + return 0; +} + +static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig) +{ + *pconfig = stimer->config; + return 0; +} + +static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount) +{ + *pcount = stimer->count; + return 0; +} + +static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, + struct hv_message *src_msg) +{ + struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct page *page; + gpa_t gpa; + struct hv_message *dst_msg; + int r; + struct hv_message_page *msg_page; + + if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE)) + return -ENOENT; + + gpa = synic->msg_page & PAGE_MASK; + page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); + if (is_error_page(page)) + return -EFAULT; + + msg_page = kmap_atomic(page); + dst_msg = &msg_page->sint_message[sint]; + if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE, + src_msg->header.message_type) != HVMSG_NONE) { + dst_msg->header.message_flags.msg_pending = 1; + r = -EAGAIN; + } else { + memcpy(&dst_msg->u.payload, &src_msg->u.payload, + src_msg->header.payload_size); + dst_msg->header.message_type = src_msg->header.message_type; + dst_msg->header.payload_size = src_msg->header.payload_size; + r = synic_set_irq(synic, sint); + if (r >= 1) + r = 0; + else if (r == 0) + r = -EFAULT; + } + kunmap_atomic(msg_page); + kvm_release_page_dirty(page); + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); + return r; +} + +static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) +{ + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct hv_message *msg = &stimer->msg; + struct hv_timer_message_payload *payload = + (struct hv_timer_message_payload *)&msg->u.payload; + + payload->expiration_time = stimer->exp_time; + payload->delivery_time = get_time_ref_counter(vcpu->kvm); + return synic_deliver_msg(vcpu_to_synic(vcpu), + HV_STIMER_SINT(stimer->config), msg); +} + +static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) +{ + int r; + + stimer->msg_pending = true; + r = stimer_send_msg(stimer); + trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id, + stimer->index, r); + if (!r) { + stimer->msg_pending = false; + if (!(stimer->config & HV_STIMER_PERIODIC)) + stimer->config &= ~HV_STIMER_ENABLE; + } +} + +void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + struct kvm_vcpu_hv_stimer *stimer; + u64 time_now, exp_time; + int i; + + for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) + if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { + stimer = &hv_vcpu->stimer[i]; + if (stimer->config & HV_STIMER_ENABLE) { + exp_time = stimer->exp_time; + + if (exp_time) { + time_now = + get_time_ref_counter(vcpu->kvm); + if (time_now >= exp_time) + stimer_expiration(stimer); + } + + if ((stimer->config & HV_STIMER_ENABLE) && + stimer->count) + stimer_start(stimer); + else + stimer_cleanup(stimer); + } + } +} + +void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + int i; + + for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) + stimer_cleanup(&hv_vcpu->stimer[i]); +} + +static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer) +{ + struct hv_message *msg = &stimer->msg; + struct hv_timer_message_payload *payload = + (struct hv_timer_message_payload *)&msg->u.payload; + + memset(&msg->header, 0, sizeof(msg->header)); + msg->header.message_type = HVMSG_TIMER_EXPIRED; + msg->header.payload_size = sizeof(*payload); + + payload->timer_index = stimer->index; + payload->expiration_time = 0; + payload->delivery_time = 0; +} + +static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index) +{ + memset(stimer, 0, sizeof(*stimer)); + stimer->index = timer_index; + hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + stimer->timer.function = stimer_timer_callback; + stimer_prepare_msg(stimer); +} + +void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + int i; + + synic_init(&hv_vcpu->synic); + + bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); + for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) + stimer_init(&hv_vcpu->stimer[i], i); +} + +int kvm_hv_activate_synic(struct kvm_vcpu *vcpu) +{ + /* + * Hyper-V SynIC auto EOI SINT's are + * not compatible with APICV, so deactivate APICV + */ + kvm_vcpu_deactivate_apicv(vcpu); + vcpu_to_synic(vcpu)->active = true; + return 0; +} + static bool kvm_hv_msr_partition_wide(u32 msr) { bool r = false; @@ -41,6 +693,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr) case HV_X64_MSR_TIME_REF_COUNT: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_RESET: r = true; break; } @@ -163,6 +816,12 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, data); case HV_X64_MSR_CRASH_CTL: return kvm_hv_msr_set_crash_ctl(vcpu, data, host); + case HV_X64_MSR_RESET: + if (data == 1) { + vcpu_debug(vcpu, "hyper-v reset requested\n"); + kvm_make_request(KVM_REQ_HV_RESET, vcpu); + } + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -171,7 +830,16 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, return 0; } -static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +/* Calculate cpu time spent by current task in 100ns units */ +static u64 current_task_runtime_100ns(void) +{ + cputime_t utime, stime; + + task_cputime_adjusted(current, &utime, &stime); + return div_u64(cputime_to_nsecs(utime + stime), 100); +} + +static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; @@ -205,6 +873,36 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + case HV_X64_MSR_VP_RUNTIME: + if (!host) + return 1; + hv->runtime_offset = data - current_task_runtime_100ns(); + break; + case HV_X64_MSR_SCONTROL: + case HV_X64_MSR_SVERSION: + case HV_X64_MSR_SIEFP: + case HV_X64_MSR_SIMP: + case HV_X64_MSR_EOM: + case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: + return synic_set_msr(vcpu_to_synic(vcpu), msr, data, host); + case HV_X64_MSR_STIMER0_CONFIG: + case HV_X64_MSR_STIMER1_CONFIG: + case HV_X64_MSR_STIMER2_CONFIG: + case HV_X64_MSR_STIMER3_CONFIG: { + int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; + + return stimer_set_config(vcpu_to_stimer(vcpu, timer_index), + data, host); + } + case HV_X64_MSR_STIMER0_COUNT: + case HV_X64_MSR_STIMER1_COUNT: + case HV_X64_MSR_STIMER2_COUNT: + case HV_X64_MSR_STIMER3_COUNT: { + int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; + + return stimer_set_count(vcpu_to_stimer(vcpu, timer_index), + data, host); + } default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -227,11 +925,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_HYPERCALL: data = hv->hv_hypercall; break; - case HV_X64_MSR_TIME_REF_COUNT: { - data = - div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); + case HV_X64_MSR_TIME_REF_COUNT: + data = get_time_ref_counter(kvm); break; - } case HV_X64_MSR_REFERENCE_TSC: data = hv->hv_tsc_page; break; @@ -241,6 +937,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) pdata); case HV_X64_MSR_CRASH_CTL: return kvm_hv_msr_get_crash_ctl(vcpu, pdata); + case HV_X64_MSR_RESET: + data = 0; + break; default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -277,6 +976,34 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_APIC_ASSIST_PAGE: data = hv->hv_vapic; break; + case HV_X64_MSR_VP_RUNTIME: + data = current_task_runtime_100ns() + hv->runtime_offset; + break; + case HV_X64_MSR_SCONTROL: + case HV_X64_MSR_SVERSION: + case HV_X64_MSR_SIEFP: + case HV_X64_MSR_SIMP: + case HV_X64_MSR_EOM: + case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: + return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata); + case HV_X64_MSR_STIMER0_CONFIG: + case HV_X64_MSR_STIMER1_CONFIG: + case HV_X64_MSR_STIMER2_CONFIG: + case HV_X64_MSR_STIMER3_CONFIG: { + int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; + + return stimer_get_config(vcpu_to_stimer(vcpu, timer_index), + pdata); + } + case HV_X64_MSR_STIMER0_COUNT: + case HV_X64_MSR_STIMER1_COUNT: + case HV_X64_MSR_STIMER2_COUNT: + case HV_X64_MSR_STIMER3_COUNT: { + int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; + + return stimer_get_count(vcpu_to_stimer(vcpu, timer_index), + pdata); + } default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -295,7 +1022,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) mutex_unlock(&vcpu->kvm->lock); return r; } else - return kvm_hv_set_msr(vcpu, msr, data); + return kvm_hv_set_msr(vcpu, msr, data, host); } int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index c7bce559f67b..60eccd4bd1d3 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -24,9 +24,64 @@ #ifndef __ARCH_X86_KVM_HYPERV_H__ #define __ARCH_X86_KVM_HYPERV_H__ +static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu) +{ + return &vcpu->arch.hyperv; +} + +static inline struct kvm_vcpu *hv_vcpu_to_vcpu(struct kvm_vcpu_hv *hv_vcpu) +{ + struct kvm_vcpu_arch *arch; + + arch = container_of(hv_vcpu, struct kvm_vcpu_arch, hyperv); + return container_of(arch, struct kvm_vcpu, arch); +} + +static inline struct kvm_vcpu_hv_synic *vcpu_to_synic(struct kvm_vcpu *vcpu) +{ + return &vcpu->arch.hyperv.synic; +} + +static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) +{ + return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic)); +} + int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); + bool kvm_hv_hypercall_enabled(struct kvm *kvm); int kvm_hv_hypercall(struct kvm_vcpu *vcpu); +void kvm_hv_irq_routing_update(struct kvm *kvm); +int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); +void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); +int kvm_hv_activate_synic(struct kvm_vcpu *vcpu); + +void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); + +static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu, + int timer_index) +{ + return &vcpu_to_hv_vcpu(vcpu)->stimer[timer_index]; +} + +static inline struct kvm_vcpu *stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer) +{ + struct kvm_vcpu_hv *hv_vcpu; + + hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv, + stimer[0]); + return hv_vcpu_to_vcpu(hv_vcpu); +} + +static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu) +{ + return !bitmap_empty(vcpu->arch.hyperv.stimer_pending_bitmap, + HV_SYNIC_STIMER_COUNT); +} + +void kvm_hv_process_stimers(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index f90952f64e79..b0ea42b78ccd 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -35,6 +35,7 @@ #include <linux/kvm_host.h> #include <linux/slab.h> +#include "ioapic.h" #include "irq.h" #include "i8254.h" #include "x86.h" @@ -333,7 +334,8 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; s64 interval; - if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) + if (!ioapic_in_kernel(kvm) || + ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) return; interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); @@ -418,6 +420,7 @@ void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_s u8 saved_mode; if (hpet_legacy_start) { /* save existing mode for later reenablement */ + WARN_ON(channel != 0); saved_mode = kvm->arch.vpit->pit_state.channels[0].mode; kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */ pit_load_count(kvm, channel, val); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 856f79105bb5..1facfd60b04a 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -233,21 +233,7 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) } -static void update_handled_vectors(struct kvm_ioapic *ioapic) -{ - DECLARE_BITMAP(handled_vectors, 256); - int i; - - memset(handled_vectors, 0, sizeof(handled_vectors)); - for (i = 0; i < IOAPIC_NUM_PINS; ++i) - __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors); - memcpy(ioapic->handled_vectors, handled_vectors, - sizeof(handled_vectors)); - smp_wmb(); -} - -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, - u32 *tmr) +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; union kvm_ioapic_redirect_entry *e; @@ -260,13 +246,11 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || index == RTC_GSI) { if (kvm_apic_match_dest(vcpu, NULL, 0, - e->fields.dest_id, e->fields.dest_mode)) { + e->fields.dest_id, e->fields.dest_mode) || + (e->fields.trig_mode == IOAPIC_EDGE_TRIG && + kvm_apic_pending_eoi(vcpu, e->fields.vector))) __set_bit(e->fields.vector, - (unsigned long *)eoi_exit_bitmap); - if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG) - __set_bit(e->fields.vector, - (unsigned long *)tmr); - } + ioapic_handled_vectors); } } spin_unlock(&ioapic->lock); @@ -315,7 +299,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) e->bits |= (u32) val; e->fields.remote_irr = 0; } - update_handled_vectors(ioapic); mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); @@ -599,7 +582,6 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) ioapic->id = 0; memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); rtc_irq_eoi_tracking_reset(ioapic); - update_handled_vectors(ioapic); } static const struct kvm_io_device_ops ioapic_mmio_ops = { @@ -628,8 +610,10 @@ int kvm_ioapic_init(struct kvm *kvm) if (ret < 0) { kvm->arch.vioapic = NULL; kfree(ioapic); + return ret; } + kvm_vcpu_request_scan_ioapic(kvm); return ret; } @@ -666,7 +650,6 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); ioapic->irr = 0; ioapic->irr_delivered = 0; - update_handled_vectors(ioapic); kvm_vcpu_request_scan_ioapic(kvm); kvm_ioapic_inject_all(ioapic, state->irr); spin_unlock(&ioapic->lock); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index ca0b0b4e6256..2d16dc251d81 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -9,6 +9,7 @@ struct kvm; struct kvm_vcpu; #define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS +#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES #define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ #define IOAPIC_EDGE_TRIG 0 #define IOAPIC_LEVEL_TRIG 1 @@ -73,7 +74,6 @@ struct kvm_ioapic { struct kvm *kvm; void (*ack_notifier)(void *opaque, int irq); spinlock_t lock; - DECLARE_BITMAP(handled_vectors, 256); struct rtc_status rtc_status; struct delayed_work eoi_inject; u32 irq_eoi[IOAPIC_NUM_PINS]; @@ -98,11 +98,12 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) return kvm->arch.vioapic; } -static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) +static inline int ioapic_in_kernel(struct kvm *kvm) { - struct kvm_ioapic *ioapic = kvm->arch.vioapic; - smp_rmb(); - return test_bit(vector, ioapic->handled_vectors); + int ret; + + ret = (ioapic_irqchip(kvm) != NULL); + return ret; } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); @@ -120,7 +121,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, unsigned long *dest_map); int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, - u32 *tmr); - +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, + ulong *ioapic_handled_vectors); +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, + ulong *ioapic_handled_vectors); #endif diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c index 5c520ebf6343..a22a488b4622 100644 --- a/arch/x86/kvm/iommu.c +++ b/arch/x86/kvm/iommu.c @@ -43,11 +43,11 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm); static void kvm_iommu_put_pages(struct kvm *kvm, gfn_t base_gfn, unsigned long npages); -static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, +static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, unsigned long npages) { gfn_t end_gfn; - pfn_t pfn; + kvm_pfn_t pfn; pfn = gfn_to_pfn_memslot(slot, gfn); end_gfn = gfn + npages; @@ -62,7 +62,8 @@ static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, return pfn; } -static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages) +static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn, + unsigned long npages) { unsigned long i; @@ -73,7 +74,7 @@ static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages) int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) { gfn_t gfn, end_gfn; - pfn_t pfn; + kvm_pfn_t pfn; int r = 0; struct iommu_domain *domain = kvm->arch.iommu_domain; int flags; @@ -275,7 +276,7 @@ static void kvm_iommu_put_pages(struct kvm *kvm, { struct iommu_domain *domain; gfn_t end_gfn, gfn; - pfn_t pfn; + kvm_pfn_t pfn; u64 phys; domain = kvm->arch.iommu_domain; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index a1ec6a50a05a..3982b479bb5f 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -38,14 +38,27 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) EXPORT_SYMBOL(kvm_cpu_has_pending_timer); /* + * check if there is a pending userspace external interrupt + */ +static int pending_userspace_extint(struct kvm_vcpu *v) +{ + return v->arch.pending_external_vector != -1; +} + +/* * check if there is pending interrupt from * non-APIC source without intack. */ static int kvm_cpu_has_extint(struct kvm_vcpu *v) { - if (kvm_apic_accept_pic_intr(v)) - return pic_irqchip(v->kvm)->output; /* PIC */ - else + u8 accept = kvm_apic_accept_pic_intr(v); + + if (accept) { + if (irqchip_split(v->kvm)) + return pending_userspace_extint(v); + else + return pic_irqchip(v->kvm)->output; + } else return 0; } @@ -57,13 +70,13 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v) */ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) { - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.pending; if (kvm_cpu_has_extint(v)) return 1; - if (kvm_apic_vid_enabled(v->kvm)) + if (kvm_vcpu_apicv_active(v)) return 0; return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ @@ -75,7 +88,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) */ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.pending; if (kvm_cpu_has_extint(v)) @@ -91,9 +104,16 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); */ static int kvm_cpu_get_extint(struct kvm_vcpu *v) { - if (kvm_cpu_has_extint(v)) - return kvm_pic_read_irq(v->kvm); /* PIC */ - return -1; + if (kvm_cpu_has_extint(v)) { + if (irqchip_split(v->kvm)) { + int vector = v->arch.pending_external_vector; + + v->arch.pending_external_vector = -1; + return vector; + } else + return kvm_pic_read_irq(v->kvm); /* PIC */ + } else + return -1; } /* @@ -103,7 +123,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) { int vector; - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.nr; vector = kvm_cpu_get_extint(v); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 3d782a2c336a..ae5c78f2337d 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -83,13 +83,38 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) return kvm->arch.vpic; } +static inline int pic_in_kernel(struct kvm *kvm) +{ + int ret; + + ret = (pic_irqchip(kvm) != NULL); + return ret; +} + +static inline int irqchip_split(struct kvm *kvm) +{ + return kvm->arch.irqchip_split; +} + static inline int irqchip_in_kernel(struct kvm *kvm) { struct kvm_pic *vpic = pic_irqchip(kvm); + bool ret; + + ret = (vpic != NULL); + ret |= irqchip_split(kvm); /* Read vpic before kvm->irq_routing. */ smp_rmb(); - return vpic != NULL; + return ret; +} + +static inline int lapic_in_kernel(struct kvm_vcpu *vcpu) +{ + /* Same as irqchip_in_kernel(vcpu->kvm), but with less + * pointer chasing and no unnecessary memory barriers. + */ + return vcpu->arch.apic != NULL; } void kvm_pic_reset(struct kvm_kpic_state *s); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 9efff9e5b58c..8fc89efb5250 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -33,6 +33,8 @@ #include "lapic.h" +#include "hyperv.h" + static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -91,8 +93,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm_lapic_irq *irq) +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq) { trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); @@ -108,6 +110,7 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, irq->level = 1; irq->shorthand = 0; } +EXPORT_SYMBOL_GPL(kvm_set_msi_irq); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -123,12 +126,16 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, } -static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm) +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) { struct kvm_lapic_irq irq; int r; + if (unlikely(e->type != KVM_IRQ_ROUTING_MSI)) + return -EWOULDBLOCK; + kvm_set_msi_irq(e, &irq); if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) @@ -137,42 +144,6 @@ static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, return -EWOULDBLOCK; } -/* - * Deliver an IRQ in an atomic context if we can, or return a failure, - * user can retry in a process context. - * Return value: - * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. - * Other values - No need to retry. - */ -int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) -{ - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - struct kvm_kernel_irq_routing_entry *e; - int ret = -EINVAL; - int idx; - - trace_kvm_set_irq(irq, level, irq_source_id); - - /* - * Injection into either PIC or IOAPIC might need to scan all CPUs, - * which would need to be retried from thread context; when same GSI - * is connected to both PIC and IOAPIC, we'd have to report a - * partial failure here. - * Since there's no easy way to do this, we only support injecting MSI - * which is limited to 1:1 GSI mapping. - */ - idx = srcu_read_lock(&kvm->irq_srcu); - if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { - e = &entries[0]; - if (likely(e->type == KVM_IRQ_ROUTING_MSI)) - ret = kvm_set_msi_inatomic(e, kvm); - else - ret = -EWOULDBLOCK; - } - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - int kvm_request_irq_source_id(struct kvm *kvm) { unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; @@ -208,7 +179,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) goto unlock; } clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!irqchip_in_kernel(kvm)) + if (!ioapic_in_kernel(kvm)) goto unlock; kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); @@ -250,6 +221,16 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, srcu_read_unlock(&kvm->irq_srcu, idx); } +static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + if (!level) + return -1; + + return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint); +} + int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { @@ -288,6 +269,11 @@ int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, e->msi.address_hi = ue->u.msi.address_hi; e->msi.data = ue->u.msi.data; break; + case KVM_IRQ_ROUTING_HV_SINT: + e->set = kvm_hv_set_sint; + e->hv_sint.vcpu = ue->u.hv_sint.vcpu; + e->hv_sint.sint = ue->u.hv_sint.sint; + break; default: goto out; } @@ -297,6 +283,33 @@ out: return r; } +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + int i, r = 0; + struct kvm_vcpu *vcpu; + + if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) + return true; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (++r == 2) + return false; + + *dest_vcpu = vcpu; + } + + return r == 1; +} +EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); + #define IOAPIC_ROUTING_ENTRY(irq) \ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } @@ -328,3 +341,72 @@ int kvm_setup_default_irq_routing(struct kvm *kvm) return kvm_set_irq_routing(kvm, default_routing, ARRAY_SIZE(default_routing), 0); } + +static const struct kvm_irq_routing_entry empty_routing[] = {}; + +int kvm_setup_empty_irq_routing(struct kvm *kvm) +{ + return kvm_set_irq_routing(kvm, empty_routing, 0, 0); +} + +void kvm_arch_post_irq_routing_update(struct kvm *kvm) +{ + if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) + return; + kvm_make_scan_ioapic_request(kvm); +} + +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, + ulong *ioapic_handled_vectors) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_kernel_irq_routing_entry *entry; + struct kvm_irq_routing_table *table; + u32 i, nr_ioapic_pins; + int idx; + + /* kvm->irq_routing must be read after clearing + * KVM_SCAN_IOAPIC. */ + smp_mb(); + idx = srcu_read_lock(&kvm->irq_srcu); + table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + nr_ioapic_pins = min_t(u32, table->nr_rt_entries, + kvm->arch.nr_reserved_ioapic_pins); + for (i = 0; i < nr_ioapic_pins; ++i) { + hlist_for_each_entry(entry, &table->map[i], link) { + u32 dest_id, dest_mode; + bool level; + + if (entry->type != KVM_IRQ_ROUTING_MSI) + continue; + dest_id = (entry->msi.address_lo >> 12) & 0xff; + dest_mode = (entry->msi.address_lo >> 2) & 0x1; + level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL; + if (level && kvm_apic_match_dest(vcpu, NULL, 0, + dest_id, dest_mode)) { + u32 vector = entry->msi.data & 0xff; + + __set_bit(vector, + ioapic_handled_vectors); + } + } + } + srcu_read_unlock(&kvm->irq_srcu, idx); +} + +int kvm_arch_set_irq(struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm, + int irq_source_id, int level, bool line_status) +{ + switch (irq->type) { + case KVM_IRQ_ROUTING_HV_SINT: + return kvm_hv_set_sint(irq, kvm, irq_source_id, level, + line_status); + default: + return -EWOULDBLOCK; + } +} + +void kvm_arch_irq_routing_update(struct kvm *kvm) +{ + kvm_hv_irq_routing_update(kvm); +} diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8d9013c5e1ee..36591faed13b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -41,6 +41,7 @@ #include "trace.h" #include "x86.h" #include "cpuid.h" +#include "hyperv.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) @@ -128,11 +129,6 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) -static inline int kvm_apic_id(struct kvm_lapic *apic) -{ - return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; -} - /* The logical map is definitely wrong if we have multiple * modes at the same time. (Physical map is always right.) */ @@ -209,7 +205,7 @@ out: if (old) kfree_rcu(old, rcu); - kvm_vcpu_request_scan_ioapic(kvm); + kvm_make_scan_ioapic_request(kvm); } static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) @@ -348,6 +344,8 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) struct kvm_lapic *apic = vcpu->arch.apic; __kvm_apic_update_irr(pir, apic->regs); + + kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); @@ -377,7 +375,8 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) if (!apic->irr_pending) return -1; - kvm_x86_ops->sync_pir_to_irr(apic->vcpu); + if (apic->vcpu->arch.apicv_active) + kvm_x86_ops->sync_pir_to_irr(apic->vcpu); result = apic_search_irr(apic); ASSERT(result == -1 || result >= 16); @@ -390,7 +389,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) vcpu = apic->vcpu; - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) { + if (unlikely(vcpu->arch.apicv_active)) { /* try to update RVI */ apic_clear_vector(vec, apic->regs + APIC_IRR); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -416,7 +415,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * because the processor can modify ISR under the hood. Instead * just set SVI. */ - if (unlikely(kvm_x86_ops->hwapic_isr_update)) + if (unlikely(vcpu->arch.apicv_active)) kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec); else { ++apic->isr_count; @@ -464,7 +463,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * on the other hand isr_count and highest_isr_cache are unused * and must be left alone. */ - if (unlikely(kvm_x86_ops->hwapic_isr_update)) + if (unlikely(vcpu->arch.apicv_active)) kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); else { @@ -551,15 +550,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - int i; - - for (i = 0; i < 8; i++) - apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]); -} - static void apic_update_ppr(struct kvm_lapic *apic) { u32 tpr, isrv, ppr, old_ppr; @@ -764,6 +754,65 @@ out: return ret; } +bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + struct kvm_apic_map *map; + bool ret = false; + struct kvm_lapic *dst = NULL; + + if (irq->shorthand) + return false; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + if (!map) + goto out; + + if (irq->dest_mode == APIC_DEST_PHYSICAL) { + if (irq->dest_id == 0xFF) + goto out; + + if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) + goto out; + + dst = map->phys_map[irq->dest_id]; + if (dst && kvm_apic_present(dst->vcpu)) + *dest_vcpu = dst->vcpu; + else + goto out; + } else { + u16 cid; + unsigned long bitmap = 1; + int i, r = 0; + + if (!kvm_apic_logical_map_valid(map)) + goto out; + + apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); + + if (cid >= ARRAY_SIZE(map->logical_map)) + goto out; + + for_each_set_bit(i, &bitmap, 16) { + dst = map->logical_map[cid][i]; + if (++r == 2) + goto out; + } + + if (dst && kvm_apic_present(dst->vcpu)) + *dest_vcpu = dst->vcpu; + else + goto out; + } + + ret = true; +out: + rcu_read_unlock(); + return ret; +} + /* * Add a pending IRQ into lapic. * Return 1 if successfully added and 0 if discarded. @@ -781,6 +830,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_LOWEST: vcpu->arch.apic_arb_prio++; case APIC_DM_FIXED: + if (unlikely(trig_mode && !level)) + break; + /* FIXME add logic for vcpu on reset */ if (unlikely(!apic_enabled(apic))) break; @@ -790,7 +842,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); - if (kvm_x86_ops->deliver_posted_interrupt) + if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { + if (trig_mode) + apic_set_vector(vector, apic->regs + APIC_TMR); + else + apic_clear_vector(vector, apic->regs + APIC_TMR); + } + + if (vcpu->arch.apicv_active) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); else { apic_set_irr(vector, apic); @@ -868,16 +927,32 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } +static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) +{ + return test_bit(vector, apic->vcpu->arch.ioapic_handled_vectors); +} + static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { - int trigger_mode; - if (apic_test_vector(vector, apic->regs + APIC_TMR)) - trigger_mode = IOAPIC_LEVEL_TRIG; - else - trigger_mode = IOAPIC_EDGE_TRIG; - kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); + int trigger_mode; + + /* Eoi the ioapic only if the ioapic doesn't own the vector. */ + if (!kvm_ioapic_handles_vector(apic, vector)) + return; + + /* Request a KVM exit to inform the userspace IOAPIC. */ + if (irqchip_split(apic->vcpu->kvm)) { + apic->vcpu->arch.pending_ioapic_eoi = vector; + kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu); + return; } + + if (apic_test_vector(vector, apic->regs + APIC_TMR)) + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; + + kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); } static int apic_set_eoi(struct kvm_lapic *apic) @@ -896,6 +971,9 @@ static int apic_set_eoi(struct kvm_lapic *apic) apic_clear_isr(vector, apic); apic_update_ppr(apic); + if (test_bit(vector, vcpu_to_synic(apic->vcpu)->vec_bitmap)) + kvm_hv_synic_send_eoi(apic->vcpu, vector); + kvm_ioapic_send_eoi(apic, vector); kvm_make_request(KVM_REQ_EVENT, apic->vcpu); return vector; @@ -1147,7 +1225,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) int vec = reg & APIC_VECTOR_MASK; void *bitmap = apic->regs + APIC_ISR; - if (kvm_x86_ops->deliver_posted_interrupt) + if (vcpu->arch.apicv_active) bitmap = apic->regs + APIC_IRR; if (apic_test_vector(vec, bitmap)) @@ -1172,7 +1250,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc()); + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ @@ -1240,7 +1318,7 @@ static void start_apic_timer(struct kvm_lapic *apic) local_irq_save(flags); now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc()); + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); @@ -1615,8 +1693,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } - apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm); - apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0; + apic->irr_pending = vcpu->arch.apicv_active; + apic->isr_count = vcpu->arch.apicv_active ? 1 : 0; apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); @@ -1805,6 +1883,12 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) apic_set_isr(vector, apic); apic_update_ppr(apic); apic_clear_irr(vector, apic); + + if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) { + apic_clear_isr(vector, apic); + apic_update_ppr(apic); + } + return vector; } @@ -1828,17 +1912,20 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; - apic->isr_count = kvm_x86_ops->hwapic_isr_update ? + apic->isr_count = vcpu->arch.apicv_active ? 1 : count_vectors(apic->regs + APIC_ISR); apic->highest_isr_cache = -1; - if (kvm_x86_ops->hwapic_irr_update) + if (vcpu->arch.apicv_active) { kvm_x86_ops->hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); - if (unlikely(kvm_x86_ops->hwapic_isr_update)) kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); + } kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_rtc_eoi_tracking_restore_one(vcpu); + if (ioapic_in_kernel(vcpu->kvm)) + kvm_rtc_eoi_tracking_restore_one(vcpu); + + vcpu->arch.apic_arb_prio = 0; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) @@ -1922,7 +2009,7 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, /* Cache not set: could be safe but we don't bother. */ apic->highest_isr_cache == -1 || /* Need EOI to update ioapic. */ - kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { + kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) { /* * PV EOI was disabled by apic_sync_pv_eoi_from_guest * so we need not do anything here. @@ -1978,7 +2065,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4; - if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; if (reg == APIC_ICR2) @@ -1995,7 +2082,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; - if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; if (reg == APIC_DFR || reg == APIC_ICR2) { diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 764037991d26..41bdb35b4b67 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -57,7 +57,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); -void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, @@ -144,9 +143,9 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) return apic->vcpu->arch.apic_base & X2APIC_ENABLE; } -static inline bool kvm_apic_vid_enabled(struct kvm *kvm) +static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu) { - return kvm_x86_ops->vm_has_apicv(kvm); + return vcpu->arch.apic && vcpu->arch.apicv_active; } static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) @@ -165,8 +164,15 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } +static inline int kvm_apic_id(struct kvm_lapic *apic) +{ + return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; +} + bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); +bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff606f507913..95a955de5964 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -259,7 +259,7 @@ static unsigned get_mmio_spte_access(u64 spte) } static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, - pfn_t pfn, unsigned access) + kvm_pfn_t pfn, unsigned access) { if (unlikely(is_noslot_pfn(pfn))) { mark_mmio_spte(vcpu, sptep, gfn, access); @@ -311,11 +311,6 @@ static int is_large_pte(u64 pte) return pte & PT_PAGE_SIZE_MASK; } -static int is_rmap_spte(u64 pte) -{ - return is_shadow_present_pte(pte); -} - static int is_last_spte(u64 pte, int level) { if (level == PT_PAGE_TABLE_LEVEL) @@ -325,7 +320,7 @@ static int is_last_spte(u64 pte, int level) return 0; } -static pfn_t spte_to_pfn(u64 pte) +static kvm_pfn_t spte_to_pfn(u64 pte) { return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; } @@ -540,7 +535,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) u64 old_spte = *sptep; bool ret = false; - WARN_ON(!is_rmap_spte(new_spte)); + WARN_ON(!is_shadow_present_pte(new_spte)); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); @@ -587,7 +582,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) */ static int mmu_spte_clear_track_bits(u64 *sptep) { - pfn_t pfn; + kvm_pfn_t pfn; u64 old_spte = *sptep; if (!spte_has_volatile_bits(old_spte)) @@ -595,7 +590,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep) else old_spte = __update_clear_spte_slow(sptep, 0ull); - if (!is_rmap_spte(old_spte)) + if (!is_shadow_present_pte(old_spte)) return 0; pfn = spte_to_pfn(old_spte); @@ -818,14 +813,11 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm->arch.indirect_shadow_pages--; } -static int has_wrprotected_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - int level) +static int __has_wrprotected_page(gfn_t gfn, int level, + struct kvm_memory_slot *slot) { - struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (slot) { linfo = lpage_info_slot(gfn, slot, level); return linfo->write_count; @@ -834,6 +826,14 @@ static int has_wrprotected_page(struct kvm_vcpu *vcpu, return 1; } +static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level) +{ + struct kvm_memory_slot *slot; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + return __has_wrprotected_page(gfn, level, slot); +} + static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { unsigned long page_size; @@ -851,6 +851,17 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } +static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, + bool no_dirty_log) +{ + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return false; + if (no_dirty_log && slot->dirty_bitmap) + return false; + + return true; +} + static struct kvm_memory_slot * gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) @@ -858,21 +869,25 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID || - (no_dirty_log && slot->dirty_bitmap)) + if (!memslot_valid_for_gpte(slot, no_dirty_log)) slot = NULL; return slot; } -static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) -{ - return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); -} - -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, + bool *force_pt_level) { int host_level, level, max_level; + struct kvm_memory_slot *slot; + + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); + *force_pt_level = !memslot_valid_for_gpte(slot, true); + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; host_level = host_mapping_level(vcpu->kvm, large_gfn); @@ -882,43 +897,42 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (has_wrprotected_page(vcpu, large_gfn, level)) + if (__has_wrprotected_page(large_gfn, level, slot)) break; return level - 1; } /* - * Pte mapping structures: - * - * If pte_list bit zero is zero, then pte_list point to the spte. + * About rmap_head encoding: * - * If pte_list bit zero is one, (then pte_list & ~1) points to a struct + * If the bit zero of rmap_head->val is clear, then it points to the only spte + * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct * pte_list_desc containing more mappings. - * - * Returns the number of pte entries before the spte was added or zero if - * the spte was not added. - * + */ + +/* + * Returns the number of pointers in the rmap chain, not counting the new one. */ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, - unsigned long *pte_list) + struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; int i, count = 0; - if (!*pte_list) { + if (!rmap_head->val) { rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte); - *pte_list = (unsigned long)spte; - } else if (!(*pte_list & 1)) { + rmap_head->val = (unsigned long)spte; + } else if (!(rmap_head->val & 1)) { rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte); desc = mmu_alloc_pte_list_desc(vcpu); - desc->sptes[0] = (u64 *)*pte_list; + desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; - *pte_list = (unsigned long)desc | 1; + rmap_head->val = (unsigned long)desc | 1; ++count; } else { rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte); - desc = (struct pte_list_desc *)(*pte_list & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); while (desc->sptes[PTE_LIST_EXT-1] && desc->more) { desc = desc->more; count += PTE_LIST_EXT; @@ -935,8 +949,9 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, } static void -pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, - int i, struct pte_list_desc *prev_desc) +pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, + struct pte_list_desc *desc, int i, + struct pte_list_desc *prev_desc) { int j; @@ -947,43 +962,43 @@ pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, if (j != 0) return; if (!prev_desc && !desc->more) - *pte_list = (unsigned long)desc->sptes[0]; + rmap_head->val = (unsigned long)desc->sptes[0]; else if (prev_desc) prev_desc->more = desc->more; else - *pte_list = (unsigned long)desc->more | 1; + rmap_head->val = (unsigned long)desc->more | 1; mmu_free_pte_list_desc(desc); } -static void pte_list_remove(u64 *spte, unsigned long *pte_list) +static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; struct pte_list_desc *prev_desc; int i; - if (!*pte_list) { + if (!rmap_head->val) { printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte); BUG(); - } else if (!(*pte_list & 1)) { + } else if (!(rmap_head->val & 1)) { rmap_printk("pte_list_remove: %p 1->0\n", spte); - if ((u64 *)*pte_list != spte) { + if ((u64 *)rmap_head->val != spte) { printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte); BUG(); } - *pte_list = 0; + rmap_head->val = 0; } else { rmap_printk("pte_list_remove: %p many->many\n", spte); - desc = (struct pte_list_desc *)(*pte_list & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); prev_desc = NULL; while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) + for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { if (desc->sptes[i] == spte) { - pte_list_desc_remove_entry(pte_list, - desc, i, - prev_desc); + pte_list_desc_remove_entry(rmap_head, + desc, i, prev_desc); return; } + } prev_desc = desc; desc = desc->more; } @@ -992,28 +1007,8 @@ static void pte_list_remove(u64 *spte, unsigned long *pte_list) } } -typedef void (*pte_list_walk_fn) (u64 *spte); -static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) -{ - struct pte_list_desc *desc; - int i; - - if (!*pte_list) - return; - - if (!(*pte_list & 1)) - return fn((u64 *)*pte_list); - - desc = (struct pte_list_desc *)(*pte_list & ~1ul); - while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) - fn(desc->sptes[i]); - desc = desc->more; - } -} - -static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, - struct kvm_memory_slot *slot) +static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, + struct kvm_memory_slot *slot) { unsigned long idx; @@ -1021,10 +1016,8 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx]; } -/* - * Take gfn and return the reverse mapping to it. - */ -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp) +static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, + struct kvm_mmu_page *sp) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; @@ -1045,24 +1038,24 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu) static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { struct kvm_mmu_page *sp; - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; sp = page_header(__pa(spte)); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); - return pte_list_add(vcpu, spte, rmapp); + rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); + return pte_list_add(vcpu, spte, rmap_head); } static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_mmu_page *sp; gfn_t gfn; - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; sp = page_header(__pa(spte)); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); - rmapp = gfn_to_rmap(kvm, gfn, sp); - pte_list_remove(spte, rmapp); + rmap_head = gfn_to_rmap(kvm, gfn, sp); + pte_list_remove(spte, rmap_head); } /* @@ -1082,19 +1075,26 @@ struct rmap_iterator { * * Returns sptep if found, NULL otherwise. */ -static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter) +static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, + struct rmap_iterator *iter) { - if (!rmap) + u64 *sptep; + + if (!rmap_head->val) return NULL; - if (!(rmap & 1)) { + if (!(rmap_head->val & 1)) { iter->desc = NULL; - return (u64 *)rmap; + sptep = (u64 *)rmap_head->val; + goto out; } - iter->desc = (struct pte_list_desc *)(rmap & ~1ul); + iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); iter->pos = 0; - return iter->desc->sptes[iter->pos]; + sptep = iter->desc->sptes[iter->pos]; +out: + BUG_ON(!is_shadow_present_pte(*sptep)); + return sptep; } /* @@ -1104,14 +1104,14 @@ static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter) */ static u64 *rmap_get_next(struct rmap_iterator *iter) { + u64 *sptep; + if (iter->desc) { if (iter->pos < PTE_LIST_EXT - 1) { - u64 *sptep; - ++iter->pos; sptep = iter->desc->sptes[iter->pos]; if (sptep) - return sptep; + goto out; } iter->desc = iter->desc->more; @@ -1119,17 +1119,20 @@ static u64 *rmap_get_next(struct rmap_iterator *iter) if (iter->desc) { iter->pos = 0; /* desc->sptes[0] cannot be NULL */ - return iter->desc->sptes[iter->pos]; + sptep = iter->desc->sptes[iter->pos]; + goto out; } } return NULL; +out: + BUG_ON(!is_shadow_present_pte(*sptep)); + return sptep; } -#define for_each_rmap_spte(_rmap_, _iter_, _spte_) \ - for (_spte_ = rmap_get_first(*_rmap_, _iter_); \ - _spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;}); \ - _spte_ = rmap_get_next(_iter_)) +#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \ + for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \ + _spte_; _spte_ = rmap_get_next(_iter_)) static void drop_spte(struct kvm *kvm, u64 *sptep) { @@ -1187,14 +1190,15 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect) return mmu_spte_update(sptep, spte); } -static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, +static bool __rmap_write_protect(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, bool pt_protect) { u64 *sptep; struct rmap_iterator iter; bool flush = false; - for_each_rmap_spte(rmapp, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) flush |= spte_write_protect(kvm, sptep, pt_protect); return flush; @@ -1211,13 +1215,13 @@ static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep) return mmu_spte_update(sptep, spte); } -static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp) +static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; struct rmap_iterator iter; bool flush = false; - for_each_rmap_spte(rmapp, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) flush |= spte_clear_dirty(kvm, sptep); return flush; @@ -1234,13 +1238,13 @@ static bool spte_set_dirty(struct kvm *kvm, u64 *sptep) return mmu_spte_update(sptep, spte); } -static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp) +static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; struct rmap_iterator iter; bool flush = false; - for_each_rmap_spte(rmapp, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) flush |= spte_set_dirty(kvm, sptep); return flush; @@ -1260,12 +1264,12 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; while (mask) { - rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PT_PAGE_TABLE_LEVEL, slot); - __rmap_write_protect(kvm, rmapp, false); + rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PT_PAGE_TABLE_LEVEL, slot); + __rmap_write_protect(kvm, rmap_head, false); /* clear the first set bit */ mask &= mask - 1; @@ -1285,12 +1289,12 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; while (mask) { - rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PT_PAGE_TABLE_LEVEL, slot); - __rmap_clear_dirty(kvm, rmapp); + rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PT_PAGE_TABLE_LEVEL, slot); + __rmap_clear_dirty(kvm, rmap_head); /* clear the first set bit */ mask &= mask - 1; @@ -1322,28 +1326,27 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) { struct kvm_memory_slot *slot; - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; int i; bool write_protected = false; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(vcpu->kvm, rmapp, true); + rmap_head = __gfn_to_rmap(gfn, i, slot); + write_protected |= __rmap_write_protect(vcpu->kvm, rmap_head, true); } return write_protected; } -static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp) +static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; struct rmap_iterator iter; bool flush = false; - while ((sptep = rmap_get_first(*rmapp, &iter))) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); + while ((sptep = rmap_get_first(rmap_head, &iter))) { rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep); drop_spte(kvm, sptep); @@ -1353,14 +1356,14 @@ static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp) return flush; } -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data) { - return kvm_zap_rmapp(kvm, rmapp); + return kvm_zap_rmapp(kvm, rmap_head); } -static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data) { @@ -1369,13 +1372,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, int need_flush = 0; u64 new_spte; pte_t *ptep = (pte_t *)data; - pfn_t new_pfn; + kvm_pfn_t new_pfn; WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); restart: - for_each_rmap_spte(rmapp, &iter, sptep) { + for_each_rmap_spte(rmap_head, &iter, sptep) { rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", sptep, *sptep, gfn, level); @@ -1413,11 +1416,11 @@ struct slot_rmap_walk_iterator { /* output fields. */ gfn_t gfn; - unsigned long *rmap; + struct kvm_rmap_head *rmap; int level; /* private field. */ - unsigned long *end_rmap; + struct kvm_rmap_head *end_rmap; }; static void @@ -1476,7 +1479,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, unsigned long end, unsigned long data, int (*handler)(struct kvm *kvm, - unsigned long *rmapp, + struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, @@ -1520,7 +1523,8 @@ static int kvm_handle_hva_range(struct kvm *kvm, static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, unsigned long data, - int (*handler)(struct kvm *kvm, unsigned long *rmapp, + int (*handler)(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data)) @@ -1543,7 +1547,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); } -static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data) { @@ -1553,18 +1557,19 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, BUG_ON(!shadow_accessed_mask); - for_each_rmap_spte(rmapp, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) { if (*sptep & shadow_accessed_mask) { young = 1; clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } + } trace_kvm_age_page(gfn, level, slot, young); return young; } -static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data) { @@ -1580,11 +1585,12 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) goto out; - for_each_rmap_spte(rmapp, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) { if (*sptep & shadow_accessed_mask) { young = 1; break; } + } out: return young; } @@ -1593,14 +1599,14 @@ out: static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; sp = page_header(__pa(spte)); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); + rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); - kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0); + kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0); kvm_flush_remote_tlbs(vcpu->kvm); } @@ -1700,8 +1706,7 @@ static void drop_parent_pte(struct kvm_mmu_page *sp, mmu_spte_clear_no_track(parent_pte); } -static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, - u64 *parent_pte, int direct) +static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct) { struct kvm_mmu_page *sp; @@ -1717,8 +1722,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, * this feature. See the comments in kvm_zap_obsolete_pages(). */ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - sp->parent_ptes = 0; - mmu_page_add_parent_pte(vcpu, sp, parent_pte); kvm_mod_used_mmu_pages(vcpu->kvm, +1); return sp; } @@ -1726,7 +1729,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, static void mark_unsync(u64 *spte); static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { - pte_list_walk(&sp->parent_ptes, mark_unsync); + u64 *sptep; + struct rmap_iterator iter; + + for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) { + mark_unsync(sptep); + } } static void mark_unsync(u64 *spte) @@ -1786,6 +1794,13 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, return (pvec->nr == KVM_PAGE_ARRAY_NR); } +static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx) +{ + --sp->unsync_children; + WARN_ON((int)sp->unsync_children < 0); + __clear_bit(idx, sp->unsync_child_bitmap); +} + static int __mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec) { @@ -1795,8 +1810,10 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_page *child; u64 ent = sp->spt[i]; - if (!is_shadow_present_pte(ent) || is_large_pte(ent)) - goto clear_child_bitmap; + if (!is_shadow_present_pte(ent) || is_large_pte(ent)) { + clear_unsync_child_bit(sp, i); + continue; + } child = page_header(ent & PT64_BASE_ADDR_MASK); @@ -1805,28 +1822,21 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, return -ENOSPC; ret = __mmu_unsync_walk(child, pvec); - if (!ret) - goto clear_child_bitmap; - else if (ret > 0) + if (!ret) { + clear_unsync_child_bit(sp, i); + continue; + } else if (ret > 0) { nr_unsync_leaf += ret; - else + } else return ret; } else if (child->unsync) { nr_unsync_leaf++; if (mmu_pages_add(pvec, child, i)) return -ENOSPC; } else - goto clear_child_bitmap; - - continue; - -clear_child_bitmap: - __clear_bit(i, sp->unsync_child_bitmap); - sp->unsync_children--; - WARN_ON((int)sp->unsync_children < 0); + clear_unsync_child_bit(sp, i); } - return nr_unsync_leaf; } @@ -1989,9 +1999,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents) if (!sp) return; - --sp->unsync_children; - WARN_ON((int)sp->unsync_children < 0); - __clear_bit(idx, sp->unsync_child_bitmap); + clear_unsync_child_bit(sp, idx); level++; } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); } @@ -2033,14 +2041,6 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, } } -static void init_shadow_page_table(struct kvm_mmu_page *sp) -{ - int i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - sp->spt[i] = 0ull; -} - static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) { sp->write_flooding_count = 0; @@ -2063,8 +2063,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gva_t gaddr, unsigned level, int direct, - unsigned access, - u64 *parent_pte) + unsigned access) { union kvm_mmu_page_role role; unsigned quadrant; @@ -2096,21 +2095,18 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) break; - mmu_page_add_parent_pte(vcpu, sp, parent_pte); - if (sp->unsync_children) { + if (sp->unsync_children) kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - kvm_mmu_mark_parents_unsync(sp); - } else if (sp->unsync) - kvm_mmu_mark_parents_unsync(sp); __clear_sp_write_flooding_count(sp); trace_kvm_mmu_get_page(sp, false); return sp; } + ++vcpu->kvm->stat.mmu_cache_miss; - sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); - if (!sp) - return sp; + + sp = kvm_mmu_alloc_page(vcpu, direct); + sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, @@ -2124,7 +2120,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, account_shadowed(vcpu->kvm, sp); } sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; - init_shadow_page_table(sp); + clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); return sp; } @@ -2178,7 +2174,8 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) return __shadow_walk_next(iterator, *iterator->sptep); } -static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed) +static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, + struct kvm_mmu_page *sp) { u64 spte; @@ -2186,12 +2183,14 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed) VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask; - - if (accessed) - spte |= shadow_accessed_mask; + shadow_user_mask | shadow_x_mask | shadow_accessed_mask; mmu_spte_set(sptep, spte); + + mmu_page_add_parent_pte(vcpu, sp, sptep); + + if (sp->unsync_children || sp->unsync) + mark_unsync(sptep); } static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, @@ -2250,17 +2249,12 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, mmu_page_zap_pte(kvm, sp, sp->spt + i); } -static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) -{ - mmu_page_remove_parent_pte(sp, parent_pte); -} - static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *sptep; struct rmap_iterator iter; - while ((sptep = rmap_get_first(sp->parent_ptes, &iter))) + while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) drop_parent_pte(sp, sptep); } @@ -2456,7 +2450,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return 0; } -static bool kvm_is_mmio_pfn(pfn_t pfn) +static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) { if (pfn_valid(pfn)) return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)); @@ -2466,7 +2460,7 @@ static bool kvm_is_mmio_pfn(pfn_t pfn) static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int level, - gfn_t gfn, pfn_t pfn, bool speculative, + gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { u64 spte; @@ -2544,18 +2538,18 @@ done: return ret; } -static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned pte_access, int write_fault, int *emulate, - int level, gfn_t gfn, pfn_t pfn, bool speculative, - bool host_writable) +static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, + int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, + bool speculative, bool host_writable) { int was_rmapped = 0; int rmap_count; + bool emulate = false; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); - if (is_rmap_spte(*sptep)) { + if (is_shadow_present_pte(*sptep)) { /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. @@ -2580,12 +2574,12 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, true, host_writable)) { if (write_fault) - *emulate = 1; + emulate = true; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } - if (unlikely(is_mmio_spte(*sptep) && emulate)) - *emulate = 1; + if (unlikely(is_mmio_spte(*sptep))) + emulate = true; pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", @@ -2604,9 +2598,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } kvm_release_pfn_clean(pfn); + + return emulate; } -static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, +static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { struct kvm_memory_slot *slot; @@ -2638,9 +2634,8 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, return -1; for (i = 0; i < ret; i++, gfn++, start++) - mmu_set_spte(vcpu, start, access, 0, NULL, - sp->role.level, gfn, page_to_pfn(pages[i]), - true, true); + mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, + page_to_pfn(pages[i]), true, true); return 0; } @@ -2688,9 +2683,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int map_writable, int level, gfn_t gfn, pfn_t pfn, - bool prefault) +static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, + int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -2702,9 +2696,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { if (iterator.level == level) { - mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, - write, &emulate, level, gfn, pfn, - prefault, map_writable); + emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, + write, level, gfn, pfn, prefault, + map_writable); direct_pte_prefetch(vcpu, iterator.sptep); ++vcpu->stat.pf_fixed; break; @@ -2717,10 +2711,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, base_addr &= PT64_LVL_ADDR_MASK(iterator.level); pseudo_gfn = base_addr >> PAGE_SHIFT; sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, - iterator.level - 1, - 1, ACC_ALL, iterator.sptep); + iterator.level - 1, 1, ACC_ALL); - link_shadow_page(iterator.sptep, sp, true); + link_shadow_page(vcpu, iterator.sptep, sp); } } return emulate; @@ -2739,7 +2732,7 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * send_sig_info(SIGBUS, &info, tsk); } -static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) +static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) { /* * Do not cache the mmio info caused by writing the readonly gfn @@ -2759,9 +2752,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) } static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t *gfnp, pfn_t *pfnp, int *levelp) + gfn_t *gfnp, kvm_pfn_t *pfnp, + int *levelp) { - pfn_t pfn = *pfnp; + kvm_pfn_t pfn = *pfnp; gfn_t gfn = *gfnp; int level = *levelp; @@ -2800,7 +2794,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, } static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, - pfn_t pfn, unsigned access, int *ret_val) + kvm_pfn_t pfn, unsigned access, int *ret_val) { bool ret = true; @@ -2899,7 +2893,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * If the mapping has been changed, let the vcpu fault on the * same address again. */ - if (!is_rmap_spte(spte)) { + if (!is_shadow_present_pte(spte)) { ret = true; goto exit; } @@ -2954,7 +2948,7 @@ exit: } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gva_t gva, pfn_t *pfn, bool write, bool *writable); + gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable); static void make_mmu_pages_available(struct kvm_vcpu *vcpu); static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, @@ -2962,14 +2956,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - int force_pt_level; - pfn_t pfn; + bool force_pt_level = false; + kvm_pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; - force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); /* * This path builds a PAE pagetable - so we can map * 2mb pages at maximum. Therefore check if the level @@ -2979,8 +2972,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, v, level, error_code)) return 0; @@ -3000,11 +2992,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, make_mmu_pages_available(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, - prefault); + r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); spin_unlock(&vcpu->kvm->mmu_lock); - return r; out_unlock: @@ -3079,8 +3069,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); - sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, - 1, ACC_ALL, NULL); + sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = __pa(sp->spt); @@ -3092,9 +3081,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), - i << 30, - PT32_ROOT_LEVEL, 1, ACC_ALL, - NULL); + i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); @@ -3131,7 +3118,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, - 0, ACC_ALL, NULL); + 0, ACC_ALL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); @@ -3164,9 +3151,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) } spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); - sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, 0, - ACC_ALL, NULL); + sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, + 0, ACC_ALL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); @@ -3341,7 +3327,7 @@ exit: return reserved; } -int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) +int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; bool reserved; @@ -3350,7 +3336,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) return RET_MMIO_PF_EMULATE; reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); - if (unlikely(reserved)) + if (WARN_ON(reserved)) return RET_MMIO_PF_BUG; if (is_mmio_spte(spte)) { @@ -3374,17 +3360,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) */ return RET_MMIO_PF_RETRY; } -EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); - -static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, - u32 error_code, bool direct) -{ - int ret; - - ret = handle_mmio_page_fault_common(vcpu, addr, direct); - WARN_ON(ret == RET_MMIO_PF_BUG); - return ret; -} +EXPORT_SYMBOL_GPL(handle_mmio_page_fault); static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, bool prefault) @@ -3395,7 +3371,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gva, error_code, true); + r = handle_mmio_page_fault(vcpu, gva, true); if (likely(r != RET_MMIO_PF_INVALID)) return r; @@ -3427,7 +3403,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) static bool can_do_async_pf(struct kvm_vcpu *vcpu) { - if (unlikely(!irqchip_in_kernel(vcpu->kvm) || + if (unlikely(!lapic_in_kernel(vcpu) || kvm_event_needs_reinjection(vcpu))) return false; @@ -3435,7 +3411,7 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu) } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gva_t gva, pfn_t *pfn, bool write, bool *writable) + gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable) { struct kvm_memory_slot *slot; bool async; @@ -3473,10 +3449,10 @@ check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, bool prefault) { - pfn_t pfn; + kvm_pfn_t pfn; int r; int level; - int force_pt_level; + bool force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; @@ -3485,7 +3461,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gpa, error_code, true); + r = handle_mmio_page_fault(vcpu, gpa, true); if (likely(r != RET_MMIO_PF_INVALID)) return r; @@ -3495,20 +3471,15 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - if (mapping_level_dirty_bitmap(vcpu, gfn) || - !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL)) - force_pt_level = 1; - else - force_pt_level = 0; - + force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, + PT_DIRECTORY_LEVEL); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); if (level > PT_DIRECTORY_LEVEL && !check_hugepage_cache_consistency(vcpu, gfn, level)) level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, gpa, level, error_code)) return 0; @@ -3528,8 +3499,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, make_mmu_pages_available(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, - level, gfn, pfn, prefault); + r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); spin_unlock(&vcpu->kvm->mmu_lock); return r; @@ -3706,7 +3676,7 @@ static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, int maxphyaddr, bool execonly) { - int pte; + u64 bad_mt_xwr; rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); @@ -3724,14 +3694,16 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; - for (pte = 0; pte < 64; pte++) { - int rwx_bits = pte & 7; - int mt = pte >> 3; - if (mt == 0x2 || mt == 0x3 || mt == 0x7 || - rwx_bits == 0x2 || rwx_bits == 0x6 || - (rwx_bits == 0x4 && !execonly)) - rsvd_check->bad_mt_xwr |= (1ull << pte); + bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ + bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ + bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ + if (!execonly) { + /* bits 0..2 must not be 100 unless VMX capabilities allow it */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 4); } + rsvd_check->bad_mt_xwr = bad_mt_xwr; } static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, @@ -4053,10 +4025,12 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->inject_page_fault = kvm_inject_page_fault; /* - * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The - * translation of l2_gpa to l1_gpa addresses is done using the - * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa - * functions between mmu and nested_mmu are swapped. + * Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using + * L1's nested page tables (e.g. EPT12). The nested translation + * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using + * L2's page tables as the first level of translation and L1's + * nested page tables as the second level of translation. Basically + * the gva_to_gpa functions between mmu and nested_mmu are swapped. */ if (!is_paging(vcpu)) { g_context->nx = false; @@ -4490,7 +4464,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) } /* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap); +typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); /* The caller should hold mmu-lock before calling this function. */ static bool @@ -4584,9 +4558,10 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) spin_unlock(&kvm->mmu_lock); } -static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp) +static bool slot_rmap_write_protect(struct kvm *kvm, + struct kvm_rmap_head *rmap_head) { - return __rmap_write_protect(kvm, rmapp, false); + return __rmap_write_protect(kvm, rmap_head, false); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, @@ -4622,16 +4597,16 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, } static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, - unsigned long *rmapp) + struct kvm_rmap_head *rmap_head) { u64 *sptep; struct rmap_iterator iter; int need_tlb_flush = 0; - pfn_t pfn; + kvm_pfn_t pfn; struct kvm_mmu_page *sp; restart: - for_each_rmap_spte(rmapp, &iter, sptep) { + for_each_rmap_spte(rmap_head, &iter, sptep) { sp = page_header(__pa(sptep)); pfn = spte_to_pfn(*sptep); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e4202e41d535..55ffb7b0f95e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -56,13 +56,13 @@ void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); /* - * Return values of handle_mmio_page_fault_common: + * Return values of handle_mmio_page_fault: * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction * directly. * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page * fault path update the mmio spte. * RET_MMIO_PF_RETRY: let CPU fault again on the address. - * RET_MMIO_PF_BUG: bug is detected. + * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed). */ enum { RET_MMIO_PF_EMULATE = 1, @@ -71,7 +71,7 @@ enum { RET_MMIO_PF_BUG = -1 }; -int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); +int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 03d518e499a6..dcce533d420c 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -97,7 +97,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) { struct kvm_mmu_page *sp; gfn_t gfn; - pfn_t pfn; + kvm_pfn_t pfn; hpa_t hpa; sp = page_header(__pa(sptep)); @@ -129,7 +129,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) { static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *rev_sp; struct kvm_memslots *slots; struct kvm_memory_slot *slot; @@ -150,8 +150,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmapp = __gfn_to_rmap(gfn, rev_sp->role.level, slot); - if (!*rmapp) { + rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot); + if (!rmap_head->val) { if (!__ratelimit(&ratelimit_state)) return; audit_printk(kvm, "no rmap for writable spte %llx\n", @@ -183,7 +183,7 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) return; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (!is_rmap_spte(sp->spt[i])) + if (!is_shadow_present_pte(sp->spt[i])) continue; inspect_spte_has_rmap(kvm, sp->spt + i); @@ -192,7 +192,7 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) { - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; u64 *sptep; struct rmap_iterator iter; struct kvm_memslots *slots; @@ -203,13 +203,14 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, sp->gfn); - rmapp = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot); + rmap_head = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot); - for_each_rmap_spte(rmapp, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) { if (is_writable_pte(*sptep)) audit_printk(kvm, "shadow page has writable " "mappings: gfn %llx role %x\n", sp->gfn, sp->role.word); + } } static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 9e8bf13572e6..3f8c732117ec 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -120,14 +120,22 @@ static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state) return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK; } -static u8 mtrr_disabled_type(void) +static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu) { /* * Intel SDM 11.11.2.2: all MTRRs are disabled when * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC * memory type is applied to all of physical memory. + * + * However, virtual machines can be run with CPUID such that + * there are no MTRRs. In that case, the firmware will never + * enable MTRRs and it is obviously undesirable to run the + * guest entirely with UC memory and we use WB. */ - return MTRR_TYPE_UNCACHABLE; + if (guest_cpuid_has_mtrr(vcpu)) + return MTRR_TYPE_UNCACHABLE; + else + return MTRR_TYPE_WRBACK; } /* @@ -267,7 +275,7 @@ static int fixed_mtrr_addr_to_seg(u64 addr) for (seg = 0; seg < seg_num; seg++) { mtrr_seg = &fixed_seg_table[seg]; - if (mtrr_seg->start >= addr && addr < mtrr_seg->end) + if (mtrr_seg->start <= addr && addr < mtrr_seg->end) return seg; } @@ -300,7 +308,6 @@ static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end) *start = range->base & PAGE_MASK; mask = range->mask & PAGE_MASK; - mask |= ~0ULL << boot_cpu_data.x86_phys_bits; /* This cannot overflow because writing to the reserved bits of * variable MTRRs causes a #GP. @@ -356,10 +363,14 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (var_mtrr_range_is_valid(cur)) list_del(&mtrr_state->var_ranges[index].node); + /* Extend the mask with all 1 bits to the left, since those + * bits must implicitly be 0. The bits are then cleared + * when reading them. + */ if (!is_mtrr_mask) cur->base = data; else - cur->mask = data; + cur->mask = data | (-1LL << cpuid_maxphyaddr(vcpu)); /* add it to the list if it's enabled. */ if (var_mtrr_range_is_valid(cur)) { @@ -426,6 +437,8 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) *pdata = vcpu->arch.mtrr_state.var_ranges[index].base; else *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask; + + *pdata &= (1ULL << cpuid_maxphyaddr(vcpu)) - 1; } return 0; @@ -670,7 +683,7 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) } if (iter.mtrr_disabled) - return mtrr_disabled_type(); + return mtrr_disabled_type(vcpu); /* not contained in any MTRRs. */ if (type == -1) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 736e6ab8784d..6c9fed957cce 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -456,7 +456,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, { unsigned pte_access; gfn_t gfn; - pfn_t pfn; + kvm_pfn_t pfn; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false; @@ -475,8 +475,8 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * we call mmu_set_spte() with host_writable = true because * pte_prefetch_gfn_to_pfn always gets a writable pfn. */ - mmu_set_spte(vcpu, spte, pte_access, 0, NULL, PT_PAGE_TABLE_LEVEL, - gfn, pfn, true, true); + mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn, + true, true); return true; } @@ -551,12 +551,12 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int write_fault, int hlevel, - pfn_t pfn, bool map_writable, bool prefault) + kvm_pfn_t pfn, bool map_writable, bool prefault) { struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; - int top_level, emulate = 0; + int top_level, emulate; direct_access = gw->pte_access; @@ -587,7 +587,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (!is_shadow_present_pte(*it.sptep)) { table_gfn = gw->table_gfn[it.level - 2]; sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, - false, access, it.sptep); + false, access); } /* @@ -598,7 +598,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, goto out_gpte_changed; if (sp) - link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); + link_shadow_page(vcpu, it.sptep, sp); } for (; @@ -617,20 +617,18 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, - true, direct_access, it.sptep); - link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); + true, direct_access); + link_shadow_page(vcpu, it.sptep, sp); } clear_sp_write_flooding_count(it.sptep); - mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, &emulate, - it.level, gw->gfn, pfn, prefault, map_writable); + emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, + it.level, gw->gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); return emulate; out_gpte_changed: - if (sp) - kvm_mmu_put_page(sp, it.sptep); kvm_release_pfn_clean(pfn); return 0; } @@ -696,17 +694,16 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; int r; - pfn_t pfn; + kvm_pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - int force_pt_level; + bool force_pt_level = false; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, addr, error_code, - mmu_is_nested(vcpu)); + r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu)); if (likely(r != RET_MMIO_PF_INVALID)) return r; @@ -743,15 +740,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); - if (walker.level >= PT_DIRECTORY_LEVEL) - force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) - || is_self_change_mapping; - else - force_pt_level = 1; - if (!force_pt_level) { - level = min(walker.level, mapping_level(vcpu, walker.gfn)); - walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); - } + if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { + level = mapping_level(vcpu, walker.gfn, &force_pt_level); + if (likely(!force_pt_level)) { + level = min(walker.level, level); + walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); + } + } else + force_pt_level = true; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2f9ed1ff0632..c13a64b7d789 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -86,6 +86,7 @@ static const u32 host_save_user_msrs[] = { MSR_FS_BASE, #endif MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_TSC_AUX, }; #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) @@ -135,6 +136,7 @@ struct vcpu_svm { uint64_t asid_generation; uint64_t sysenter_esp; uint64_t sysenter_eip; + uint64_t tsc_aux; u64 next_rip; @@ -158,7 +160,8 @@ struct vcpu_svm { unsigned long int3_rip; u32 apf_reason; - u64 tsc_ratio; + /* cached guest cpuid flags for faster access */ + bool nrips_enabled : 1; }; static DEFINE_PER_CPU(u64, current_tsc_ratio); @@ -211,7 +214,6 @@ static int nested_svm_intercept(struct vcpu_svm *svm); static int nested_svm_vmexit(struct vcpu_svm *svm); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); -static u64 __scale_tsc(u64 ratio, u64 tsc); enum { VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, @@ -891,20 +893,9 @@ static __init int svm_hardware_setup(void) kvm_enable_efer_bits(EFER_FFXSR); if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - u64 max; - kvm_has_tsc_control = true; - - /* - * Make sure the user can only configure tsc_khz values that - * fit into a signed integer. - * A min value is not calculated needed because it will always - * be 1 on all machines and a value of 0 is used to disable - * tsc-scaling for the vcpu. - */ - max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX)); - - kvm_max_guest_tsc_khz = max; + kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; + kvm_tsc_scaling_ratio_frac_bits = 32; } if (nested) { @@ -968,68 +959,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static u64 __scale_tsc(u64 ratio, u64 tsc) -{ - u64 mult, frac, _tsc; - - mult = ratio >> 32; - frac = ratio & ((1ULL << 32) - 1); - - _tsc = tsc; - _tsc *= mult; - _tsc += (tsc >> 32) * frac; - _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32; - - return _tsc; -} - -static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 _tsc = tsc; - - if (svm->tsc_ratio != TSC_RATIO_DEFAULT) - _tsc = __scale_tsc(svm->tsc_ratio, tsc); - - return _tsc; -} - -static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 ratio; - u64 khz; - - /* Guest TSC same frequency as host TSC? */ - if (!scale) { - svm->tsc_ratio = TSC_RATIO_DEFAULT; - return; - } - - /* TSC scaling supported? */ - if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - if (user_tsc_khz > tsc_khz) { - vcpu->arch.tsc_catchup = 1; - vcpu->arch.tsc_always_catchup = 1; - } else - WARN(1, "user requested TSC rate below hardware speed\n"); - return; - } - - khz = user_tsc_khz; - - /* TSC scaling required - calculate ratio */ - ratio = khz << 32; - do_div(ratio, tsc_khz); - - if (ratio == 0 || ratio & TSC_RATIO_RSVD) { - WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n", - user_tsc_khz); - return; - } - svm->tsc_ratio = ratio; -} - static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1056,16 +985,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) +static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { struct vcpu_svm *svm = to_svm(vcpu); - if (host) { - if (svm->tsc_ratio != TSC_RATIO_DEFAULT) - WARN_ON(adjustment < 0); - adjustment = svm_scale_tsc(vcpu, (u64)adjustment); - } - svm->vmcb->control.tsc_offset += adjustment; if (is_guest_mode(vcpu)) svm->nested.hsave->control.tsc_offset += adjustment; @@ -1077,16 +1000,7 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) -{ - u64 tsc; - - tsc = svm_scale_tsc(vcpu, rdtsc()); - - return target_tsc - tsc; -} - -static void init_vmcb(struct vcpu_svm *svm, bool init_event) +static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; @@ -1107,6 +1021,8 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event) set_exception_intercept(svm, PF_VECTOR); set_exception_intercept(svm, UD_VECTOR); set_exception_intercept(svm, MC_VECTOR); + set_exception_intercept(svm, AC_VECTOR); + set_exception_intercept(svm, DB_VECTOR); set_intercept(svm, INTERCEPT_INTR); set_intercept(svm, INTERCEPT_NMI); @@ -1157,8 +1073,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - if (!init_event) - svm_set_efer(&svm->vcpu, 0); + svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; kvm_set_rflags(&svm->vcpu, 2); save->rip = 0x0000fff0; @@ -1212,7 +1127,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; } - init_vmcb(svm, init_event); + init_vmcb(svm); kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); kvm_register_write(vcpu, VCPU_REGS_RDX, eax); @@ -1233,8 +1148,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto out; } - svm->tsc_ratio = TSC_RATIO_DEFAULT; - err = kvm_vcpu_init(&svm->vcpu, kvm, id); if (err) goto free_svm; @@ -1268,7 +1181,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; - init_vmcb(svm, false); + init_vmcb(svm); svm_init_osvw(&svm->vcpu); @@ -1320,11 +1233,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - if (static_cpu_has(X86_FEATURE_TSCRATEMSR) && - svm->tsc_ratio != __this_cpu_read(current_tsc_ratio)) { - __this_cpu_write(current_tsc_ratio, svm->tsc_ratio); - wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio); + if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { + u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio; + if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) { + __this_cpu_write(current_tsc_ratio, tsc_ratio); + wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio); + } } + /* This assumes that the kernel never uses MSR_TSC_AUX */ + if (static_cpu_has(X86_FEATURE_RDTSCP)) + wrmsrl(MSR_TSC_AUX, svm->tsc_aux); } static void svm_vcpu_put(struct kvm_vcpu *vcpu) @@ -1642,20 +1560,13 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, mark_dirty(svm->vmcb, VMCB_SEG); } -static void update_db_bp_intercept(struct kvm_vcpu *vcpu) +static void update_bp_intercept(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - clr_exception_intercept(svm, DB_VECTOR); clr_exception_intercept(svm, BP_VECTOR); - if (svm->nmi_singlestep) - set_exception_intercept(svm, DB_VECTOR); - if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - set_exception_intercept(svm, DB_VECTOR); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) set_exception_intercept(svm, BP_VECTOR); } else @@ -1761,7 +1672,6 @@ static int db_interception(struct vcpu_svm *svm) if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_bp_intercept(&svm->vcpu); } if (svm->vcpu.guest_debug & @@ -1796,6 +1706,12 @@ static int ud_interception(struct vcpu_svm *svm) return 1; } +static int ac_interception(struct vcpu_svm *svm) +{ + kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0); + return 1; +} + static void svm_fpu_activate(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1890,7 +1806,7 @@ static int shutdown_interception(struct vcpu_svm *svm) * so reinitialize it. */ clear_page(svm->vmcb); - init_vmcb(svm, false); + init_vmcb(svm); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; @@ -2365,7 +2281,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; - nested_vmcb->control.next_rip = vmcb->control.next_rip; + + if (svm->nrips_enabled) + nested_vmcb->control.next_rip = vmcb->control.next_rip; /* * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have @@ -3060,7 +2978,7 @@ static int cr8_write_interception(struct vcpu_svm *svm) u8 cr8_prev = kvm_get_cr8(&svm->vcpu); /* instruction emulation calls kvm_set_cr8() */ r = cr_interception(svm); - if (irqchip_in_kernel(svm->vcpu.kvm)) + if (lapic_in_kernel(&svm->vcpu)) return r; if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) return r; @@ -3071,8 +2989,7 @@ static int cr8_write_interception(struct vcpu_svm *svm) static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); - return vmcb->control.tsc_offset + - svm_scale_tsc(vcpu, host_tsc); + return vmcb->control.tsc_offset + host_tsc; } static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) @@ -3082,7 +2999,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_IA32_TSC: { msr_info->data = svm->vmcb->control.tsc_offset + - svm_scale_tsc(vcpu, rdtsc()); + kvm_scale_tsc(vcpu, rdtsc()); break; } @@ -3112,6 +3029,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: msr_info->data = svm->sysenter_esp; break; + case MSR_TSC_AUX: + if (!boot_cpu_has(X86_FEATURE_RDTSCP)) + return 1; + msr_info->data = svm->tsc_aux; + break; /* * Nobody will change the following 5 values in the VMCB so we can * safely return them on rdmsr. They will always be 0 until LBRV is @@ -3141,6 +3063,23 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_UCODE_REV: msr_info->data = 0x01000065; break; + case MSR_F15H_IC_CFG: { + + int family, model; + + family = guest_cpuid_family(vcpu); + model = guest_cpuid_model(vcpu); + + if (family < 0 || model < 0) + return kvm_get_msr_common(vcpu, msr_info); + + msr_info->data = 0; + + if (family == 0x15 && + (model >= 0x2 && model < 0x20)) + msr_info->data = 0x1E; + } + break; default: return kvm_get_msr_common(vcpu, msr_info); } @@ -3233,6 +3172,18 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->sysenter_esp = data; svm->vmcb->save.sysenter_esp = data; break; + case MSR_TSC_AUX: + if (!boot_cpu_has(X86_FEATURE_RDTSCP)) + return 1; + + /* + * This is rare, so we update the MSR here instead of using + * direct_access_msrs. Doing that would require a rdmsr in + * svm_vcpu_put. + */ + svm->tsc_aux = data; + wrmsrl(MSR_TSC_AUX, svm->tsc_aux); + break; case MSR_IA32_DEBUGCTLMSR: if (!boot_cpu_has(X86_FEATURE_LBRV)) { vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", @@ -3294,24 +3245,11 @@ static int msr_interception(struct vcpu_svm *svm) static int interrupt_window_interception(struct vcpu_svm *svm) { - struct kvm_run *kvm_run = svm->vcpu.run; - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; mark_dirty(svm->vmcb, VMCB_INTR); ++svm->vcpu.stat.irq_window_exits; - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (!irqchip_in_kernel(svm->vcpu.kvm) && - kvm_run->request_interrupt_window && - !kvm_cpu_has_interrupt(&svm->vcpu)) { - kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - return 0; - } - return 1; } @@ -3371,6 +3309,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, + [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, [SVM_EXIT_INTR] = intr_interception, [SVM_EXIT_NMI] = nmi_interception, [SVM_EXIT_SMI] = nop_on_interception, @@ -3522,6 +3461,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; + trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); + if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) @@ -3659,9 +3600,13 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } -static int svm_vm_has_apicv(struct kvm *kvm) +static bool svm_get_enable_apicv(void) +{ + return false; +} + +static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { - return 0; } static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) @@ -3754,7 +3699,6 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) */ svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_bp_intercept(vcpu); } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -3993,8 +3937,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM); - if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_handle_nmi(&svm->vcpu); @@ -4098,6 +4040,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) static void svm_cpuid_update(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + + /* Update nrips enabled cache */ + svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu); } static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -4134,7 +4080,7 @@ static int svm_get_lpage_level(void) static bool svm_rdtscp_supported(void) { - return false; + return boot_cpu_has(X86_FEATURE_RDTSCP); } static bool svm_invpcid_supported(void) @@ -4376,7 +4322,7 @@ static struct kvm_x86_ops svm_x86_ops = { .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, - .update_db_bp_intercept = update_db_bp_intercept, + .update_bp_intercept = update_bp_intercept, .get_msr = svm_get_msr, .set_msr = svm_set_msr, .get_segment_base = svm_get_segment_base, @@ -4425,7 +4371,8 @@ static struct kvm_x86_ops svm_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, - .vm_has_apicv = svm_vm_has_apicv, + .get_enable_apicv = svm_get_enable_apicv, + .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, .sync_pir_to_irr = svm_sync_pir_to_irr, @@ -4448,11 +4395,9 @@ static struct kvm_x86_ops svm_x86_ops = { .has_wbinvd_exit = svm_has_wbinvd_exit, - .set_tsc_khz = svm_set_tsc_khz, .read_tsc_offset = svm_read_tsc_offset, .write_tsc_offset = svm_write_tsc_offset, - .adjust_tsc_offset = svm_adjust_tsc_offset, - .compute_tsc_offset = svm_compute_tsc_offset, + .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest, .read_l1_tsc = svm_read_l1_tsc, .set_tdp_cr3 = set_tdp_cr3, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 4eae7c35ddf5..ad9f6a23f139 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -129,6 +129,24 @@ TRACE_EVENT(kvm_pio, ); /* + * Tracepoint for fast mmio. + */ +TRACE_EVENT(kvm_fast_mmio, + TP_PROTO(u64 gpa), + TP_ARGS(gpa), + + TP_STRUCT__entry( + __field(u64, gpa) + ), + + TP_fast_assign( + __entry->gpa = gpa; + ), + + TP_printk("fast mmio at gpa 0x%llx", __entry->gpa) +); + +/* * Tracepoint for cpuid. */ TRACE_EVENT(kvm_cpuid, @@ -250,7 +268,7 @@ TRACE_EVENT(kvm_inj_virq, #define kvm_trace_sym_exc \ EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \ - EXS(MF), EXS(MC) + EXS(MF), EXS(AC), EXS(MC) /* * Tracepoint for kvm interrupt injection: @@ -974,6 +992,302 @@ TRACE_EVENT(kvm_enter_smm, __entry->smbase) ); +/* + * Tracepoint for VT-d posted-interrupts. + */ +TRACE_EVENT(kvm_pi_irte_update, + TP_PROTO(unsigned int vcpu_id, unsigned int gsi, + unsigned int gvec, u64 pi_desc_addr, bool set), + TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( unsigned int, gsi ) + __field( unsigned int, gvec ) + __field( u64, pi_desc_addr ) + __field( bool, set ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->gsi = gsi; + __entry->gvec = gvec; + __entry->pi_desc_addr = pi_desc_addr; + __entry->set = set; + ), + + TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, " + "gvec: 0x%x, pi_desc_addr: 0x%llx", + __entry->set ? "enabled and being updated" : "disabled", + __entry->vcpu_id, + __entry->gsi, + __entry->gvec, + __entry->pi_desc_addr) +); + +/* + * Tracepoint for kvm_hv_notify_acked_sint. + */ +TRACE_EVENT(kvm_hv_notify_acked_sint, + TP_PROTO(int vcpu_id, u32 sint), + TP_ARGS(vcpu_id, sint), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, sint) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->sint = sint; + ), + + TP_printk("vcpu_id %d sint %u", __entry->vcpu_id, __entry->sint) +); + +/* + * Tracepoint for synic_set_irq. + */ +TRACE_EVENT(kvm_hv_synic_set_irq, + TP_PROTO(int vcpu_id, u32 sint, int vector, int ret), + TP_ARGS(vcpu_id, sint, vector, ret), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, sint) + __field(int, vector) + __field(int, ret) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->sint = sint; + __entry->vector = vector; + __entry->ret = ret; + ), + + TP_printk("vcpu_id %d sint %u vector %d ret %d", + __entry->vcpu_id, __entry->sint, __entry->vector, + __entry->ret) +); + +/* + * Tracepoint for kvm_hv_synic_send_eoi. + */ +TRACE_EVENT(kvm_hv_synic_send_eoi, + TP_PROTO(int vcpu_id, int vector), + TP_ARGS(vcpu_id, vector), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, sint) + __field(int, vector) + __field(int, ret) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->vector = vector; + ), + + TP_printk("vcpu_id %d vector %d", __entry->vcpu_id, __entry->vector) +); + +/* + * Tracepoint for synic_set_msr. + */ +TRACE_EVENT(kvm_hv_synic_set_msr, + TP_PROTO(int vcpu_id, u32 msr, u64 data, bool host), + TP_ARGS(vcpu_id, msr, data, host), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, msr) + __field(u64, data) + __field(bool, host) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->msr = msr; + __entry->data = data; + __entry->host = host + ), + + TP_printk("vcpu_id %d msr 0x%x data 0x%llx host %d", + __entry->vcpu_id, __entry->msr, __entry->data, __entry->host) +); + +/* + * Tracepoint for stimer_set_config. + */ +TRACE_EVENT(kvm_hv_stimer_set_config, + TP_PROTO(int vcpu_id, int timer_index, u64 config, bool host), + TP_ARGS(vcpu_id, timer_index, config, host), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(int, timer_index) + __field(u64, config) + __field(bool, host) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->timer_index = timer_index; + __entry->config = config; + __entry->host = host; + ), + + TP_printk("vcpu_id %d timer %d config 0x%llx host %d", + __entry->vcpu_id, __entry->timer_index, __entry->config, + __entry->host) +); + +/* + * Tracepoint for stimer_set_count. + */ +TRACE_EVENT(kvm_hv_stimer_set_count, + TP_PROTO(int vcpu_id, int timer_index, u64 count, bool host), + TP_ARGS(vcpu_id, timer_index, count, host), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(int, timer_index) + __field(u64, count) + __field(bool, host) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->timer_index = timer_index; + __entry->count = count; + __entry->host = host; + ), + + TP_printk("vcpu_id %d timer %d count %llu host %d", + __entry->vcpu_id, __entry->timer_index, __entry->count, + __entry->host) +); + +/* + * Tracepoint for stimer_start(periodic timer case). + */ +TRACE_EVENT(kvm_hv_stimer_start_periodic, + TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 exp_time), + TP_ARGS(vcpu_id, timer_index, time_now, exp_time), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(int, timer_index) + __field(u64, time_now) + __field(u64, exp_time) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->timer_index = timer_index; + __entry->time_now = time_now; + __entry->exp_time = exp_time; + ), + + TP_printk("vcpu_id %d timer %d time_now %llu exp_time %llu", + __entry->vcpu_id, __entry->timer_index, __entry->time_now, + __entry->exp_time) +); + +/* + * Tracepoint for stimer_start(one-shot timer case). + */ +TRACE_EVENT(kvm_hv_stimer_start_one_shot, + TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 count), + TP_ARGS(vcpu_id, timer_index, time_now, count), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(int, timer_index) + __field(u64, time_now) + __field(u64, count) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->timer_index = timer_index; + __entry->time_now = time_now; + __entry->count = count; + ), + + TP_printk("vcpu_id %d timer %d time_now %llu count %llu", + __entry->vcpu_id, __entry->timer_index, __entry->time_now, + __entry->count) +); + +/* + * Tracepoint for stimer_timer_callback. + */ +TRACE_EVENT(kvm_hv_stimer_callback, + TP_PROTO(int vcpu_id, int timer_index), + TP_ARGS(vcpu_id, timer_index), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(int, timer_index) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->timer_index = timer_index; + ), + + TP_printk("vcpu_id %d timer %d", + __entry->vcpu_id, __entry->timer_index) +); + +/* + * Tracepoint for stimer_expiration. + */ +TRACE_EVENT(kvm_hv_stimer_expiration, + TP_PROTO(int vcpu_id, int timer_index, int msg_send_result), + TP_ARGS(vcpu_id, timer_index, msg_send_result), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(int, timer_index) + __field(int, msg_send_result) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->timer_index = timer_index; + __entry->msg_send_result = msg_send_result; + ), + + TP_printk("vcpu_id %d timer %d msg send result %d", + __entry->vcpu_id, __entry->timer_index, + __entry->msg_send_result) +); + +/* + * Tracepoint for stimer_cleanup. + */ +TRACE_EVENT(kvm_hv_stimer_cleanup, + TP_PROTO(int vcpu_id, int timer_index), + TP_ARGS(vcpu_id, timer_index), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(int, timer_index) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->timer_index = timer_index; + ), + + TP_printk("vcpu_id %d timer %d", + __entry->vcpu_id, __entry->timer_index) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6a8bc64566ab..e2951b6edbbc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -19,6 +19,7 @@ #include "irq.h" #include "mmu.h" #include "cpuid.h" +#include "lapic.h" #include <linux/kvm_host.h> #include <linux/module.h> @@ -35,6 +36,7 @@ #include "kvm_cache_regs.h" #include "x86.h" +#include <asm/cpu.h> #include <asm/io.h> #include <asm/desc.h> #include <asm/vmx.h> @@ -45,6 +47,7 @@ #include <asm/debugreg.h> #include <asm/kexec.h> #include <asm/apic.h> +#include <asm/irq_remapping.h> #include "trace.h" #include "pmu.h" @@ -105,6 +108,8 @@ static u64 __read_mostly host_xss; static bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); +#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ @@ -424,6 +429,9 @@ struct nested_vmx { /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ u64 vmcs01_debugctl; + u16 vpid02; + u16 last_vpid; + u32 nested_vmx_procbased_ctls_low; u32 nested_vmx_procbased_ctls_high; u32 nested_vmx_true_procbased_ctls_low; @@ -440,14 +448,33 @@ struct nested_vmx { u32 nested_vmx_misc_low; u32 nested_vmx_misc_high; u32 nested_vmx_ept_caps; + u32 nested_vmx_vpid_caps; }; #define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + /* Posted-Interrupt Descriptor */ struct pi_desc { u32 pir[8]; /* Posted interrupt requested */ - u32 control; /* bit 0 of control is outstanding notification bit */ - u32 rsvd[7]; + union { + struct { + /* bit 256 - Outstanding Notification */ + u16 on : 1, + /* bit 257 - Suppress Notification */ + sn : 1, + /* bit 271:258 - Reserved */ + rsvd_1 : 14; + /* bit 279:272 - Notification Vector */ + u8 nv; + /* bit 287:280 - Reserved */ + u8 rsvd_2; + /* bit 319:288 - Notification Destination */ + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; } __aligned(64); static bool pi_test_and_set_on(struct pi_desc *pi_desc) @@ -467,6 +494,30 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + return clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + return set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -532,8 +583,6 @@ struct vcpu_vmx { s64 vnmi_blocked_time; u32 exit_reason; - bool rdtscp_enabled; - /* Posted interrupt descriptor */ struct pi_desc pi_desc; @@ -563,6 +612,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } +static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +{ + return &(to_vmx(vcpu)->pi_desc); +} + #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) #define FIELD(number, name) [number] = VMCS12_OFFSET(name) #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ @@ -809,7 +863,6 @@ static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); -static int vmx_vm_has_apicv(struct kvm *kvm); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -817,7 +870,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); -static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); static int alloc_identity_pagetable(struct kvm *kvm); @@ -831,6 +883,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DEFINE_PER_CPU(struct desc_ptr, host_gdt); +/* + * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we + * can find which vCPU should be waken up. + */ +static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); + static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; static unsigned long *vmx_msr_bitmap_legacy; @@ -946,9 +1005,9 @@ static inline bool cpu_has_vmx_tpr_shadow(void) return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; } -static inline bool vm_need_tpr_shadow(struct kvm *kvm) +static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu) { - return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); + return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu); } static inline bool cpu_has_secondary_exec_ctrls(void) @@ -983,7 +1042,8 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) static inline bool cpu_has_vmx_posted_intr(void) { - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; + return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && + vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; } static inline bool cpu_has_vmx_apicv(void) @@ -1062,9 +1122,9 @@ static inline bool cpu_has_vmx_ple(void) SECONDARY_EXEC_PAUSE_LOOP_EXITING; } -static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm) +static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) { - return flexpriority_enabled && irqchip_in_kernel(kvm); + return flexpriority_enabled && lapic_in_kernel(vcpu); } static inline bool cpu_has_vmx_vpid(void) @@ -1113,6 +1173,12 @@ static inline bool cpu_has_vmx_pml(void) return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; } +static inline bool cpu_has_vmx_tsc_scaling(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_TSC_SCALING; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -1157,6 +1223,11 @@ static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); } +static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID); +} + static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); @@ -1337,13 +1408,13 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) __loaded_vmcs_clear, loaded_vmcs, 1); } -static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) +static inline void vpid_sync_vcpu_single(int vpid) { - if (vmx->vpid == 0) + if (vpid == 0) return; if (cpu_has_vmx_invvpid_single()) - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); } static inline void vpid_sync_vcpu_global(void) @@ -1352,10 +1423,10 @@ static inline void vpid_sync_vcpu_global(void) __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); } -static inline void vpid_sync_context(struct vcpu_vmx *vmx) +static inline void vpid_sync_context(int vpid) { if (cpu_has_vmx_invvpid_single()) - vpid_sync_vcpu_single(vmx); + vpid_sync_vcpu_single(vpid); else vpid_sync_vcpu_global(); } @@ -1376,7 +1447,51 @@ static inline void ept_sync_context(u64 eptp) } } -static __always_inline unsigned long vmcs_readl(unsigned long field) +static __always_inline void vmcs_check16(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, + "16-bit accessor invalid for 64-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "16-bit accessor invalid for 64-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, + "16-bit accessor invalid for 32-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, + "16-bit accessor invalid for natural width field"); +} + +static __always_inline void vmcs_check32(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, + "32-bit accessor invalid for 16-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, + "32-bit accessor invalid for natural width field"); +} + +static __always_inline void vmcs_check64(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, + "64-bit accessor invalid for 16-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "64-bit accessor invalid for 64-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, + "64-bit accessor invalid for 32-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, + "64-bit accessor invalid for natural width field"); +} + +static __always_inline void vmcs_checkl(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, + "Natural width accessor invalid for 16-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, + "Natural width accessor invalid for 64-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "Natural width accessor invalid for 64-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, + "Natural width accessor invalid for 32-bit field"); +} + +static __always_inline unsigned long __vmcs_readl(unsigned long field) { unsigned long value; @@ -1387,23 +1502,32 @@ static __always_inline unsigned long vmcs_readl(unsigned long field) static __always_inline u16 vmcs_read16(unsigned long field) { - return vmcs_readl(field); + vmcs_check16(field); + return __vmcs_readl(field); } static __always_inline u32 vmcs_read32(unsigned long field) { - return vmcs_readl(field); + vmcs_check32(field); + return __vmcs_readl(field); } static __always_inline u64 vmcs_read64(unsigned long field) { + vmcs_check64(field); #ifdef CONFIG_X86_64 - return vmcs_readl(field); + return __vmcs_readl(field); #else - return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32); + return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32); #endif } +static __always_inline unsigned long vmcs_readl(unsigned long field) +{ + vmcs_checkl(field); + return __vmcs_readl(field); +} + static noinline void vmwrite_error(unsigned long field, unsigned long value) { printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", @@ -1411,7 +1535,7 @@ static noinline void vmwrite_error(unsigned long field, unsigned long value) dump_stack(); } -static void vmcs_writel(unsigned long field, unsigned long value) +static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) { u8 error; @@ -1421,33 +1545,46 @@ static void vmcs_writel(unsigned long field, unsigned long value) vmwrite_error(field, value); } -static void vmcs_write16(unsigned long field, u16 value) +static __always_inline void vmcs_write16(unsigned long field, u16 value) { - vmcs_writel(field, value); + vmcs_check16(field); + __vmcs_writel(field, value); } -static void vmcs_write32(unsigned long field, u32 value) +static __always_inline void vmcs_write32(unsigned long field, u32 value) { - vmcs_writel(field, value); + vmcs_check32(field); + __vmcs_writel(field, value); } -static void vmcs_write64(unsigned long field, u64 value) +static __always_inline void vmcs_write64(unsigned long field, u64 value) { - vmcs_writel(field, value); + vmcs_check64(field); + __vmcs_writel(field, value); #ifndef CONFIG_X86_64 asm volatile (""); - vmcs_writel(field+1, value >> 32); + __vmcs_writel(field+1, value >> 32); #endif } -static void vmcs_clear_bits(unsigned long field, u32 mask) +static __always_inline void vmcs_writel(unsigned long field, unsigned long value) +{ + vmcs_checkl(field); + __vmcs_writel(field, value); +} + +static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) { - vmcs_writel(field, vmcs_readl(field) & ~mask); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, + "vmcs_clear_bits does not support 64-bit fields"); + __vmcs_writel(field, __vmcs_readl(field) & ~mask); } -static void vmcs_set_bits(unsigned long field, u32 mask) +static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) { - vmcs_writel(field, vmcs_readl(field) | mask); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, + "vmcs_set_bits does not support 64-bit fields"); + __vmcs_writel(field, __vmcs_readl(field) | mask); } static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) @@ -1567,7 +1704,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) u32 eb; eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | - (1u << NM_VECTOR) | (1u << DB_VECTOR); + (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); if ((vcpu->guest_debug & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) @@ -1895,6 +2032,52 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) preempt_enable(); } +static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + do { + old.control = new.control = pi_desc->control; + + /* + * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there + * are two possible cases: + * 1. After running 'pre_block', context switch + * happened. For this case, 'sn' was set in + * vmx_vcpu_put(), so we need to clear it here. + * 2. After running 'pre_block', we were blocked, + * and woken up by some other guy. For this case, + * we don't need to do anything, 'pi_post_block' + * will do everything for us. However, we cannot + * check whether it is case #1 or case #2 here + * (maybe, not needed), so we also clear sn here, + * I think it is not a big deal. + */ + if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { + if (vcpu->cpu != cpu) { + dest = cpu_physical_id(cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + } + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); +} /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -1943,12 +2126,35 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + + /* Setup TSC multiplier */ + if (cpu_has_vmx_tsc_scaling()) + vmcs_write64(TSC_MULTIPLIER, + vcpu->arch.tsc_scaling_ratio); + vmx->loaded_vmcs->cpu = cpu; } + + vmx_vcpu_pi_load(vcpu, cpu); +} + +static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + /* Set SN when the vCPU is preempted */ + if (vcpu->preempted) + pi_set_sn(pi_desc); } static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { + vmx_vcpu_pi_put(vcpu); + __vmx_load_host_state(to_vmx(vcpu)); if (!vmm_exclusive) { __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); @@ -2207,7 +2413,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && vmx->rdtscp_enabled) + if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu)) move_msr_up(vmx, index, save_nmsrs++); /* * MSR_STAR is only needed on long mode guests, and only @@ -2230,15 +2436,16 @@ static void setup_msrs(struct vcpu_vmx *vmx) /* * reads and returns guest's timestamp counter "register" - * guest_tsc = host_tsc + tsc_offset -- 21.3 + * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset + * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3 */ -static u64 guest_read_tsc(void) +static u64 guest_read_tsc(struct kvm_vcpu *vcpu) { u64 host_tsc, tsc_offset; host_tsc = rdtsc(); tsc_offset = vmcs_read64(TSC_OFFSET); - return host_tsc + tsc_offset; + return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset; } /* @@ -2255,22 +2462,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) return host_tsc + tsc_offset; } -/* - * Engage any workarounds for mis-matched TSC rates. Currently limited to - * software catchup for faster rates on slower CPUs. - */ -static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) -{ - if (!scale) - return; - - if (user_tsc_khz > tsc_khz) { - vcpu->arch.tsc_catchup = 1; - vcpu->arch.tsc_always_catchup = 1; - } else - WARN(1, "user requested TSC rate below hardware speed\n"); -} - static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) { return vmcs_read64(TSC_OFFSET); @@ -2302,7 +2493,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) } } -static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) +static void vmx_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { u64 offset = vmcs_read64(TSC_OFFSET); @@ -2315,11 +2506,6 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho offset + adjustment); } -static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) -{ - return target_tsc - rdtsc(); -} - static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); @@ -2377,7 +2563,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - if (vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (kvm_vcpu_apicv_active(&vmx->vcpu)) vmx->nested.nested_vmx_pinbased_ctls_high |= PIN_BASED_POSTED_INTR; @@ -2471,10 +2657,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_PCOMMIT; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ @@ -2493,6 +2681,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; + if (enable_vpid) + vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; + else + vmx->nested.nested_vmx_vpid_caps = 0; + if (enable_unrestricted_guest) vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_UNRESTRICTED_GUEST; @@ -2608,7 +2802,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) break; case MSR_IA32_VMX_EPT_VPID_CAP: /* Currently, no nested vpid support */ - *pdata = vmx->nested.nested_vmx_ept_caps; + *pdata = vmx->nested.nested_vmx_ept_caps | + ((u64)vmx->nested.nested_vmx_vpid_caps << 32); break; default: return 1; @@ -2642,7 +2837,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_EFER: return kvm_get_msr_common(vcpu, msr_info); case MSR_IA32_TSC: - msr_info->data = guest_read_tsc(); + msr_info->data = guest_read_tsc(vcpu); break; case MSR_IA32_SYSENTER_CS: msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); @@ -2673,7 +2868,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.ia32_xss; break; case MSR_TSC_AUX: - if (!to_vmx(vcpu)->rdtscp_enabled) + if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated) return 1; /* Otherwise falls through */ default: @@ -2779,7 +2974,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) clear_atomic_switch_msr(vmx, MSR_IA32_XSS); break; case MSR_TSC_AUX: - if (!vmx->rdtscp_enabled) + if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated) return 1; /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) @@ -2874,6 +3069,8 @@ static int hardware_enable(void) return -EBUSY; INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); + INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); + spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); /* * Now we can enable the vmclear operation in kdump @@ -3015,7 +3212,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_ENABLE_PML; + SECONDARY_EXEC_ENABLE_PML | + SECONDARY_EXEC_PCOMMIT | + SECONDARY_EXEC_TSC_SCALING; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -3441,9 +3640,9 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) { - vpid_sync_context(to_vmx(vcpu)); + vpid_sync_context(vpid); if (enable_ept) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; @@ -3451,6 +3650,11 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) } } +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid); +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -3644,20 +3848,21 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; - /* - * SMEP/SMAP is disabled if CPU is in non-paging mode - * in hardware. However KVM always uses paging mode to - * emulate guest non-paging mode with TDP. - * To emulate this behavior, SMEP/SMAP needs to be - * manually disabled when guest switches to non-paging - * mode. - */ - hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } } + if (!enable_unrestricted_guest && !is_paging(vcpu)) + /* + * SMEP/SMAP is disabled if CPU is in non-paging mode in + * hardware. However KVM always uses paging mode without + * unrestricted guest. + * To emulate this behavior, SMEP/SMAP needs to be manually + * disabled when guest switches to non-paging mode. + */ + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); + vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); return 0; @@ -4046,7 +4251,7 @@ out: static int init_rmode_identity_map(struct kvm *kvm) { int i, idx, r = 0; - pfn_t identity_map_pfn; + kvm_pfn_t identity_map_pfn; u32 tmp; if (!enable_ept) @@ -4146,29 +4351,28 @@ static int alloc_identity_pagetable(struct kvm *kvm) return r; } -static void allocate_vpid(struct vcpu_vmx *vmx) +static int allocate_vpid(void) { int vpid; - vmx->vpid = 0; if (!enable_vpid) - return; + return 0; spin_lock(&vmx_vpid_lock); vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); - if (vpid < VMX_NR_VPIDS) { - vmx->vpid = vpid; + if (vpid < VMX_NR_VPIDS) __set_bit(vpid, vmx_vpid_bitmap); - } + else + vpid = 0; spin_unlock(&vmx_vpid_lock); + return vpid; } -static void free_vpid(struct vcpu_vmx *vmx) +static void free_vpid(int vpid) { - if (!enable_vpid) + if (!enable_vpid || vpid == 0) return; spin_lock(&vmx_vpid_lock); - if (vmx->vpid != 0) - __clear_bit(vmx->vpid, vmx_vpid_bitmap); + __clear_bit(vpid, vmx_vpid_bitmap); spin_unlock(&vmx_vpid_lock); } @@ -4323,9 +4527,9 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr) msr, MSR_TYPE_W); } -static int vmx_vm_has_apicv(struct kvm *kvm) +static bool vmx_get_enable_apicv(void) { - return enable_apicv && irqchip_in_kernel(kvm); + return enable_apicv; } static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) @@ -4369,6 +4573,22 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) { #ifdef CONFIG_SMP if (vcpu->mode == IN_GUEST_MODE) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Currently, we don't support urgent interrupt, + * all interrupts are recognized as non-urgent + * interrupt, so we cannot post interrupts when + * 'SN' is set. + * + * If the vcpu is in guest mode, it means it is + * running instead of being scheduled out and + * waiting in the run queue, and that's the only + * case when 'SN' is set currently, warning if + * 'SN' is set. + */ + WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), POSTED_INTR_VECTOR); return true; @@ -4431,11 +4651,6 @@ static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); } -static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu) -{ - return; -} - /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. @@ -4505,11 +4720,18 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; - if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (!kvm_vcpu_apicv_active(&vmx->vcpu)) pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; return pin_based_exec_ctrl; } +static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); +} + static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; @@ -4517,7 +4739,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) exec_control &= ~CPU_BASED_MOV_DR_EXITING; - if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { + if (!cpu_need_tpr_shadow(&vmx->vcpu)) { exec_control &= ~CPU_BASED_TPR_SHADOW; #ifdef CONFIG_X86_64 exec_control |= CPU_BASED_CR8_STORE_EXITING | @@ -4534,7 +4756,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; @@ -4548,7 +4770,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (!ple_gap) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (!kvm_vcpu_apicv_active(&vmx->vcpu)) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; @@ -4558,8 +4780,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) a current VMCS12 */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - /* PML is enabled/disabled in creating/destorying vcpu */ - exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + + if (!enable_pml) + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + + /* Currently, we allow L1 guest to directly run pcommit instruction. */ + exec_control &= ~SECONDARY_EXEC_PCOMMIT; return exec_control; } @@ -4604,12 +4830,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); - if (cpu_has_secondary_exec_ctrls()) { + if (cpu_has_secondary_exec_ctrls()) vmcs_write32(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control(vmx)); - } - if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { + if (kvm_vcpu_apicv_active(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); @@ -4617,7 +4842,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(GUEST_INTR_STATUS, 0); - vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } @@ -4709,7 +4934,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) seg_setup(VCPU_SREG_CS); vmcs_write16(GUEST_CS_SELECTOR, 0xf000); - vmcs_write32(GUEST_CS_BASE, 0xffff0000); + vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); seg_setup(VCPU_SREG_DS); seg_setup(VCPU_SREG_ES); @@ -4745,7 +4970,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); - vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); setup_msrs(vmx); @@ -4753,7 +4978,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (cpu_has_vmx_tpr_shadow() && !init_event) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (vm_need_tpr_shadow(vcpu->kvm)) + if (cpu_need_tpr_shadow(vcpu)) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, __pa(vcpu->arch.apic->regs)); vmcs_write32(TPR_THRESHOLD, 0); @@ -4761,7 +4986,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (vmx_vm_has_apicv(vcpu->kvm)) + if (kvm_vcpu_apicv_active(vcpu)) memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); if (vmx->vpid != 0) @@ -4771,12 +4996,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx->vcpu.arch.cr0 = cr0; vmx_set_cr4(vcpu, 0); - if (!init_event) - vmx_set_efer(vcpu, 0); + vmx_set_efer(vcpu, 0); vmx_fpu_activate(vcpu); update_exception_bitmap(vcpu); - vpid_sync_context(vmx); + vpid_sync_context(vmx->vpid); } /* @@ -5104,6 +5328,9 @@ static int handle_exception(struct kvm_vcpu *vcpu) return handle_rmode_exception(vcpu, ex_no, error_code); switch (ex_no) { + case AC_VECTOR: + kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); + return 1; case DB_VECTOR: dr6 = vmcs_readl(EXIT_QUALIFICATION); if (!(vcpu->guest_debug & @@ -5296,7 +5523,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); kvm_complete_insn_gp(vcpu, err); - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) return 1; if (cr8_prev <= cr8) return 1; @@ -5510,17 +5737,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); ++vcpu->stat.irq_window_exits; - - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (!irqchip_in_kernel(vcpu->kvm) && - vcpu->run->request_interrupt_window && - !kvm_cpu_has_interrupt(vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - return 0; - } return 1; } @@ -5753,10 +5969,11 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { skip_emulated_instruction(vcpu); + trace_kvm_fast_mmio(gpa); return 1; } - ret = handle_mmio_page_fault_common(vcpu, gpa, true); + ret = handle_mmio_page_fault(vcpu, gpa, true); if (likely(ret == RET_MMIO_PF_EMULATE)) return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == EMULATE_DONE; @@ -5910,6 +6127,25 @@ static void update_ple_window_actual_max(void) ple_window_grow, INT_MIN); } +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. + */ +static void wakeup_handler(void) +{ + struct kvm_vcpu *vcpu; + int cpu = smp_processor_id(); + + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), + blocked_vcpu_list) { + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (pi_test_on(pi_desc) == 1) + kvm_vcpu_kick(vcpu); + } + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); +} + static __init int hardware_setup(void) { int r = -ENOMEM, i, msr; @@ -6028,13 +6264,10 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_apicv()) enable_apicv = 0; - if (enable_apicv) - kvm_x86_ops->update_cr8_intercept = NULL; - else { - kvm_x86_ops->hwapic_irr_update = NULL; - kvm_x86_ops->hwapic_isr_update = NULL; - kvm_x86_ops->deliver_posted_interrupt = NULL; - kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; + if (cpu_has_vmx_tsc_scaling()) { + kvm_has_tsc_control = true; + kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; + kvm_tsc_scaling_ratio_frac_bits = 48; } vmx_disable_intercept_for_msr(MSR_FS_BASE, false); @@ -6096,6 +6329,8 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } + kvm_set_posted_intr_wakeup_handler(wakeup_handler); + return alloc_kvm_area(); out8: @@ -6627,7 +6862,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { - u32 exec_control; if (vmx->nested.current_vmptr == -1ull) return; @@ -6640,9 +6874,8 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) they were modified */ copy_shadow_to_vmcs12(vmx); vmx->nested.sync_shadow_vmcs = false; - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, -1ull); } vmx->nested.posted_intr_nv = -1; @@ -6662,6 +6895,7 @@ static void free_nested(struct vcpu_vmx *vmx) return; vmx->nested.vmxon = false; + free_vpid(vmx->nested.vpid02); nested_release_vmcs12(vmx); if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); @@ -7038,7 +7272,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); gpa_t vmptr; - u32 exec_control; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -7070,9 +7303,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; if (enable_shadow_vmcs) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, __pa(vmx->nested.current_shadow_vmcs)); vmx->nested.sync_shadow_vmcs = true; @@ -7178,7 +7410,58 @@ static int handle_invept(struct kvm_vcpu *vcpu) static int handle_invvpid(struct kvm_vcpu *vcpu) { - kvm_queue_exception(vcpu, UD_VECTOR); + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vmx_instruction_info; + unsigned long type, types; + gva_t gva; + struct x86_exception e; + int vpid; + + if (!(vmx->nested.nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_ENABLE_VPID) || + !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; + + if (!(types & (1UL << type))) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + + /* according to the intel vmx instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid, + sizeof(u32), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (type) { + case VMX_VPID_EXTENT_ALL_CONTEXT: + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); + nested_vmx_succeed(vcpu); + break; + default: + /* Trap single context invalidation invvpid calls */ + BUG_ON(1); + break; + } + + skip_emulated_instruction(vcpu); return 1; } @@ -7207,6 +7490,13 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } +static int handle_pcommit(struct kvm_vcpu *vcpu) +{ + /* we never catch pcommit instruct for L1 guest. */ + WARN_ON(1); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -7257,6 +7547,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_PCOMMIT] = handle_pcommit, }; static const int kvm_vmx_max_exit_handlers = @@ -7558,6 +7849,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * the XSS exit bitmap in vmcs12. */ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + case EXIT_REASON_PCOMMIT: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT); default: return true; } @@ -7569,10 +7862,9 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } -static int vmx_enable_pml(struct vcpu_vmx *vmx) +static int vmx_create_pml_buffer(struct vcpu_vmx *vmx) { struct page *pml_pg; - u32 exec_control; pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!pml_pg) @@ -7583,24 +7875,15 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_ENABLE_PML; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); - return 0; } -static void vmx_disable_pml(struct vcpu_vmx *vmx) +static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) { - u32 exec_control; - - ASSERT(vmx->pml_pg); - __free_page(vmx->pml_pg); - vmx->pml_pg = NULL; - - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + if (vmx->pml_pg) { + __free_page(vmx->pml_pg); + vmx->pml_pg = NULL; + } } static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) @@ -7676,7 +7959,7 @@ static void dump_vmcs(void) u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); u32 secondary_exec_control = 0; unsigned long cr4 = vmcs_readl(GUEST_CR4); - u64 efer = vmcs_readl(GUEST_IA32_EFER); + u64 efer = vmcs_read64(GUEST_IA32_EFER); int i, n; if (cpu_has_secondary_exec_ctrls()) @@ -7692,10 +7975,10 @@ static void dump_vmcs(void) if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) { - pr_err("PDPTR0 = 0x%016lx PDPTR1 = 0x%016lx\n", - vmcs_readl(GUEST_PDPTR0), vmcs_readl(GUEST_PDPTR1)); - pr_err("PDPTR2 = 0x%016lx PDPTR3 = 0x%016lx\n", - vmcs_readl(GUEST_PDPTR2), vmcs_readl(GUEST_PDPTR3)); + pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", + vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); + pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", + vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); } pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); @@ -7716,16 +7999,16 @@ static void dump_vmcs(void) vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) - pr_err("EFER = 0x%016llx PAT = 0x%016lx\n", - efer, vmcs_readl(GUEST_IA32_PAT)); - pr_err("DebugCtl = 0x%016lx DebugExceptions = 0x%016lx\n", - vmcs_readl(GUEST_IA32_DEBUGCTL), + pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", + efer, vmcs_read64(GUEST_IA32_PAT)); + pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", + vmcs_read64(GUEST_IA32_DEBUGCTL), vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) - pr_err("PerfGlobCtl = 0x%016lx\n", - vmcs_readl(GUEST_IA32_PERF_GLOBAL_CTRL)); + pr_err("PerfGlobCtl = 0x%016llx\n", + vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) - pr_err("BndCfgS = 0x%016lx\n", vmcs_readl(GUEST_BNDCFGS)); + pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); pr_err("Interruptibility = %08x ActivityState = %08x\n", vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), vmcs_read32(GUEST_ACTIVITY_STATE)); @@ -7754,11 +8037,12 @@ static void dump_vmcs(void) vmcs_read32(HOST_IA32_SYSENTER_CS), vmcs_readl(HOST_IA32_SYSENTER_EIP)); if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) - pr_err("EFER = 0x%016lx PAT = 0x%016lx\n", - vmcs_readl(HOST_IA32_EFER), vmcs_readl(HOST_IA32_PAT)); + pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", + vmcs_read64(HOST_IA32_EFER), + vmcs_read64(HOST_IA32_PAT)); if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) - pr_err("PerfGlobCtl = 0x%016lx\n", - vmcs_readl(HOST_IA32_PERF_GLOBAL_CTRL)); + pr_err("PerfGlobCtl = 0x%016llx\n", + vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); pr_err("*** Control State ***\n"); pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", @@ -7781,13 +8065,16 @@ static void dump_vmcs(void) pr_err("IDTVectoring: info=%08x errcode=%08x\n", vmcs_read32(IDT_VECTORING_INFO_FIELD), vmcs_read32(IDT_VECTORING_ERROR_CODE)); - pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET)); + pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); + if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) + pr_err("TSC Multiplier = 0x%016llx\n", + vmcs_read64(TSC_MULTIPLIER)); if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) - pr_err("EPT pointer = 0x%016lx\n", vmcs_readl(EPT_POINTER)); + pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); n = vmcs_read32(CR3_TARGET_COUNT); for (i = 0; i + 1 < n; i += 4) pr_err("CR3 target%u=%016lx target%u=%016lx\n", @@ -7814,6 +8101,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; + trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); + /* * Flush logged GPAs PML buffer, this will make dirty_bitmap more * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before @@ -7924,10 +8213,10 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) * apicv */ if (!cpu_has_vmx_virtualize_x2apic_mode() || - !vmx_vm_has_apicv(vcpu->kvm)) + !kvm_vcpu_apicv_active(vcpu)) return; - if (!vm_need_tpr_shadow(vcpu->kvm)) + if (!cpu_need_tpr_shadow(vcpu)) return; sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); @@ -8031,7 +8320,7 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { - if (!vmx_vm_has_apicv(vcpu->kvm)) + if (!kvm_vcpu_apicv_active(vcpu)) return; vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); @@ -8439,7 +8728,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); - trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); /* * the KVM_REQ_EVENT optimization bit is only on for one entry, and if @@ -8477,8 +8765,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (enable_pml) - vmx_disable_pml(vmx); - free_vpid(vmx); + vmx_destroy_pml_buffer(vmx); + free_vpid(vmx->vpid); leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); free_nested(vmx); @@ -8497,7 +8785,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); - allocate_vpid(vmx); + vmx->vpid = allocate_vpid(); err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) @@ -8530,7 +8818,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) put_cpu(); if (err) goto free_vmcs; - if (vm_need_virtualize_apic_accesses(kvm)) { + if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { err = alloc_apic_access_page(kvm); if (err) goto free_vmcs; @@ -8545,8 +8833,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } - if (nested) + if (nested) { nested_vmx_setup_ctls_msrs(vmx); + vmx->nested.vpid02 = allocate_vpid(); + } vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; @@ -8559,7 +8849,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) * for the guest, etc. */ if (enable_pml) { - err = vmx_enable_pml(vmx); + err = vmx_create_pml_buffer(vmx); if (err) goto free_vmcs; } @@ -8567,13 +8857,14 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return &vmx->vcpu; free_vmcs: + free_vpid(vmx->nested.vpid02); free_loaded_vmcs(vmx->loaded_vmcs); free_msrs: kfree(vmx->guest_msrs); uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: - free_vpid(vmx); + free_vpid(vmx->vpid); kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); } @@ -8648,49 +8939,68 @@ static int vmx_get_lpage_level(void) return PT_PDPE_LEVEL; } +static void vmcs_set_secondary_exec_control(u32 new_ctl) +{ + /* + * These bits in the secondary execution controls field + * are dynamic, the others are mostly based on the hypervisor + * architecture and the guest's CPUID. Do not touch the + * dynamic bits. + */ + u32 mask = + SECONDARY_EXEC_SHADOW_VMCS | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + + u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + (new_ctl & ~mask) | (cur_ctl & mask)); +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exec_control; + u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx); - vmx->rdtscp_enabled = false; if (vmx_rdtscp_supported()) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - if (exec_control & SECONDARY_EXEC_RDTSCP) { - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) - vmx->rdtscp_enabled = true; - else { - exec_control &= ~SECONDARY_EXEC_RDTSCP; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu); + if (!rdtscp_enabled) + secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP; + + if (nested) { + if (rdtscp_enabled) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_RDTSCP; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_RDTSCP; } - if (nested && !vmx->rdtscp_enabled) - vmx->nested.nested_vmx_secondary_ctls_high &= - ~SECONDARY_EXEC_RDTSCP; } /* Exposing INVPCID only when PCID is exposed */ best = kvm_find_cpuid_entry(vcpu, 0x7, 0); if (vmx_invpcid_supported() && - best && (best->ebx & bit(X86_FEATURE_INVPCID)) && - guest_cpuid_has_pcid(vcpu)) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } else { - if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) || + !guest_cpuid_has_pcid(vcpu))) { + secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID; + if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } + + if (cpu_has_secondary_exec_ctrls()) + vmcs_set_secondary_exec_control(secondary_exec_ctl); + + if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) { + if (guest_cpuid_has_pcommit(vcpu)) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_PCOMMIT; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_PCOMMIT; + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -9257,7 +9567,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) */ vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; vmx->nested.pi_pending = false; - vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); vmcs_write64(POSTED_INTR_DESC_ADDR, page_to_phys(vmx->nested.pi_desc_page) + (unsigned long)(vmcs12->posted_intr_desc_addr & @@ -9298,13 +9608,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (cpu_has_secondary_exec_ctrls()) { exec_control = vmx_secondary_exec_control(vmx); - if (!vmx->rdtscp_enabled) - exec_control &= ~SECONDARY_EXEC_RDTSCP; + /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_APIC_REGISTER_VIRT); + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_PCOMMIT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -9323,7 +9633,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->nested.apic_access_page)); } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && - (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) { + cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; kvm_vcpu_reload_apic_access_page(vcpu); @@ -9433,12 +9743,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (enable_vpid) { /* - * Trivially support vpid by letting L2s share their parent - * L1's vpid. TODO: move to a more elaborate solution, giving - * each L2 its own vpid and exposing the vpid feature to L1. + * There is no direct mapping between vpid02 and vpid12, the + * vpid02 is per-vCPU for L0 and reused while the value of + * vpid12 is changed w/ one invvpid during nested vmentry. + * The vpid12 is allocated by L1 for L2, so it will not + * influence global bitmap(for vpid01 and vpid02 allocation) + * even if spawn a lot of nested vCPUs. */ - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx_flush_tlb(vcpu); + if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) { + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); + if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { + vmx->nested.last_vpid = vmcs12->virtual_processor_id; + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); + } + } else { + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + vmx_flush_tlb(vcpu); + } + } if (nested_cpu_has_ept(vmcs12)) { @@ -9906,7 +10228,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * Additionally, restore L2's PDPTR to vmcs12. */ if (enable_ept) { - vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3); + vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); @@ -10278,6 +10600,201 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +/* + * This routine does the following things for vCPU which is going + * to be blocked if VT-d PI is enabled. + * - Store the vCPU to the wakeup list, so when interrupts happen + * we can find the right vCPU to wake up. + * - Change the Posted-interrupt descriptor as below: + * 'NDST' <-- vcpu->pre_pcpu + * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR + * - If 'ON' is set during this process, which means at least one + * interrupt is posted for this vCPU, we cannot block it, in + * this case, return 1, otherwise, return 0. + * + */ +static int vmx_pre_block(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + unsigned int dest; + struct pi_desc old, new; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + vcpu->pre_pcpu = vcpu->cpu; + spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + + do { + old.control = new.control = pi_desc->control; + + /* + * We should not block the vCPU if + * an interrupt is posted for it. + */ + if (pi_test_on(pi_desc) == 1) { + spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + + return 1; + } + + WARN((pi_desc->sn == 1), + "Warning: SN field of posted-interrupts " + "is set before blocking\n"); + + /* + * Since vCPU can be preempted during this process, + * vcpu->cpu could be different with pre_pcpu, we + * need to set pre_pcpu as the destination of wakeup + * notification event, then we can find the right vCPU + * to wakeup in wakeup handler if interrupts happen + * when the vCPU is in blocked state. + */ + dest = cpu_physical_id(vcpu->pre_pcpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'wakeup vector' */ + new.nv = POSTED_INTR_WAKEUP_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + return 0; +} + +static void vmx_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + unsigned long flags; + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + do { + old.control = new.control = pi_desc->control; + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + if(vcpu->pre_pcpu != -1) { + spin_lock_irqsave( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + } +} + +/* + * vmx_update_pi_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu; + struct vcpu_data vcpu_info; + int idx, ret = -EINVAL; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + BUG_ON(guest_irq >= irq_rt->nr_rt_entries); + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + /* + * VT-d PI cannot support posting multicast/broadcast + * interrupts to a vCPU, we still use interrupt remapping + * for these kind of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. + * + * We will support full lowest-priority interrupt later. + */ + + kvm_set_msi_irq(e, &irq); + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) + continue; + + vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); + vcpu_info.vector = irq.vector; + + trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi, + vcpu_info.vector, vcpu_info.pi_desc_addr, set); + + if (set) + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); + else { + /* suppress notification event before unposting */ + pi_set_sn(vcpu_to_pi_desc(vcpu)); + ret = irq_set_vcpu_affinity(host_irq, NULL); + pi_clear_sn(vcpu_to_pi_desc(vcpu)); + } + + if (ret < 0) { + printk(KERN_INFO "%s: failed to update PI IRTE\n", + __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -10297,7 +10814,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, - .update_db_bp_intercept = update_exception_bitmap, + .update_bp_intercept = update_exception_bitmap, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, .get_segment_base = vmx_get_segment_base, @@ -10347,7 +10864,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .vm_has_apicv = vmx_vm_has_apicv, + .get_enable_apicv = vmx_get_enable_apicv, + .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, @@ -10371,11 +10889,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - .set_tsc_khz = vmx_set_tsc_khz, .read_tsc_offset = vmx_read_tsc_offset, .write_tsc_offset = vmx_write_tsc_offset, - .adjust_tsc_offset = vmx_adjust_tsc_offset, - .compute_tsc_offset = vmx_compute_tsc_offset, + .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest, .read_l1_tsc = vmx_read_l1_tsc, .set_tdp_cr3 = vmx_set_cr3, @@ -10394,7 +10910,12 @@ static struct kvm_x86_ops vmx_x86_ops = { .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + .pre_block = vmx_pre_block, + .post_block = vmx_post_block, + .pmu_ops = &intel_pmu_ops, + + .update_pi_irte = vmx_update_pi_irte, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9a9a19830321..4244c2baf57d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -51,6 +51,8 @@ #include <linux/pci.h> #include <linux/timekeeper_internal.h> #include <linux/pvclock_gtod.h> +#include <linux/kvm_irqfd.h> +#include <linux/irqbypass.h> #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS @@ -64,6 +66,7 @@ #include <asm/fpu/internal.h> /* Ugh! */ #include <asm/pvclock.h> #include <asm/div64.h> +#include <asm/irq_remapping.h> #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 @@ -90,10 +93,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); -struct kvm_x86_ops *kvm_x86_ops; +struct kvm_x86_ops *kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); -static bool ignore_msrs = 0; +static bool __read_mostly ignore_msrs = 0; module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); unsigned int min_timer_period_us = 500; @@ -102,20 +105,25 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); static bool __read_mostly kvmclock_periodic_sync = true; module_param(kvmclock_periodic_sync, bool, S_IRUGO); -bool kvm_has_tsc_control; +bool __read_mostly kvm_has_tsc_control; EXPORT_SYMBOL_GPL(kvm_has_tsc_control); -u32 kvm_max_guest_tsc_khz; +u32 __read_mostly kvm_max_guest_tsc_khz; EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); +u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; +EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); +u64 __read_mostly kvm_max_tsc_scaling_ratio; +EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); +static u64 __read_mostly kvm_default_tsc_scaling_ratio; /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ -static u32 tsc_tolerance_ppm = 250; +static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); /* lapic timer advance (tscdeadline mode only) in nanoseconds */ -unsigned int lapic_timer_advance_ns = 0; +unsigned int __read_mostly lapic_timer_advance_ns = 0; module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); -static bool backwards_tsc_observed = false; +static bool __read_mostly backwards_tsc_observed = false; #define KVM_NR_SHARED_MSRS 16 @@ -622,7 +630,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if ((cr0 ^ old_cr0) & update_bits) kvm_mmu_reset_context(vcpu); - if ((cr0 ^ old_cr0) & X86_CR0_CD) + if (((cr0 ^ old_cr0) & X86_CR0_CD) && + kvm_arch_has_noncoherent_dma(vcpu->kvm) && + !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); return 0; @@ -663,9 +673,9 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ if (index != XCR_XFEATURE_ENABLED_MASK) return 1; - if (!(xcr0 & XSTATE_FP)) + if (!(xcr0 & XFEATURE_MASK_FP)) return 1; - if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) + if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE)) return 1; /* @@ -673,23 +683,24 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) * saving. However, xcr0 bit 0 is always set, even if the * emulated CPU does not support XSAVE (see fx_init). */ - valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP; + valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP; if (xcr0 & ~valid_bits) return 1; - if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR))) + if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) != + (!(xcr0 & XFEATURE_MASK_BNDCSR))) return 1; - if (xcr0 & XSTATE_AVX512) { - if (!(xcr0 & XSTATE_YMM)) + if (xcr0 & XFEATURE_MASK_AVX512) { + if (!(xcr0 & XFEATURE_MASK_YMM)) return 1; - if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512) + if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) return 1; } kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; - if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK) + if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) kvm_update_cpuid(vcpu); return 0; } @@ -788,7 +799,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) return 1; - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) kvm_lapic_set_tpr(vcpu, cr8); else vcpu->arch.cr8 = cr8; @@ -798,7 +809,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) { - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) return kvm_lapic_get_cr8(vcpu); else return vcpu->arch.cr8; @@ -940,7 +951,7 @@ static u32 msrs_to_save[] = { MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, }; static unsigned num_msrs_to_save; @@ -952,6 +963,11 @@ static u32 emulated_msrs[] = { HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, + HV_X64_MSR_RESET, + HV_X64_MSR_VP_INDEX, + HV_X64_MSR_VP_RUNTIME, + HV_X64_MSR_SCONTROL, + HV_X64_MSR_STIMER0_CONFIG, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, @@ -1153,7 +1169,8 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) ++version; - kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); + if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version))) + return; /* * The guest calculates current wall clock time by adding @@ -1240,14 +1257,53 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm) return v; } -static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) +static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) +{ + u64 ratio; + + /* Guest TSC same frequency as host TSC? */ + if (!scale) { + vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; + return 0; + } + + /* TSC scaling supported? */ + if (!kvm_has_tsc_control) { + if (user_tsc_khz > tsc_khz) { + vcpu->arch.tsc_catchup = 1; + vcpu->arch.tsc_always_catchup = 1; + return 0; + } else { + WARN(1, "user requested TSC rate below hardware speed\n"); + return -1; + } + } + + /* TSC scaling required - calculate ratio */ + ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, + user_tsc_khz, tsc_khz); + + if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { + WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", + user_tsc_khz); + return -1; + } + + vcpu->arch.tsc_scaling_ratio = ratio; + return 0; +} + +static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) { u32 thresh_lo, thresh_hi; int use_scaling = 0; /* tsc_khz can be zero if TSC calibration fails */ - if (this_tsc_khz == 0) - return; + if (this_tsc_khz == 0) { + /* set tsc_scaling_ratio to a safe value */ + vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; + return -1; + } /* Compute a scale to convert nanoseconds in TSC cycles */ kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, @@ -1267,7 +1323,7 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); use_scaling = 1; } - kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling); + return set_tsc_khz(vcpu, this_tsc_khz, use_scaling); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) @@ -1313,6 +1369,48 @@ static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } +/* + * Multiply tsc by a fixed point number represented by ratio. + * + * The most significant 64-N bits (mult) of ratio represent the + * integral part of the fixed point number; the remaining N bits + * (frac) represent the fractional part, ie. ratio represents a fixed + * point number (mult + frac * 2^(-N)). + * + * N equals to kvm_tsc_scaling_ratio_frac_bits. + */ +static inline u64 __scale_tsc(u64 ratio, u64 tsc) +{ + return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); +} + +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) +{ + u64 _tsc = tsc; + u64 ratio = vcpu->arch.tsc_scaling_ratio; + + if (ratio != kvm_default_tsc_scaling_ratio) + _tsc = __scale_tsc(ratio, tsc); + + return _tsc; +} +EXPORT_SYMBOL_GPL(kvm_scale_tsc); + +static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +{ + u64 tsc; + + tsc = kvm_scale_tsc(vcpu, rdtsc()); + + return target_tsc - tsc; +} + +u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) +{ + return kvm_x86_ops->read_l1_tsc(vcpu, kvm_scale_tsc(vcpu, host_tsc)); +} +EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); + void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; @@ -1324,7 +1422,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) u64 data = msr->data; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); - offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); + offset = kvm_compute_tsc_offset(vcpu, data); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; @@ -1381,7 +1479,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) } else { u64 delta = nsec_to_cycles(vcpu, elapsed); data += delta; - offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); + offset = kvm_compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } matched = true; @@ -1438,6 +1536,20 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) EXPORT_SYMBOL_GPL(kvm_write_tsc); +static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, + s64 adjustment) +{ + kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment); +} + +static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) +{ + if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) + WARN_ON(adjustment < 0); + adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); + kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment); +} + #ifdef CONFIG_X86_64 static cycle_t read_tsc(void) @@ -1574,6 +1686,11 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) #endif } +void kvm_make_mclock_inprogress_request(struct kvm *kvm) +{ + kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); +} + static void kvm_gen_update_masterclock(struct kvm *kvm) { #ifdef CONFIG_X86_64 @@ -1599,7 +1716,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) static int kvm_guest_time_update(struct kvm_vcpu *v) { - unsigned long flags, this_tsc_khz; + unsigned long flags, this_tsc_khz, tgt_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; @@ -1636,7 +1753,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) kernel_ns = get_kernel_ns(); } - tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc); + tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); /* * We may have to catch up the TSC to match elapsed wall clock @@ -1662,7 +1779,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) return 0; if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { - kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, + tgt_tsc_khz = kvm_has_tsc_control ? + vcpu->virtual_tsc_khz : this_tsc_khz; + kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz, &vcpu->hv_clock.tsc_shift, &vcpu->hv_clock.tsc_to_system_mul); vcpu->hw_tsc_khz = this_tsc_khz; @@ -1897,6 +2016,8 @@ static void accumulate_steal_time(struct kvm_vcpu *vcpu) static void record_steal_time(struct kvm_vcpu *vcpu) { + accumulate_steal_time(vcpu); + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -2047,12 +2168,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & KVM_MSR_ENABLED)) break; - vcpu->arch.st.last_steal = current->sched_info.run_delay; - - preempt_disable(); - accumulate_steal_time(vcpu); - preempt_enable(); - kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); break; @@ -2091,6 +2206,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: + case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: return kvm_hv_set_msr_common(vcpu, msr, data, msr_info->host_initiated); case MSR_IA32_BBL_CR_CTL3: @@ -2295,6 +2411,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: + case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: return kvm_hv_get_msr_common(vcpu, msr_info->index, &msr_info->data); break; @@ -2434,6 +2551,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV: case KVM_CAP_HYPERV_VAPIC: case KVM_CAP_HYPERV_SPIN: + case KVM_CAP_HYPERV_SYNIC: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -2448,6 +2566,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_DISABLE_QUIRKS: case KVM_CAP_SET_BOOT_CPU_ID: + case KVM_CAP_SPLIT_IRQCHIP: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -2585,6 +2704,11 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) return kvm_arch_has_noncoherent_dma(vcpu->kvm); } +static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) +{ + set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); +} + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { /* Address WBINVD may be executed by guest */ @@ -2611,7 +2735,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { - u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu, + u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; @@ -2627,7 +2751,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu->cpu = cpu; } - accumulate_steal_time(vcpu); kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); } @@ -2641,7 +2764,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - kvm_x86_ops->sync_pir_to_irr(vcpu); + if (vcpu->arch.apicv_active) + kvm_x86_ops->sync_pir_to_irr(vcpu); + memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); return 0; @@ -2656,17 +2781,50 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, return 0; } +static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu) +{ + return (!lapic_in_kernel(vcpu) || + kvm_apic_accept_pic_intr(vcpu)); +} + +/* + * if userspace requested an interrupt window, check that the + * interrupt window is open. + * + * No need to exit to userspace if we already have an interrupt queued. + */ +static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) +{ + return kvm_arch_interrupt_allowed(vcpu) && + !kvm_cpu_has_interrupt(vcpu) && + !kvm_event_needs_reinjection(vcpu) && + kvm_cpu_accept_dm_intr(vcpu); +} + static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { if (irq->irq >= KVM_NR_INTERRUPTS) return -EINVAL; - if (irqchip_in_kernel(vcpu->kvm)) + + if (!irqchip_in_kernel(vcpu->kvm)) { + kvm_queue_interrupt(vcpu, irq->irq, false); + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; + } + + /* + * With in-kernel LAPIC, we only use this to inject EXTINT, so + * fail for in-kernel 8259. + */ + if (pic_in_kernel(vcpu->kvm)) return -ENXIO; - kvm_queue_interrupt(vcpu, irq->irq, false); - kvm_make_request(KVM_REQ_EVENT, vcpu); + if (vcpu->arch.pending_external_vector != -1) + return -EEXIST; + vcpu->arch.pending_external_vector = irq->irq; + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } @@ -2905,7 +3063,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) * Copy each region from the possibly compacted offset to the * non-compacted offset. */ - valid = xstate_bv & ~XSTATE_FPSSE; + valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { u64 feature = valid & -valid; int index = fls64(feature) - 1; @@ -2943,7 +3101,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) * Copy each region from the non-compacted offset to the * possibly compacted offset. */ - valid = xstate_bv & ~XSTATE_FPSSE; + valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { u64 feature = valid & -valid; int index = fls64(feature) - 1; @@ -2971,7 +3129,7 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, &vcpu->arch.guest_fpu.state.fxsave, sizeof(struct fxregs_state)); *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = - XSTATE_FPSSE; + XFEATURE_MASK_FPSSE; } } @@ -2991,7 +3149,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, return -EINVAL; load_xsave(vcpu, (u8 *)guest_xsave->region); } else { - if (xstate_bv & ~XSTATE_FPSSE) + if (xstate_bv & ~XFEATURE_MASK_FPSSE) return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state.fxsave, guest_xsave->region, sizeof(struct fxregs_state)); @@ -3051,6 +3209,20 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) return 0; } +static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, + struct kvm_enable_cap *cap) +{ + if (cap->flags) + return -EINVAL; + + switch (cap->cap) { + case KVM_CAP_HYPERV_SYNIC: + return kvm_hv_activate_synic(vcpu); + default: + return -EINVAL; + } +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -3175,7 +3347,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_vapic_addr va; r = -EINVAL; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) goto out; r = -EFAULT; if (copy_from_user(&va, argp, sizeof va)) @@ -3302,9 +3474,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (user_tsc_khz == 0) user_tsc_khz = tsc_khz; - kvm_set_tsc_khz(vcpu, user_tsc_khz); + if (!kvm_set_tsc_khz(vcpu, user_tsc_khz)) + r = 0; - r = 0; goto out; } case KVM_GET_TSC_KHZ: { @@ -3315,6 +3487,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_set_guest_paused(vcpu); goto out; } + case KVM_ENABLE_CAP: { + struct kvm_enable_cap cap; + + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + goto out; + r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); + break; + } default: r = -EINVAL; } @@ -3424,41 +3605,38 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) { - int r = 0; - mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) { - int r = 0; - + int i; mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); - kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); + for (i = 0; i < 3; i++) + kvm_pit_load_count(kvm, i, ps->channels[i].count, 0); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) { - int r = 0; - mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, sizeof(ps->channels)); ps->flags = kvm->arch.vpit->pit_state.flags; mutex_unlock(&kvm->arch.vpit->pit_state.lock); memset(&ps->reserved, 0, sizeof(ps->reserved)); - return r; + return 0; } static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) { - int r = 0, start = 0; + int start = 0; + int i; u32 prev_legacy, cur_legacy; mutex_lock(&kvm->arch.vpit->pit_state.lock); prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; @@ -3468,9 +3646,11 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, sizeof(kvm->arch.vpit->pit_state.channels)); kvm->arch.vpit->pit_state.flags = ps->flags; - kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); + for (i = 0; i < 3; i++) + kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count, + start && i == 0); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_reinject(struct kvm *kvm, @@ -3555,6 +3735,28 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, kvm->arch.disabled_quirks = cap->args[0]; r = 0; break; + case KVM_CAP_SPLIT_IRQCHIP: { + mutex_lock(&kvm->lock); + r = -EINVAL; + if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS) + goto split_irqchip_unlock; + r = -EEXIST; + if (irqchip_in_kernel(kvm)) + goto split_irqchip_unlock; + if (atomic_read(&kvm->online_vcpus)) + goto split_irqchip_unlock; + r = kvm_setup_empty_irq_routing(kvm); + if (r) + goto split_irqchip_unlock; + /* Pairs with irqchip_in_kernel. */ + smp_wmb(); + kvm->arch.irqchip_split = true; + kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; + r = 0; +split_irqchip_unlock: + mutex_unlock(&kvm->lock); + break; + } default: r = -EINVAL; break; @@ -3668,7 +3870,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm)) + if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -3692,7 +3894,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm)) + if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) @@ -3845,16 +4047,17 @@ static void kvm_init_msr_list(void) /* * Even MSRs that are valid in the host may not be exposed - * to the guests in some cases. We could work around this - * in VMX with the generic MSR save/load machinery, but it - * is not really worthwhile since it will really only - * happen with nested virtualization. + * to the guests in some cases. */ switch (msrs_to_save[i]) { case MSR_IA32_BNDCFGS: if (!kvm_x86_ops->mpx_supported()) continue; break; + case MSR_TSC_AUX: + if (!kvm_x86_ops->rdtscp_supported()) + continue; + break; default: break; } @@ -4059,6 +4262,15 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); } +static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes); + + return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE; +} + int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, @@ -4794,6 +5006,7 @@ static const struct x86_emulate_ops emulate_ops = { .write_gpr = emulator_write_gpr, .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, + .read_phys = kvm_read_guest_phys_system, .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, @@ -4935,7 +5148,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, int emulation_type) { gpa_t gpa = cr2; - pfn_t pfn; + kvm_pfn_t pfn; if (emulation_type & EMULTYPE_NO_REEXECUTE) return false; @@ -5666,7 +5879,7 @@ void kvm_arch_exit(void) int kvm_vcpu_halt(struct kvm_vcpu *vcpu) { ++vcpu->stat.halt_exits; - if (irqchip_in_kernel(vcpu->kvm)) { + if (lapic_in_kernel(vcpu)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; return 1; } else { @@ -5701,6 +5914,12 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); } +void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) +{ + vcpu->arch.apicv_active = false; + kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; @@ -5765,17 +5984,10 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); } -/* - * Check if userspace requested an interrupt window, and that the - * interrupt window is open. - * - * No need to exit to userspace if we already have an interrupt queued. - */ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) { - return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && - vcpu->run->request_interrupt_window && - kvm_arch_interrupt_allowed(vcpu)); + return vcpu->run->request_interrupt_window && + likely(!pic_in_kernel(vcpu->kvm)); } static void post_kvm_run_save(struct kvm_vcpu *vcpu) @@ -5786,13 +5998,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_run->ready_for_interrupt_injection = 1; - else - kvm_run->ready_for_interrupt_injection = - kvm_arch_interrupt_allowed(vcpu) && - !kvm_cpu_has_interrupt(vcpu) && - !kvm_event_needs_reinjection(vcpu); + kvm_run->ready_for_interrupt_injection = + pic_in_kernel(vcpu->kvm) || + kvm_vcpu_ready_for_interrupt_injection(vcpu); } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -5805,6 +6013,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) if (!vcpu->arch.apic) return; + if (vcpu->arch.apicv_active) + return; + if (!vcpu->arch.apic->vapic_addr) max_irr = kvm_lapic_find_highest_irr(vcpu); else @@ -6141,20 +6352,30 @@ static void process_smi(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); } +void kvm_make_scan_ioapic_request(struct kvm *kvm) +{ + kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); +} + static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { u64 eoi_exit_bitmap[4]; - u32 tmr[8]; if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; - memset(eoi_exit_bitmap, 0, 32); - memset(tmr, 0, 32); + bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256); - kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr); + if (irqchip_split(vcpu->kvm)) + kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); + else { + if (vcpu->arch.apicv_active) + kvm_x86_ops->sync_pir_to_irr(vcpu); + kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); + } + bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, + vcpu_to_synic(vcpu)->vec_bitmap, 256); kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); - kvm_apic_update_tmr(vcpu, tmr); } static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) @@ -6167,7 +6388,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { struct page *page = NULL; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) return; if (!kvm_x86_ops->set_apic_access_page_addr) @@ -6205,8 +6426,10 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; - bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && - vcpu->run->request_interrupt_window; + bool req_int_win = + dm_request_for_irq_injection(vcpu) && + kvm_cpu_accept_dm_intr(vcpu); + bool req_immediate_exit = false; if (vcpu->requests) { @@ -6257,6 +6480,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_pmu_handle_event(vcpu); if (kvm_check_request(KVM_REQ_PMI, vcpu)) kvm_pmu_deliver_pmi(vcpu); + if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) { + BUG_ON(vcpu->arch.pending_ioapic_eoi > 255); + if (test_bit(vcpu->arch.pending_ioapic_eoi, + vcpu->arch.ioapic_handled_vectors)) { + vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI; + vcpu->run->eoi.vector = + vcpu->arch.pending_ioapic_eoi; + r = 0; + goto out; + } + } if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) vcpu_scan_ioapic(vcpu); if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) @@ -6267,6 +6501,40 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } + if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET; + r = 0; + goto out; + } + if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_HYPERV; + vcpu->run->hyperv = vcpu->arch.hyperv.exit; + r = 0; + goto out; + } + + /* + * KVM_REQ_HV_STIMER has to be processed after + * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers + * depend on the guest clock being up-to-date + */ + if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu)) + kvm_hv_process_stimers(vcpu); + } + + /* + * KVM_REQ_EVENT is not set when posted interrupts are set by + * VT-d hardware, so we have to update RVI unconditionally. + */ + if (kvm_lapic_enabled(vcpu)) { + /* + * Update architecture specific hints for APIC + * virtual interrupt delivery. + */ + if (vcpu->arch.apicv_active) + kvm_x86_ops->hwapic_irr_update(vcpu, + kvm_lapic_find_highest_irr(vcpu)); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -6285,13 +6553,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { - /* - * Update architecture specific hints for APIC - * virtual interrupt delivery. - */ - if (kvm_x86_ops->hwapic_irr_update) - kvm_x86_ops->hwapic_irr_update(vcpu, - kvm_lapic_find_highest_irr(vcpu)); update_cr8_intercept(vcpu); kvm_lapic_sync_to_vapic(vcpu); } @@ -6334,6 +6595,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (req_immediate_exit) smp_send_reschedule(vcpu->cpu); + trace_kvm_entry(vcpu->vcpu_id); + wait_lapic_expire(vcpu); __kvm_guest_enter(); if (unlikely(vcpu->arch.switch_db_regs)) { @@ -6346,8 +6609,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } - trace_kvm_entry(vcpu->vcpu_id); - wait_lapic_expire(vcpu); kvm_x86_ops->run(vcpu); /* @@ -6375,8 +6636,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, - rdtsc()); + vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); @@ -6427,10 +6687,15 @@ out: static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { - if (!kvm_arch_vcpu_runnable(vcpu)) { + if (!kvm_arch_vcpu_runnable(vcpu) && + (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + + if (kvm_x86_ops->post_block) + kvm_x86_ops->post_block(vcpu); + if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) return 1; } @@ -6467,10 +6732,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu) vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); for (;;) { - if (kvm_vcpu_running(vcpu)) + if (kvm_vcpu_running(vcpu)) { r = vcpu_enter_guest(vcpu); - else + } else { r = vcpu_block(kvm, vcpu); + } + if (r <= 0) break; @@ -6478,9 +6745,10 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); - if (dm_request_for_irq_injection(vcpu)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; + if (dm_request_for_irq_injection(vcpu) && + kvm_vcpu_ready_for_interrupt_injection(vcpu)) { + r = 0; + vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; ++vcpu->stat.request_irq_exits; break; } @@ -6607,7 +6875,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } /* re-sync apic's tpr */ - if (!irqchip_in_kernel(vcpu->kvm)) { + if (!lapic_in_kernel(vcpu)) { if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { r = -EINVAL; goto out; @@ -6931,7 +7199,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, */ kvm_set_rflags(vcpu, rflags); - kvm_x86_ops->update_db_bp_intercept(vcpu); + kvm_x86_ops->update_bp_intercept(vcpu); r = 0; @@ -7005,7 +7273,7 @@ static void fx_init(struct kvm_vcpu *vcpu) /* * Ensure guest xcr0 is valid for loading */ - vcpu->arch.xcr0 = XSTATE_FP; + vcpu->arch.xcr0 = XFEATURE_MASK_FP; vcpu->arch.cr0 |= X86_CR0_ET; } @@ -7280,6 +7548,20 @@ int kvm_arch_hardware_setup(void) if (r != 0) return r; + if (kvm_has_tsc_control) { + /* + * Make sure the user can only configure tsc_khz values that + * fit into a signed integer. + * A min value is not calculated needed because it will always + * be 1 on all machines. + */ + u64 max = min(0x7fffffffULL, + __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); + kvm_max_guest_tsc_khz = max; + + kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; + } + kvm_init_msr_list(); return 0; } @@ -7307,7 +7589,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { - return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); + return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu); } struct static_key kvm_no_apic_vcpu __read_mostly; @@ -7321,6 +7603,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(); vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu)) @@ -7376,6 +7659,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); + vcpu->arch.pending_external_vector = -1; + + kvm_hv_vcpu_init(vcpu); + return 0; fail_free_mce_banks: @@ -7394,6 +7681,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { int idx; + kvm_hv_vcpu_uninit(vcpu); kvm_pmu_destroy(vcpu); kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); @@ -7401,7 +7689,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) static_key_slow_dec(&kvm_no_apic_vcpu); } @@ -7788,6 +8076,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) kvm_cpu_has_interrupt(vcpu)) return true; + if (kvm_hv_has_stimer_pending(vcpu)) + return true; + return false; } @@ -8028,7 +8319,59 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + if (kvm_x86_ops->update_pi_irte) { + irqfd->producer = prod; + return kvm_x86_ops->update_pi_irte(irqfd->kvm, + prod->irq, irqfd->gsi, 1); + } + + return -EINVAL; +} + +void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + int ret; + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + if (!kvm_x86_ops->update_pi_irte) { + WARN_ON(irqfd->producer != NULL); + return; + } + + WARN_ON(irqfd->producer != prod); + irqfd->producer = NULL; + + /* + * When producer of consumer is unregistered, we change back to + * remapped mode, so we can re-use the current implementation + * when the irq is masked/disabed or the consumer side (KVM + * int this case doesn't want to receive the interrupts. + */ + ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); + if (ret) + printk(KERN_INFO "irq bypass consumer (token %p) unregistration" + " fails: %d\n", irqfd->consumer.token, ret); +} + +int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + if (!kvm_x86_ops->update_pi_irte) + return -EINVAL; + + return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); +} + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); @@ -8043,3 +8386,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2f822cd886c2..f2afa5fe48a6 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -180,9 +180,9 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); -#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ - | XSTATE_BNDREGS | XSTATE_BNDCSR \ - | XSTATE_AVX512) +#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ + | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ + | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512) extern u64 host_xcr0; extern u64 kvm_supported_xcr0(void); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index a0d09f6c6533..4ba229ac3f4f 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1414,6 +1414,7 @@ __init void lguest_init(void) pv_info.kernel_rpl = 1; /* Everyone except Xen runs with this set. */ pv_info.shared_kernel_pmd = 1; + pv_info.features = 0; /* * We set up all the lguest overrides for sensitive operations. These @@ -1472,7 +1473,6 @@ __init void lguest_init(void) pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu; pv_mmu_ops.pte_update = lguest_pte_update; - pv_mmu_ops.pte_update_defer = lguest_pte_update; #ifdef CONFIG_X86_LOCAL_APIC /* APIC read/write intercepts */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f2587888d987..a501fa25da41 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -16,7 +16,7 @@ clean-files := inat-tables.c obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o -lib-y := delay.o misc.o cmdline.o +lib-y := delay.o misc.o cmdline.o cpu.o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o diff --git a/arch/x86/lib/cpu.c b/arch/x86/lib/cpu.c new file mode 100644 index 000000000000..aa417a97511c --- /dev/null +++ b/arch/x86/lib/cpu.c @@ -0,0 +1,35 @@ +#include <linux/module.h> + +unsigned int x86_family(unsigned int sig) +{ + unsigned int x86; + + x86 = (sig >> 8) & 0xf; + + if (x86 == 0xf) + x86 += (sig >> 20) & 0xff; + + return x86; +} +EXPORT_SYMBOL_GPL(x86_family); + +unsigned int x86_model(unsigned int sig) +{ + unsigned int fam, model; + + fam = x86_family(sig); + + model = (sig >> 4) & 0xf; + + if (fam >= 0x6) + model += ((sig >> 16) & 0xf) << 4; + + return model; +} +EXPORT_SYMBOL_GPL(x86_model); + +unsigned int x86_stepping(unsigned int sig) +{ + return sig & 0xf; +} +EXPORT_SYMBOL_GPL(x86_stepping); diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 43623739c7cf..004c861b1648 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -1,6 +1,8 @@ #include <linux/module.h> #include <linux/preempt.h> #include <asm/msr.h> +#define CREATE_TRACE_POINTS +#include <asm/msr-trace.h> struct msr *msrs_alloc(void) { @@ -108,3 +110,27 @@ int msr_clear_bit(u32 msr, u8 bit) { return __flip_bit(msr, bit, false); } + +#ifdef CONFIG_TRACEPOINTS +void do_trace_write_msr(unsigned msr, u64 val, int failed) +{ + trace_write_msr(msr, val, failed); +} +EXPORT_SYMBOL(do_trace_write_msr); +EXPORT_TRACEPOINT_SYMBOL(write_msr); + +void do_trace_read_msr(unsigned msr, u64 val, int failed) +{ + trace_read_msr(msr, val, failed); +} +EXPORT_SYMBOL(do_trace_read_msr); +EXPORT_TRACEPOINT_SYMBOL(read_msr); + +void do_trace_rdpmc(unsigned counter, u64 val, int failed) +{ + trace_rdpmc(counter, val, failed); +} +EXPORT_SYMBOL(do_trace_rdpmc); +EXPORT_TRACEPOINT_SYMBOL(rdpmc); + +#endif diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 816488c0b97e..d388de72eaca 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -353,8 +353,12 @@ AVXcode: 1 17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1) 18: Grp16 (1A) 19: -1a: BNDCL Ev,Gv | BNDCU Ev,Gv | BNDMOV Gv,Ev | BNDLDX Gv,Ev,Gv -1b: BNDCN Ev,Gv | BNDMOV Ev,Gv | BNDMK Gv,Ev | BNDSTX Ev,GV,Gv +# Intel SDM opcode map does not list MPX instructions. For now using Gv for +# bnd registers and Ev for everything else is OK because the instruction +# decoder does not use the information except as an indication that there is +# a ModR/M byte. +1a: BNDCL Gv,Ev (F3) | BNDCU Gv,Ev (F2) | BNDMOV Gv,Ev (66) | BNDLDX Gv,Ev +1b: BNDCN Gv,Ev (F2) | BNDMOV Ev,Gv (66) | BNDMK Gv,Ev (F3) | BNDSTX Ev,Gv 1c: 1d: 1e: @@ -732,6 +736,12 @@ bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1) be: vfnmsub231ps/d Vx,Hx,Wx (66),(v) bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1) # 0x0f 0x38 0xc0-0xff +c8: sha1nexte Vdq,Wdq +c9: sha1msg1 Vdq,Wdq +ca: sha1msg2 Vdq,Wdq +cb: sha256rnds2 Vdq,Wdq +cc: sha256msg1 Vdq,Wdq +cd: sha256msg2 Vdq,Wdq db: VAESIMC Vdq,Wdq (66),(v1) dc: VAESENC Vdq,Hdq,Wdq (66),(v1) dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1) @@ -790,6 +800,7 @@ AVXcode: 3 61: vpcmpestri Vdq,Wdq,Ib (66),(v1) 62: vpcmpistrm Vdq,Wdq,Ib (66),(v1) 63: vpcmpistri Vdq,Wdq,Ib (66),(v1) +cc: sha1rnds4 Vdq,Wdq,Ib df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1) f0: RORX Gy,Ey,Ib (F2),(v) EndTable @@ -874,7 +885,7 @@ GrpTable: Grp7 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) 3: LIDT Ms 4: SMSW Mw/Rv -5: +5: rdpkru (110),(11B) | wrpkru (111),(11B) 6: LMSW Ew 7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B) EndTable @@ -888,6 +899,9 @@ EndTable GrpTable: Grp9 1: CMPXCHG8B/16B Mq/Mdq +3: xrstors +4: xsavec +5: xsaves 6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) 7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B) EndTable @@ -932,8 +946,8 @@ GrpTable: Grp15 3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B) 4: XSAVE 5: XRSTOR | lfence (11B) -6: XSAVEOPT | mfence (11B) -7: clflush | sfence (11B) +6: XSAVEOPT | clwb (66) | mfence (11B) +7: clflush | clflushopt (66) | sfence (11B) | pcommit (66),(11B) EndTable GrpTable: Grp16 diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index dd76a05729b0..024f6e971174 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -169,6 +169,76 @@ void fxch_i(void) fpu_tag_word = tag_word; } +static void fcmovCC(void) +{ + /* fcmovCC st(i) */ + int i = FPU_rm; + FPU_REG *st0_ptr = &st(0); + FPU_REG *sti_ptr = &st(i); + long tag_word = fpu_tag_word; + int regnr = top & 7; + int regnri = (top + i) & 7; + u_char sti_tag = (tag_word >> (regnri * 2)) & 3; + + if (sti_tag == TAG_Empty) { + FPU_stack_underflow(); + clear_C1(); + return; + } + reg_copy(sti_ptr, st0_ptr); + tag_word &= ~(3 << (regnr * 2)); + tag_word |= (sti_tag << (regnr * 2)); + fpu_tag_word = tag_word; +} + +void fcmovb(void) +{ + if (FPU_EFLAGS & X86_EFLAGS_CF) + fcmovCC(); +} + +void fcmove(void) +{ + if (FPU_EFLAGS & X86_EFLAGS_ZF) + fcmovCC(); +} + +void fcmovbe(void) +{ + if (FPU_EFLAGS & (X86_EFLAGS_CF|X86_EFLAGS_ZF)) + fcmovCC(); +} + +void fcmovu(void) +{ + if (FPU_EFLAGS & X86_EFLAGS_PF) + fcmovCC(); +} + +void fcmovnb(void) +{ + if (!(FPU_EFLAGS & X86_EFLAGS_CF)) + fcmovCC(); +} + +void fcmovne(void) +{ + if (!(FPU_EFLAGS & X86_EFLAGS_ZF)) + fcmovCC(); +} + +void fcmovnbe(void) +{ + if (!(FPU_EFLAGS & (X86_EFLAGS_CF|X86_EFLAGS_ZF))) + fcmovCC(); +} + +void fcmovnu(void) +{ + if (!(FPU_EFLAGS & X86_EFLAGS_PF)) + fcmovCC(); +} + void ffree_(void) { /* ffree st(i) */ diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h index 4dae511c85ad..afbc4d805d66 100644 --- a/arch/x86/math-emu/fpu_emu.h +++ b/arch/x86/math-emu/fpu_emu.h @@ -71,7 +71,7 @@ #include "fpu_system.h" -#include <asm/sigcontext.h> /* for struct _fpstate */ +#include <uapi/asm/sigcontext.h> /* for struct _fpstate */ #include <asm/math_emu.h> #include <linux/linkage.h> diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 3d8f2e421466..e945fedf1de2 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -40,49 +40,33 @@ #define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */ -#ifndef NO_UNDOC_CODE /* Un-documented FPU op-codes supported by default. */ +/* fcmovCC and f(u)comi(p) are enabled if CPUID(1).EDX(15) "cmov" is set */ -/* WARNING: These codes are not documented by Intel in their 80486 manual - and may not work on FPU clones or later Intel FPUs. */ - -/* Changes to support the un-doc codes provided by Linus Torvalds. */ - -#define _d9_d8_ fstp_i /* unofficial code (19) */ -#define _dc_d0_ fcom_st /* unofficial code (14) */ -#define _dc_d8_ fcompst /* unofficial code (1c) */ -#define _dd_c8_ fxch_i /* unofficial code (0d) */ -#define _de_d0_ fcompst /* unofficial code (16) */ -#define _df_c0_ ffreep /* unofficial code (07) ffree + pop */ -#define _df_c8_ fxch_i /* unofficial code (0f) */ -#define _df_d0_ fstp_i /* unofficial code (17) */ -#define _df_d8_ fstp_i /* unofficial code (1f) */ +/* WARNING: "u" entries are not documented by Intel in their 80486 manual + and may not work on FPU clones or later Intel FPUs. + Changes to support them provided by Linus Torvalds. */ static FUNC const st_instr_table[64] = { - fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_, - fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_, - fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_, - fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_, - fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, - fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, - fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, - fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, +/* Opcode: d8 d9 da db */ +/* dc dd de df */ +/* c0..7 */ fadd__, fld_i_, fcmovb, fcmovnb, +/* c0..7 */ fadd_i, ffree_, faddp_, ffreep,/*u*/ +/* c8..f */ fmul__, fxch_i, fcmove, fcmovne, +/* c8..f */ fmul_i, fxch_i,/*u*/ fmulp_, fxch_i,/*u*/ +/* d0..7 */ fcom_st, fp_nop, fcmovbe, fcmovnbe, +/* d0..7 */ fcom_st,/*u*/ fst_i_, fcompst,/*u*/ fstp_i,/*u*/ +/* d8..f */ fcompst, fstp_i,/*u*/ fcmovu, fcmovnu, +/* d8..f */ fcompst,/*u*/ fstp_i, fcompp, fstp_i,/*u*/ +/* e0..7 */ fsub__, FPU_etc, __BAD__, finit_, +/* e0..7 */ fsubri, fucom_, fsubrp, fstsw_, +/* e8..f */ fsubr_, fconst, fucompp, fucomi_, +/* e8..f */ fsub_i, fucomp, fsubp_, fucomip, +/* f0..7 */ fdiv__, FPU_triga, __BAD__, fcomi_, +/* f0..7 */ fdivri, __BAD__, fdivrp, fcomip, +/* f8..f */ fdivr_, FPU_trigb, __BAD__, __BAD__, +/* f8..f */ fdiv_i, __BAD__, fdivp_, __BAD__, }; -#else /* Support only documented FPU op-codes */ - -static FUNC const st_instr_table[64] = { - fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__, - fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__, - fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__, - fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__, - fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, - fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, - fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, - fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, -}; - -#endif /* NO_UNDOC_CODE */ - #define _NONE_ 0 /* Take no special action */ #define _REG0_ 1 /* Need to check for not empty st(0) */ #define _REGI_ 2 /* Need to check for not empty st(0) and st(rm) */ @@ -94,36 +78,18 @@ static FUNC const st_instr_table[64] = { #define _REGIc 0 /* Compare st(0) and st(rm) */ #define _REGIn 0 /* Uses st(0) and st(rm), but handle checks later */ -#ifndef NO_UNDOC_CODE - -/* Un-documented FPU op-codes supported by default. (see above) */ - static u_char const type_table[64] = { - _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_, - _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_, - _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, - _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, - _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, - _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ +/* Opcode: d8 d9 da db dc dd de df */ +/* c0..7 */ _REGI_, _NONE_, _REGIn, _REGIn, _REGIi, _REGi_, _REGIp, _REGi_, +/* c8..f */ _REGI_, _REGIn, _REGIn, _REGIn, _REGIi, _REGI_, _REGIp, _REGI_, +/* d0..7 */ _REGIc, _NONE_, _REGIn, _REGIn, _REGIc, _REG0_, _REGIc, _REG0_, +/* d8..f */ _REGIc, _REG0_, _REGIn, _REGIn, _REGIc, _REG0_, _REGIc, _REG0_, +/* e0..7 */ _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, +/* e8..f */ _REGI_, _NONE_, _REGIc, _REGIc, _REGIi, _REGIc, _REGIp, _REGIc, +/* f0..7 */ _REGI_, _NONE_, _null_, _REGIc, _REGIi, _null_, _REGIp, _REGIc, +/* f8..f */ _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, }; -#else /* Support only documented FPU op-codes */ - -static u_char const type_table[64] = { - _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_, - _REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_, - _REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_, - _REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_, - _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, - _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ -}; - -#endif /* NO_UNDOC_CODE */ - #ifdef RE_ENTRANT_CHECKING u_char emulating = 0; #endif /* RE_ENTRANT_CHECKING */ diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h index 9779df436b7d..caff438b9c1d 100644 --- a/arch/x86/math-emu/fpu_proto.h +++ b/arch/x86/math-emu/fpu_proto.h @@ -46,6 +46,14 @@ extern void fstsw_(void); extern void fp_nop(void); extern void fld_i_(void); extern void fxch_i(void); +extern void fcmovb(void); +extern void fcmove(void); +extern void fcmovbe(void); +extern void fcmovu(void); +extern void fcmovnb(void); +extern void fcmovne(void); +extern void fcmovnbe(void); +extern void fcmovnu(void); extern void ffree_(void); extern void ffreep(void); extern void fst_i_(void); @@ -108,6 +116,10 @@ extern void fcompp(void); extern void fucom_(void); extern void fucomp(void); extern void fucompp(void); +extern void fcomi_(void); +extern void fcomip(void); +extern void fucomi_(void); +extern void fucomip(void); /* reg_constant.c */ extern void fconst(void); /* reg_ld_str.c */ diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c index 2931ff355218..95228ff042c0 100644 --- a/arch/x86/math-emu/load_store.c +++ b/arch/x86/math-emu/load_store.c @@ -33,11 +33,12 @@ #define pop_0() { FPU_settag0(TAG_Empty); top++; } +/* index is a 5-bit value: (3-bit FPU_modrm.reg field | opcode[2,1]) */ static u_char const type_table[32] = { - _PUSH_, _PUSH_, _PUSH_, _PUSH_, - _null_, _null_, _null_, _null_, - _REG0_, _REG0_, _REG0_, _REG0_, - _REG0_, _REG0_, _REG0_, _REG0_, + _PUSH_, _PUSH_, _PUSH_, _PUSH_, /* /0: d9:fld f32, db:fild m32, dd:fld f64, df:fild m16 */ + _null_, _REG0_, _REG0_, _REG0_, /* /1: d9:undef, db,dd,df:fisttp m32/64/16 */ + _REG0_, _REG0_, _REG0_, _REG0_, /* /2: d9:fst f32, db:fist m32, dd:fst f64, df:fist m16 */ + _REG0_, _REG0_, _REG0_, _REG0_, /* /3: d9:fstp f32, db:fistp m32, dd:fstp f64, df:fistp m16 */ _NONE_, _null_, _NONE_, _PUSH_, _NONE_, _PUSH_, _null_, _PUSH_, _NONE_, _null_, _NONE_, _REG0_, @@ -45,15 +46,19 @@ static u_char const type_table[32] = { }; u_char const data_sizes_16[32] = { - 4, 4, 8, 2, 0, 0, 0, 0, - 4, 4, 8, 2, 4, 4, 8, 2, + 4, 4, 8, 2, + 0, 4, 8, 2, /* /1: d9:undef, db,dd,df:fisttp */ + 4, 4, 8, 2, + 4, 4, 8, 2, 14, 0, 94, 10, 2, 10, 0, 8, 14, 0, 94, 10, 2, 10, 2, 8 }; static u_char const data_sizes_32[32] = { - 4, 4, 8, 2, 0, 0, 0, 0, - 4, 4, 8, 2, 4, 4, 8, 2, + 4, 4, 8, 2, + 0, 4, 8, 2, /* /1: d9:undef, db,dd,df:fisttp */ + 4, 4, 8, 2, + 4, 4, 8, 2, 28, 0, 108, 10, 2, 10, 0, 8, 28, 0, 108, 10, 2, 10, 2, 8 }; @@ -65,6 +70,7 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, FPU_REG *st0_ptr; u_char st0_tag = TAG_Empty; /* This is just to stop a gcc warning. */ u_char loaded_tag; + int sv_cw; st0_ptr = NULL; /* Initialized just to stop compiler warnings. */ @@ -111,7 +117,8 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, } switch (type) { - case 000: /* fld m32real */ + /* type is a 5-bit value: (3-bit FPU_modrm.reg field | opcode[2,1]) */ + case 000: /* fld m32real (d9 /0) */ clear_C1(); loaded_tag = FPU_load_single((float __user *)data_address, &loaded_data); @@ -123,13 +130,13 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, } FPU_copy_to_reg0(&loaded_data, loaded_tag); break; - case 001: /* fild m32int */ + case 001: /* fild m32int (db /0) */ clear_C1(); loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data); FPU_copy_to_reg0(&loaded_data, loaded_tag); break; - case 002: /* fld m64real */ + case 002: /* fld m64real (dd /0) */ clear_C1(); loaded_tag = FPU_load_double((double __user *)data_address, @@ -142,12 +149,44 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, } FPU_copy_to_reg0(&loaded_data, loaded_tag); break; - case 003: /* fild m16int */ + case 003: /* fild m16int (df /0) */ clear_C1(); loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data); FPU_copy_to_reg0(&loaded_data, loaded_tag); break; + /* case 004: undefined (d9 /1) */ + /* fisttp are enabled if CPUID(1).ECX(0) "sse3" is set */ + case 005: /* fisttp m32int (db /1) */ + clear_C1(); + sv_cw = control_word; + control_word |= RC_CHOP; + if (FPU_store_int32 + (st0_ptr, st0_tag, (long __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + control_word = sv_cw; + break; + case 006: /* fisttp m64int (dd /1) */ + clear_C1(); + sv_cw = control_word; + control_word |= RC_CHOP; + if (FPU_store_int64 + (st0_ptr, st0_tag, (long long __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + control_word = sv_cw; + break; + case 007: /* fisttp m16int (df /1) */ + clear_C1(); + sv_cw = control_word; + control_word |= RC_CHOP; + if (FPU_store_int16 + (st0_ptr, st0_tag, (short __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + control_word = sv_cw; + break; case 010: /* fst m32real */ clear_C1(); FPU_store_single(st0_ptr, st0_tag, diff --git a/arch/x86/math-emu/reg_compare.c b/arch/x86/math-emu/reg_compare.c index ecce55fc2e2e..b77360fdbf4a 100644 --- a/arch/x86/math-emu/reg_compare.c +++ b/arch/x86/math-emu/reg_compare.c @@ -249,6 +249,54 @@ static int compare_st_st(int nr) return 0; } +static int compare_i_st_st(int nr) +{ + int f, c; + FPU_REG *st_ptr; + + if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) { + FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF); + /* Stack fault */ + EXCEPTION(EX_StackUnder); + return !(control_word & CW_Invalid); + } + + partial_status &= ~SW_C0; + st_ptr = &st(nr); + c = compare(st_ptr, FPU_gettagi(nr)); + if (c & COMP_NaN) { + FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF); + EXCEPTION(EX_Invalid); + return !(control_word & CW_Invalid); + } + + switch (c & 7) { + case COMP_A_lt_B: + f = X86_EFLAGS_CF; + break; + case COMP_A_eq_B: + f = X86_EFLAGS_ZF; + break; + case COMP_A_gt_B: + f = 0; + break; + case COMP_No_Comp: + f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF; + break; +#ifdef PARANOID + default: + EXCEPTION(EX_INTERNAL | 0x122); + f = 0; + break; +#endif /* PARANOID */ + } + FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f; + if (c & COMP_Denormal) { + return denormal_operand() < 0; + } + return 0; +} + static int compare_u_st_st(int nr) { int f = 0, c; @@ -299,6 +347,58 @@ static int compare_u_st_st(int nr) return 0; } +static int compare_ui_st_st(int nr) +{ + int f = 0, c; + FPU_REG *st_ptr; + + if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) { + FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF); + /* Stack fault */ + EXCEPTION(EX_StackUnder); + return !(control_word & CW_Invalid); + } + + partial_status &= ~SW_C0; + st_ptr = &st(nr); + c = compare(st_ptr, FPU_gettagi(nr)); + if (c & COMP_NaN) { + FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF); + if (c & COMP_SNaN) { /* This is the only difference between + un-ordered and ordinary comparisons */ + EXCEPTION(EX_Invalid); + return !(control_word & CW_Invalid); + } + return 0; + } + + switch (c & 7) { + case COMP_A_lt_B: + f = X86_EFLAGS_CF; + break; + case COMP_A_eq_B: + f = X86_EFLAGS_ZF; + break; + case COMP_A_gt_B: + f = 0; + break; + case COMP_No_Comp: + f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF; + break; +#ifdef PARANOID + default: + EXCEPTION(EX_INTERNAL | 0x123); + f = 0; + break; +#endif /* PARANOID */ + } + FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f; + if (c & COMP_Denormal) { + return denormal_operand() < 0; + } + return 0; +} + /*---------------------------------------------------------------------------*/ void fcom_st(void) @@ -348,3 +448,31 @@ void fucompp(void) } else FPU_illegal(); } + +/* P6+ compare-to-EFLAGS ops */ + +void fcomi_(void) +{ + /* fcomi st(i) */ + compare_i_st_st(FPU_rm); +} + +void fcomip(void) +{ + /* fcomip st(i) */ + if (!compare_i_st_st(FPU_rm)) + FPU_pop(); +} + +void fucomi_(void) +{ + /* fucomi st(i) */ + compare_ui_st_st(FPU_rm); +} + +void fucomip(void) +{ + /* fucomip st(i) */ + if (!compare_ui_st_st(FPU_rm)) + FPU_pop(); +} diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index a482d105172b..f9d38a48e3c8 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -14,7 +14,8 @@ obj-$(CONFIG_SMP) += tlb.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o +obj-$(CONFIG_X86_PTDUMP_CORE) += dump_pagetables.o +obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c new file mode 100644 index 000000000000..bfcffdf6c577 --- /dev/null +++ b/arch/x86/mm/debug_pagetables.c @@ -0,0 +1,46 @@ +#include <linux/debugfs.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <asm/pgtable.h> + +static int ptdump_show(struct seq_file *m, void *v) +{ + ptdump_walk_pgd_level(m, NULL); + return 0; +} + +static int ptdump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show, NULL); +} + +static const struct file_operations ptdump_fops = { + .owner = THIS_MODULE, + .open = ptdump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *pe; + +static int __init pt_dump_debug_init(void) +{ + pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL, + &ptdump_fops); + if (!pe) + return -ENOMEM; + + return 0; +} + +static void __exit pt_dump_debug_exit(void) +{ + debugfs_remove_recursive(pe); +} + +module_init(pt_dump_debug_init); +module_exit(pt_dump_debug_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); +MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables"); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index f0cedf3395af..4a6f1d9b5106 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -32,6 +32,8 @@ struct pg_state { const struct addr_marker *marker; unsigned long lines; bool to_dmesg; + bool check_wx; + unsigned long wx_pages; }; struct addr_marker { @@ -87,7 +89,7 @@ static struct addr_marker address_markers[] = { { 0/* VMALLOC_START */, "vmalloc() Area" }, { 0/*VMALLOC_END*/, "vmalloc() End" }, # ifdef CONFIG_HIGHMEM - { 0/*PKMAP_BASE*/, "Persisent kmap() Area" }, + { 0/*PKMAP_BASE*/, "Persistent kmap() Area" }, # endif { 0/*FIXADDR_START*/, "Fixmap Area" }, #endif @@ -155,7 +157,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) pt_dump_cont_printf(m, dmsg, " "); if ((level == 4 && pr & _PAGE_PAT) || ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) - pt_dump_cont_printf(m, dmsg, "pat "); + pt_dump_cont_printf(m, dmsg, "PAT "); else pt_dump_cont_printf(m, dmsg, " "); if (pr & _PAGE_GLOBAL) @@ -198,8 +200,8 @@ static void note_page(struct seq_file *m, struct pg_state *st, * we have now. "break" is either changing perms, levels or * address space marker. */ - prot = pgprot_val(new_prot) & PTE_FLAGS_MASK; - cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK; + prot = pgprot_val(new_prot); + cur = pgprot_val(st->current_prot); if (!st->level) { /* First entry */ @@ -214,6 +216,16 @@ static void note_page(struct seq_file *m, struct pg_state *st, const char *unit = units; unsigned long delta; int width = sizeof(unsigned long) * 2; + pgprotval_t pr = pgprot_val(st->current_prot); + + if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) { + WARN_ONCE(1, + "x86/mm: Found insecure W+X mapping at address %p/%pS\n", + (void *)st->start_address, + (void *)st->start_address); + st->wx_pages += (st->current_address - + st->start_address) / PAGE_SIZE; + } /* * Now print the actual finished series @@ -269,13 +281,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, { int i; pte_t *start; + pgprotval_t prot; start = (pte_t *) pmd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PTE; i++) { - pgprot_t prot = pte_pgprot(*start); - + prot = pte_flags(*start); st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); - note_page(m, st, prot, 4); + note_page(m, st, __pgprot(prot), 4); start++; } } @@ -287,18 +299,19 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, { int i; pmd_t *start; + pgprotval_t prot; start = (pmd_t *) pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { - pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK; - - if (pmd_large(*start) || !pmd_present(*start)) + if (pmd_large(*start) || !pmd_present(*start)) { + prot = pmd_flags(*start); note_page(m, st, __pgprot(prot), 3); - else + } else { walk_pte_level(m, st, *start, P + i * PMD_LEVEL_MULT); + } } else note_page(m, st, __pgprot(0), 3); start++; @@ -318,19 +331,20 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, { int i; pud_t *start; + pgprotval_t prot; start = (pud_t *) pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); if (!pud_none(*start)) { - pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK; - - if (pud_large(*start) || !pud_present(*start)) + if (pud_large(*start) || !pud_present(*start)) { + prot = pud_flags(*start); note_page(m, st, __pgprot(prot), 2); - else + } else { walk_pmd_level(m, st, *start, P + i * PUD_LEVEL_MULT); + } } else note_page(m, st, __pgprot(0), 2); @@ -344,13 +358,30 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, #define pgd_none(a) pud_none(__pud(pgd_val(a))) #endif -void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) +#ifdef CONFIG_X86_64 +static inline bool is_hypervisor_range(int idx) +{ + /* + * ffff800000000000 - ffff87ffffffffff is reserved for + * the hypervisor. + */ + return paravirt_enabled() && + (idx >= pgd_index(__PAGE_OFFSET) - 16) && + (idx < pgd_index(__PAGE_OFFSET)); +} +#else +static inline bool is_hypervisor_range(int idx) { return false; } +#endif + +static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, + bool checkwx) { #ifdef CONFIG_X86_64 pgd_t *start = (pgd_t *) &init_level4_pgt; #else pgd_t *start = swapper_pg_dir; #endif + pgprotval_t prot; int i; struct pg_state st = {}; @@ -359,16 +390,20 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) st.to_dmesg = true; } + st.check_wx = checkwx; + if (checkwx) + st.wx_pages = 0; + for (i = 0; i < PTRS_PER_PGD; i++) { st.current_address = normalize_addr(i * PGD_LEVEL_MULT); - if (!pgd_none(*start)) { - pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK; - - if (pgd_large(*start) || !pgd_present(*start)) + if (!pgd_none(*start) && !is_hypervisor_range(i)) { + if (pgd_large(*start) || !pgd_present(*start)) { + prot = pgd_flags(*start); note_page(m, &st, __pgprot(prot), 1); - else + } else { walk_pud_level(m, &st, *start, i * PGD_LEVEL_MULT); + } } else note_page(m, &st, __pgprot(0), 1); @@ -378,30 +413,28 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) /* Flush out the last page */ st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); note_page(m, &st, __pgprot(0), 0); + if (!checkwx) + return; + if (st.wx_pages) + pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n", + st.wx_pages); + else + pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); } -static int ptdump_show(struct seq_file *m, void *v) +void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) { - ptdump_walk_pgd_level(m, NULL); - return 0; + ptdump_walk_pgd_level_core(m, pgd, false); } +EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level); -static int ptdump_open(struct inode *inode, struct file *filp) +void ptdump_walk_pgd_level_checkwx(void) { - return single_open(filp, ptdump_show, NULL); + ptdump_walk_pgd_level_core(NULL, NULL, true); } -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int pt_dump_init(void) +static int __init pt_dump_init(void) { - struct dentry *pe; - #ifdef CONFIG_X86_32 /* Not a compile-time constant on x86-32 */ address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; @@ -412,11 +445,6 @@ static int pt_dump_init(void) address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; #endif - pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, - &ptdump_fops); - if (!pe) - return -ENOMEM; - return 0; } diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 81bf3d2af3eb..6d5eb5900372 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -9,6 +9,7 @@ #include <linux/vmstat.h> #include <linux/highmem.h> #include <linux/swap.h> +#include <linux/memremap.h> #include <asm/pgtable.h> @@ -63,6 +64,16 @@ retry: #endif } +static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) +{ + while ((*nr) - nr_start) { + struct page *page = pages[--(*nr)]; + + ClearPageReferenced(page); + put_page(page); + } +} + /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -71,7 +82,9 @@ retry: static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { + struct dev_pagemap *pgmap = NULL; unsigned long mask; + int nr_start = *nr; pte_t *ptep; mask = _PAGE_PRESENT|_PAGE_USER; @@ -89,13 +102,21 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, return 0; } - if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { + page = pte_page(pte); + if (pte_devmap(pte)) { + pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + pte_unmap(ptep); + return 0; + } + } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { pte_unmap(ptep); return 0; } VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); get_page(page); + put_dev_pagemap(pgmap); SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -114,31 +135,58 @@ static inline void get_head_page_multiple(struct page *page, int nr) SetPageReferenced(page); } +static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + int nr_start = *nr; + unsigned long pfn = pmd_pfn(pmd); + struct dev_pagemap *pgmap = NULL; + + pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; + do { + struct page *page = pfn_to_page(pfn); + + pgmap = get_dev_pagemap(pfn, pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + return 0; + } + SetPageReferenced(page); + pages[*nr] = page; + get_page(page); + put_dev_pagemap(pgmap); + (*nr)++; + pfn++; + } while (addr += PAGE_SIZE, addr != end); + return 1; +} + static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask; - pte_t pte = *(pte_t *)&pmd; struct page *head, *page; int refs; mask = _PAGE_PRESENT|_PAGE_USER; if (write) mask |= _PAGE_RW; - if ((pte_flags(pte) & mask) != mask) + if ((pmd_flags(pmd) & mask) != mask) return 0; + + VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); + if (pmd_devmap(pmd)) + return __gup_device_huge_pmd(pmd, addr, end, pages, nr); + /* hugepages are never "special" */ - VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); refs = 0; - head = pte_page(pte); + head = pmd_page(pmd); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; - if (PageTail(page)) - get_huge_page_tail(page); (*nr)++; page++; refs++; @@ -159,18 +207,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = *pmdp; next = pmd_addr_end(addr, end); - /* - * The pmd_trans_splitting() check below explains why - * pmdp_splitting_flush has to flush the tlb, to stop - * this gup-fast code from running while we set the - * splitting bit in the pmd. Returning zero will take - * the slow path that will call wait_split_huge_page() - * if the pmd is still in splitting state. gup-fast - * can't because it has irq disabled and - * wait_split_huge_page() would never return as the - * tlb flush IPI wouldn't run. - */ - if (pmd_none(pmd) || pmd_trans_splitting(pmd)) + if (pmd_none(pmd)) return 0; if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { /* @@ -195,27 +232,24 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask; - pte_t pte = *(pte_t *)&pud; struct page *head, *page; int refs; mask = _PAGE_PRESENT|_PAGE_USER; if (write) mask |= _PAGE_RW; - if ((pte_flags(pte) & mask) != mask) + if ((pud_flags(pud) & mask) != mask) return 0; /* hugepages are never "special" */ - VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); + VM_BUG_ON(!pfn_valid(pud_pfn(pud))); refs = 0; - head = pte_page(pte); + head = pud_page(pud); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; - if (PageTail(page)) - get_huge_page_tail(page); (*nr)++; page++; refs++; diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index eecb207a2037..a6d739258137 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -104,20 +104,6 @@ void __kunmap_atomic(void *kvaddr) } EXPORT_SYMBOL(__kunmap_atomic); -struct page *kmap_atomic_to_page(void *ptr) -{ - unsigned long idx, vaddr = (unsigned long)ptr; - pte_t *pte; - - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - - idx = virt_to_fix(vaddr); - pte = kmap_pte - (idx - FIX_KMAP_BEGIN); - return pte_page(*pte); -} -EXPORT_SYMBOL(kmap_atomic_to_page); - void __init set_highmem_pages_init(void) { struct zone *zone; diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 42982b26e32b..740d7ac03a55 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -173,10 +173,10 @@ static __init int setup_hugepagesz(char *opt) } __setup("hugepagesz=", setup_hugepagesz); -#ifdef CONFIG_CMA +#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) static __init int gigantic_pages_init(void) { - /* With CMA we can allocate gigantic pages at runtime */ + /* With compaction or CMA we can allocate gigantic pages at runtime */ if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT)) hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); return 0; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1d8a83df153a..493f54172b4a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -354,7 +354,7 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, } for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n", + pr_debug(" [mem %#010lx-%#010lx] page %s\n", mr[i].start, mr[i].end - 1, page_size_string(&mr[i])); @@ -401,7 +401,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long ret = 0; int nr_range, i; - pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n", + pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n", start, end - 1); memset(mr, 0, sizeof(mr)); @@ -693,14 +693,12 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { -#ifdef CONFIG_MICROCODE_EARLY /* * Remember, initrd memory may contain microcode or other useful things. * Before we lose initrd mem, we need to find a place to hold them * now that normal virtual memory is enabled. */ save_microcode_in_initrd(); -#endif /* * end could be not aligned, and We can not align that, diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7562f42914b4..cb4ef3de61f9 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -957,6 +957,8 @@ void mark_rodata_ro(void) set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); #endif mark_nxdata_nx(); + if (__supported_pte_mask & _PAGE_NX) + debug_checkwx(); } #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index df48430c279b..5488d21123bd 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -30,6 +30,7 @@ #include <linux/module.h> #include <linux/memory.h> #include <linux/memory_hotplug.h> +#include <linux/memremap.h> #include <linux/nmi.h> #include <linux/gfp.h> #include <linux/kcore.h> @@ -714,6 +715,12 @@ static void __meminit free_pagetable(struct page *page, int order) { unsigned long magic; unsigned int nr_pages = 1 << order; + struct vmem_altmap *altmap = to_vmem_altmap((unsigned long) page); + + if (altmap) { + vmem_altmap_free(altmap, nr_pages); + return; + } /* bootmem page has reserved flag */ if (PageReserved(page)) { @@ -814,8 +821,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, if (phys_addr < (phys_addr_t)0x40000000) return; - if (IS_ALIGNED(addr, PAGE_SIZE) && - IS_ALIGNED(next, PAGE_SIZE)) { + if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { /* * Do not free direct mapping pages since they were * freed when offlining, or simplely not in use. @@ -1018,13 +1024,19 @@ int __ref arch_remove_memory(u64 start, u64 size) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; + struct page *page = pfn_to_page(start_pfn); + struct vmem_altmap *altmap; struct zone *zone; int ret; - zone = page_zone(pfn_to_page(start_pfn)); - kernel_physical_mapping_remove(start, start + size); + /* With altmap the first mapped page is offset from @start */ + altmap = to_vmem_altmap((unsigned long) page); + if (altmap) + page += vmem_altmap_offset(altmap); + zone = page_zone(page); ret = __remove_pages(zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); + kernel_physical_mapping_remove(start, start + size); return ret; } @@ -1150,6 +1162,8 @@ void mark_rodata_ro(void) free_init_pages("unused kernel", (unsigned long) __va(__pa_symbol(rodata_end)), (unsigned long) __va(__pa_symbol(_sdata))); + + debug_checkwx(); } #endif @@ -1234,7 +1248,7 @@ static void __meminitdata *p_start, *p_end; static int __meminitdata node_start; static int __meminit vmemmap_populate_hugepages(unsigned long start, - unsigned long end, int node) + unsigned long end, int node, struct vmem_altmap *altmap) { unsigned long addr; unsigned long next; @@ -1257,7 +1271,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, if (pmd_none(*pmd)) { void *p; - p = vmemmap_alloc_block_buf(PMD_SIZE, node); + p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); if (p) { pte_t entry; @@ -1268,7 +1282,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, /* check to see if we have contiguous blocks */ if (p_end != p || node_start != node) { if (p_start) - printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", addr_start, addr_end-1, p_start, p_end-1, node_start); addr_start = addr; node_start = node; @@ -1278,7 +1292,8 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, addr_end = addr + PMD_SIZE; p_end = p + PMD_SIZE; continue; - } + } else if (altmap) + return -ENOMEM; /* no fallback */ } else if (pmd_large(*pmd)) { vmemmap_verify((pte_t *)pmd, node, addr, next); continue; @@ -1292,11 +1307,16 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) { + struct vmem_altmap *altmap = to_vmem_altmap(start); int err; if (cpu_has_pse) - err = vmemmap_populate_hugepages(start, end, node); - else + err = vmemmap_populate_hugepages(start, end, node, altmap); + else if (altmap) { + pr_err_once("%s: no cpu support for altmap allocations\n", + __func__); + err = -ENOMEM; + } else err = vmemmap_populate_basepages(start, end, node); if (!err) sync_global_pgds(start, end - 1, 0); @@ -1366,7 +1386,7 @@ void register_page_bootmem_memmap(unsigned long section_nr, void __meminit vmemmap_populate_print_last(void) { if (p_start) { - printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", addr_start, addr_end-1, p_start, p_end-1, node_start); p_start = NULL; p_end = NULL; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index b9c78f3bcd67..0d8d53d1f5cc 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -194,8 +194,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * Check if the request spans more than any BAR in the iomem resource * tree. */ - WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size), - KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); + if (iomem_map_sanity_check(unaligned_phys_addr, unaligned_size)) + pr_warn("caller %pS mapping multiple BARs\n", caller); return ret_addr; err_free_area: diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 9ce5da27b136..d470cf219a2d 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -126,5 +126,5 @@ void __init kasan_init(void) __flush_tlb_all(); init_task.kasan_depth = 0; - pr_info("Kernel address sanitizer initialized\n"); + pr_info("KernelAddressSanitizer initialized\n"); } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 844b06d67df4..96bd1e2bffaf 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -69,14 +69,14 @@ unsigned long arch_mmap_rnd(void) { unsigned long rnd; - /* - * 8 bits of randomness in 32bit mmaps, 20 address space bits - * 28 bits of randomness in 64bit mmaps, 40 address space bits - */ if (mmap_is_ia32()) - rnd = (unsigned long)get_random_int() % (1<<8); +#ifdef CONFIG_COMPAT + rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1); +#else + rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1); +#endif else - rnd = (unsigned long)get_random_int() % (1<<28); + rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1); return rnd << PAGE_SHIFT; } diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 134948b0926f..b2fd67da1701 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -101,19 +101,19 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, switch (type) { case REG_TYPE_RM: regno = X86_MODRM_RM(insn->modrm.value); - if (X86_REX_B(insn->rex_prefix.value) == 1) + if (X86_REX_B(insn->rex_prefix.value)) regno += 8; break; case REG_TYPE_INDEX: regno = X86_SIB_INDEX(insn->sib.value); - if (X86_REX_X(insn->rex_prefix.value) == 1) + if (X86_REX_X(insn->rex_prefix.value)) regno += 8; break; case REG_TYPE_BASE: regno = X86_SIB_BASE(insn->sib.value); - if (X86_REX_B(insn->rex_prefix.value) == 1) + if (X86_REX_B(insn->rex_prefix.value)) regno += 8; break; @@ -237,7 +237,8 @@ bad_opcode: */ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) { - const struct bndreg *bndregs, *bndreg; + const struct mpx_bndreg_state *bndregs; + const struct mpx_bndreg *bndreg; siginfo_t *info = NULL; struct insn insn; uint8_t bndregno; @@ -258,13 +259,13 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) goto err_out; } /* get bndregs field from current task's xsave area */ - bndregs = get_xsave_field_ptr(XSTATE_BNDREGS); + bndregs = get_xsave_field_ptr(XFEATURE_MASK_BNDREGS); if (!bndregs) { err = -EINVAL; goto err_out; } /* now go select the individual register in the set of 4 */ - bndreg = &bndregs[bndregno]; + bndreg = &bndregs->bndreg[bndregno]; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) { @@ -306,7 +307,7 @@ err_out: static __user void *mpx_get_bounds_dir(void) { - const struct bndcsr *bndcsr; + const struct mpx_bndcsr *bndcsr; if (!cpu_feature_enabled(X86_FEATURE_MPX)) return MPX_INVALID_BOUNDS_DIR; @@ -315,7 +316,7 @@ static __user void *mpx_get_bounds_dir(void) * The bounds directory pointer is stored in a register * only accessible if we first do an xsave. */ - bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); if (!bndcsr) return MPX_INVALID_BOUNDS_DIR; @@ -489,10 +490,10 @@ out_unmap: static int do_mpx_bt_fault(void) { unsigned long bd_entry, bd_base; - const struct bndcsr *bndcsr; + const struct mpx_bndcsr *bndcsr; struct mm_struct *mm = current->mm; - bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); if (!bndcsr) return -EINVAL; /* @@ -585,6 +586,29 @@ static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm, } /* + * We only want to do a 4-byte get_user() on 32-bit. Otherwise, + * we might run off the end of the bounds table if we are on + * a 64-bit kernel and try to get 8 bytes. + */ +int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret, + long __user *bd_entry_ptr) +{ + u32 bd_entry_32; + int ret; + + if (is_64bit_mm(mm)) + return get_user(*bd_entry_ret, bd_entry_ptr); + + /* + * Note that get_user() uses the type of the *pointer* to + * establish the size of the get, not the destination. + */ + ret = get_user(bd_entry_32, (u32 __user *)bd_entry_ptr); + *bd_entry_ret = bd_entry_32; + return ret; +} + +/* * Get the base of bounds tables pointed by specific bounds * directory entry. */ @@ -604,7 +628,7 @@ static int get_bt_addr(struct mm_struct *mm, int need_write = 0; pagefault_disable(); - ret = get_user(bd_entry, bd_entry_ptr); + ret = get_user_bd_entry(mm, &bd_entry, bd_entry_ptr); pagefault_enable(); if (!ret) break; @@ -699,11 +723,23 @@ static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm, */ static inline unsigned long bd_entry_virt_space(struct mm_struct *mm) { - unsigned long long virt_space = (1ULL << boot_cpu_data.x86_virt_bits); - if (is_64bit_mm(mm)) - return virt_space / MPX_BD_NR_ENTRIES_64; - else - return virt_space / MPX_BD_NR_ENTRIES_32; + unsigned long long virt_space; + unsigned long long GB = (1ULL << 30); + + /* + * This covers 32-bit emulation as well as 32-bit kernels + * running on 64-bit harware. + */ + if (!is_64bit_mm(mm)) + return (4ULL * GB) / MPX_BD_NR_ENTRIES_32; + + /* + * 'x86_virt_bits' returns what the hardware is capable + * of, and returns the full >32-bit adddress space when + * running 32-bit kernels on 64-bit hardware. + */ + virt_space = (1ULL << boot_cpu_data.x86_virt_bits); + return virt_space / MPX_BD_NR_ENTRIES_64; } /* diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index c3b3f653ed0c..d04f8094bc23 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -469,7 +469,7 @@ static void __init numa_clear_kernel_node_hotplug(void) { int i, nid; nodemask_t numa_kernel_nodes = NODE_MASK_NONE; - unsigned long start, end; + phys_addr_t start, end; struct memblock_region *r; /* diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2c44c0792301..2440814b0069 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -33,7 +33,7 @@ struct cpa_data { pgd_t *pgd; pgprot_t mask_set; pgprot_t mask_clr; - int numpages; + unsigned long numpages; int flags; unsigned long pfn; unsigned force_split : 1; @@ -66,6 +66,9 @@ void update_page_count(int level, unsigned long pages) static void split_page_count(int level) { + if (direct_pages_count[level] == 0) + return; + direct_pages_count[level]--; direct_pages_count[level - 1] += PTRS_PER_PTE; } @@ -129,14 +132,16 @@ within(unsigned long addr, unsigned long start, unsigned long end) */ void clflush_cache_range(void *vaddr, unsigned int size) { - unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1; + const unsigned long clflush_size = boot_cpu_data.x86_clflush_size; + void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); void *vend = vaddr + size; - void *p; + + if (p >= vend) + return; mb(); - for (p = (void *)((unsigned long)vaddr & ~clflush_mask); - p < vend; p += boot_cpu_data.x86_clflush_size) + for (; p < vend; p += clflush_size) clflushopt(p); mb(); @@ -414,18 +419,28 @@ pmd_t *lookup_pmd_address(unsigned long address) phys_addr_t slow_virt_to_phys(void *__virt_addr) { unsigned long virt_addr = (unsigned long)__virt_addr; - phys_addr_t phys_addr; - unsigned long offset; + unsigned long phys_addr, offset; enum pg_level level; - unsigned long pmask; pte_t *pte; pte = lookup_address(virt_addr, &level); BUG_ON(!pte); - pmask = page_level_mask(level); - offset = virt_addr & ~pmask; - phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; - return (phys_addr | offset); + + switch (level) { + case PG_LEVEL_1G: + phys_addr = pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; + offset = virt_addr & ~PUD_PAGE_MASK; + break; + case PG_LEVEL_2M: + phys_addr = pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; + offset = virt_addr & ~PMD_PAGE_MASK; + break; + default: + phys_addr = pte_pfn(*pte) << PAGE_SHIFT; + offset = virt_addr & ~PAGE_MASK; + } + + return (phys_addr_t)(phys_addr | offset); } EXPORT_SYMBOL_GPL(slow_virt_to_phys); @@ -458,7 +473,7 @@ static int try_preserve_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { - unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn; + unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn; pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot, req_prot; int i, do_split = 1; @@ -478,17 +493,21 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, switch (level) { case PG_LEVEL_2M: -#ifdef CONFIG_X86_64 + old_prot = pmd_pgprot(*(pmd_t *)kpte); + old_pfn = pmd_pfn(*(pmd_t *)kpte); + break; case PG_LEVEL_1G: -#endif - psize = page_level_size(level); - pmask = page_level_mask(level); + old_prot = pud_pgprot(*(pud_t *)kpte); + old_pfn = pud_pfn(*(pud_t *)kpte); break; default: do_split = -EINVAL; goto out_unlock; } + psize = page_level_size(level); + pmask = page_level_mask(level); + /* * Calculate the number of pages, which fit into this large * page starting at address: @@ -504,7 +523,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * up accordingly. */ old_pte = *kpte; - old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte)); + req_prot = pgprot_large_2_4k(old_prot); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); @@ -530,10 +549,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, req_prot = canon_pgprot(req_prot); /* - * old_pte points to the large page base address. So we need + * old_pfn points to the large page base pfn. So we need * to add the offset of the virtual address: */ - pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); + pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); cpa->pfn = pfn; new_prot = static_protections(req_prot, address, pfn); @@ -544,7 +563,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * the pages in the range we try to preserve: */ addr = address & pmask; - pfn = pte_pfn(old_pte); + pfn = old_pfn; for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { pgprot_t chk_prot = static_protections(req_prot, addr, pfn); @@ -574,7 +593,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * The address is aligned and the number of pages * covers the full page. */ - new_pte = pfn_pte(pte_pfn(old_pte), new_prot); + new_pte = pfn_pte(old_pfn, new_prot); __set_pmd_pte(kpte, address, new_pte); cpa->flags |= CPA_FLUSHTLB; do_split = 0; @@ -591,7 +610,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, struct page *base) { pte_t *pbase = (pte_t *)page_address(base); - unsigned long pfn, pfninc = 1; + unsigned long ref_pfn, pfn, pfninc = 1; unsigned int i, level; pte_t *tmp; pgprot_t ref_prot; @@ -608,26 +627,33 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, } paravirt_alloc_pte(&init_mm, page_to_pfn(base)); - ref_prot = pte_pgprot(pte_clrhuge(*kpte)); - /* promote PAT bit to correct position */ - if (level == PG_LEVEL_2M) + switch (level) { + case PG_LEVEL_2M: + ref_prot = pmd_pgprot(*(pmd_t *)kpte); + /* clear PSE and promote PAT bit to correct position */ ref_prot = pgprot_large_2_4k(ref_prot); + ref_pfn = pmd_pfn(*(pmd_t *)kpte); + break; -#ifdef CONFIG_X86_64 - if (level == PG_LEVEL_1G) { + case PG_LEVEL_1G: + ref_prot = pud_pgprot(*(pud_t *)kpte); + ref_pfn = pud_pfn(*(pud_t *)kpte); pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; + /* - * Set the PSE flags only if the PRESENT flag is set + * Clear the PSE flags if the PRESENT flag is not set * otherwise pmd_present/pmd_huge will return true * even on a non present pmd. */ - if (pgprot_val(ref_prot) & _PAGE_PRESENT) - pgprot_val(ref_prot) |= _PAGE_PSE; - else + if (!(pgprot_val(ref_prot) & _PAGE_PRESENT)) pgprot_val(ref_prot) &= ~_PAGE_PSE; + break; + + default: + spin_unlock(&pgd_lock); + return 1; } -#endif /* * Set the GLOBAL flags only if the PRESENT flag is set @@ -643,13 +669,16 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, /* * Get the target pfn from the original entry: */ - pfn = pte_pfn(*kpte); + pfn = ref_pfn; for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); - if (pfn_range_is_mapped(PFN_DOWN(__pa(address)), - PFN_DOWN(__pa(address)) + 1)) - split_page_count(level); + if (virt_addr_valid(address)) { + unsigned long pfn = PFN_DOWN(__pa(address)); + + if (pfn_range_is_mapped(pfn, pfn + 1)) + split_page_count(level); + } /* * Install the new, split up pagetable. @@ -1321,7 +1350,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) * CPA operation. Either a large page has been * preserved or a single page update happened. */ - BUG_ON(cpa->numpages > numpages); + BUG_ON(cpa->numpages > numpages || !cpa->numpages); numpages -= cpa->numpages; if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) cpa->curpage++; diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 188e3e07eeeb..f4ae536b0914 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -12,6 +12,7 @@ #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/pfn_t.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/fs.h> @@ -586,7 +587,7 @@ int free_memtype(u64 start, u64 end) entry = rbt_memtype_erase(start, end); spin_unlock(&memtype_lock); - if (!entry) { + if (IS_ERR(entry)) { pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, start, end - 1); return -EINVAL; @@ -949,7 +950,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, } int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn) + pfn_t pfn) { enum page_cache_mode pcm; @@ -957,7 +958,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, return 0; /* Set prot based on lookup */ - pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); + pcm = lookup_memtype(pfn_t_to_phys(pfn)); *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); @@ -992,6 +993,16 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, vma->vm_flags &= ~VM_PAT; } +/* + * untrack_pfn_moved is called, while mremapping a pfnmap for a new region, + * with the old vma after its pfnmap page table has been removed. The new + * vma has a new pfnmap to the same pfn & cache type with VM_PAT set. + */ +void untrack_pfn_moved(struct vm_area_struct *vma) +{ + vma->vm_flags &= ~VM_PAT; +} + pgprot_t pgprot_writecombine(pgprot_t prot) { return __pgprot(pgprot_val(prot) | diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 63931080366a..2f7702253ccf 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -98,8 +98,13 @@ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, return last_lower; /* Returns NULL if there is no overlap */ } -static struct memtype *memtype_rb_exact_match(struct rb_root *root, - u64 start, u64 end) +enum { + MEMTYPE_EXACT_MATCH = 0, + MEMTYPE_END_MATCH = 1 +}; + +static struct memtype *memtype_rb_match(struct rb_root *root, + u64 start, u64 end, int match_type) { struct memtype *match; @@ -107,7 +112,12 @@ static struct memtype *memtype_rb_exact_match(struct rb_root *root, while (match != NULL && match->start < end) { struct rb_node *node; - if (match->start == start && match->end == end) + if ((match_type == MEMTYPE_EXACT_MATCH) && + (match->start == start) && (match->end == end)) + return match; + + if ((match_type == MEMTYPE_END_MATCH) && + (match->start < start) && (match->end == end)) return match; node = rb_next(&match->rb); @@ -117,7 +127,7 @@ static struct memtype *memtype_rb_exact_match(struct rb_root *root, match = NULL; } - return NULL; /* Returns NULL if there is no exact match */ + return NULL; /* Returns NULL if there is no match */ } static int memtype_rb_check_conflict(struct rb_root *root, @@ -210,12 +220,36 @@ struct memtype *rbt_memtype_erase(u64 start, u64 end) { struct memtype *data; - data = memtype_rb_exact_match(&memtype_rbroot, start, end); - if (!data) - goto out; + /* + * Since the memtype_rbroot tree allows overlapping ranges, + * rbt_memtype_erase() checks with EXACT_MATCH first, i.e. free + * a whole node for the munmap case. If no such entry is found, + * it then checks with END_MATCH, i.e. shrink the size of a node + * from the end for the mremap case. + */ + data = memtype_rb_match(&memtype_rbroot, start, end, + MEMTYPE_EXACT_MATCH); + if (!data) { + data = memtype_rb_match(&memtype_rbroot, start, end, + MEMTYPE_END_MATCH); + if (!data) + return ERR_PTR(-EINVAL); + } + + if (data->start == start) { + /* munmap: erase this node */ + rb_erase_augmented(&data->rb, &memtype_rbroot, + &memtype_rb_augment_cb); + } else { + /* mremap: update the end value of this node */ + rb_erase_augmented(&data->rb, &memtype_rbroot, + &memtype_rb_augment_cb); + data->end = start; + data->subtree_max_end = data->end; + memtype_rb_insert(&memtype_rbroot, data); + return NULL; + } - rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb); -out: return data; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index fb0a9dd1d6e4..4eb287e25043 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -414,7 +414,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, if (changed && dirty) { *ptep = entry; - pte_update_defer(vma->vm_mm, address, ptep); + pte_update(vma->vm_mm, address, ptep); } return changed; @@ -431,7 +431,6 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, if (changed && dirty) { *pmdp = entry; - pmd_update_defer(vma->vm_mm, address, pmdp); /* * We had a write-protection fault here and changed the pmd * to to more permissive. No need to flush the TLB for that, @@ -469,9 +468,6 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmdp); - if (ret) - pmd_update(vma->vm_mm, addr, pmdp); - return ret; } #endif @@ -509,20 +505,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, return young; } - -void pmdp_splitting_flush(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - int set; - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - set = !test_and_set_bit(_PAGE_BIT_SPLITTING, - (unsigned long *)pmdp); - if (set) { - pmd_update(vma->vm_mm, address, pmdp); - /* need tlb flush only to serialize against gup-fast */ - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); - } -} #endif /** diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 90555bf60aa4..92e2eacb3321 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -31,7 +31,7 @@ early_param("noexec", noexec_setup); void x86_configure_nx(void) { - if (cpu_has_nx && !disable_nx) + if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx) __supported_pte_mask |= _PAGE_NX; else __supported_pte_mask &= ~_PAGE_NX; @@ -39,7 +39,7 @@ void x86_configure_nx(void) void __init x86_report_nx(void) { - if (!cpu_has_nx) { + if (!boot_cpu_has(X86_FEATURE_NX)) { printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " "missing in CPU!\n"); } else { diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index c2aea63bee20..b5f821881465 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -203,6 +203,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n", (unsigned long long)start, (unsigned long long)end - 1); + max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1)); + return 0; out_err_bad_srat: bad_srat(); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 8ddb5d0d66fb..8f4cc3dfac32 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -161,7 +161,10 @@ void flush_tlb_current_task(void) preempt_disable(); count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + + /* This is an implicit full barrier that synchronizes with switch_mm. */ local_flush_tlb(); + trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); @@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long base_pages_to_flush = TLB_FLUSH_ALL; preempt_disable(); - if (current->active_mm != mm) + if (current->active_mm != mm) { + /* Synchronize with switch_mm. */ + smp_mb(); + goto out; + } if (!current->mm) { leave_mm(smp_processor_id()); + + /* Synchronize with switch_mm. */ + smp_mb(); + goto out; } if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) base_pages_to_flush = (end - start) >> PAGE_SHIFT; + /* + * Both branches below are implicit full barriers (MOV to CR or + * INVLPG) that synchronize with switch_mm. + */ if (base_pages_to_flush > tlb_single_page_flush_ceiling) { base_pages_to_flush = TLB_FLUSH_ALL; count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); @@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) preempt_disable(); if (current->active_mm == mm) { - if (current->mm) + if (current->mm) { + /* + * Implicit full barrier (INVLPG) that synchronizes + * with switch_mm. + */ __flush_tlb_one(start); - else + } else { leave_mm(smp_processor_id()); + + /* Synchronize with switch_mm. */ + smp_mb(); + } } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 70efcd0940f9..4286f3618bd0 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -193,7 +193,7 @@ struct jit_context { 32 /* space for rbx, r13, r14, r15 */ + \ 8 /* space for skb_copy_bits() buffer */) -#define PROLOGUE_SIZE 51 +#define PROLOGUE_SIZE 48 /* emit x64 prologue code for BPF program and check it's size. * bpf_tail_call helper will skip it while jumping into another program @@ -229,11 +229,15 @@ static void emit_prologue(u8 **pprog) /* mov qword ptr [rbp-X],r15 */ EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24); - /* clear A and X registers */ - EMIT2(0x31, 0xc0); /* xor eax, eax */ - EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */ + /* Clear the tail call counter (tail_call_cnt): for eBPF tail calls + * we need to reset the counter to 0. It's done in two instructions, + * resetting rax register to 0 (xor on eax gets 0 extended), and + * moving it to the counter location. + */ - /* clear tail_cnt: mov qword ptr [rbp-X], rax */ + /* xor eax, eax */ + EMIT2(0x31, 0xc0); + /* mov qword ptr [rbp-X], rax */ EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32); BUILD_BUG_ON(cnt != PROLOGUE_SIZE); @@ -455,6 +459,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, } case BPF_ALU | BPF_MOV | BPF_K: + /* optimization: if imm32 is zero, use 'xor <dst>,<dst>' + * to save 3 bytes. + */ + if (imm32 == 0) { + if (is_ereg(dst_reg)) + EMIT1(add_2mod(0x40, dst_reg, dst_reg)); + b2 = 0x31; /* xor */ + b3 = 0xC0; + EMIT2(b2, add_2reg(b3, dst_reg, dst_reg)); + break; + } + /* mov %eax, imm32 */ if (is_ereg(dst_reg)) EMIT1(add_1mod(0x40, dst_reg)); @@ -469,6 +485,20 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, return -EINVAL; } + /* optimization: if imm64 is zero, use 'xor <dst>,<dst>' + * to save 7 bytes. + */ + if (insn[0].imm == 0 && insn[1].imm == 0) { + b1 = add_2mod(0x48, dst_reg, dst_reg); + b2 = 0x31; /* xor */ + b3 = 0xC0; + EMIT3(b1, b2, add_2reg(b3, dst_reg, dst_reg)); + + insn++; + i++; + break; + } + /* movabsq %rax, imm64 */ EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); EMIT(insn[0].imm, 4); @@ -1109,7 +1139,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog) bpf_flush_icache(header, image + proglen); set_memory_ro((unsigned long)header, header->pages); prog->bpf_func = (void *)image; - prog->jited = true; + prog->jited = 1; } out: kfree(addrs); diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 5c6fc3577a49..97062a635b77 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -23,6 +23,8 @@ obj-y += bus_numa.o obj-$(CONFIG_AMD_NB) += amd_bus.o obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o +obj-$(CONFIG_VMD) += vmd.o + ifeq ($(CONFIG_PCI_DEBUG),y) EXTRA_CFLAGS += -DDEBUG endif diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index ff9911707160..3cd69832d7f4 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -4,16 +4,15 @@ #include <linux/irq.h> #include <linux/dmi.h> #include <linux/slab.h> +#include <linux/pci-acpi.h> #include <asm/numa.h> #include <asm/pci_x86.h> struct pci_root_info { - struct acpi_device *bridge; - char name[16]; + struct acpi_pci_root_info common; struct pci_sysdata sd; #ifdef CONFIG_PCI_MMCONFIG bool mcfg_added; - u16 segment; u8 start_bus; u8 end_bus; #endif @@ -178,15 +177,18 @@ static int check_segment(u16 seg, struct device *dev, char *estr) return 0; } -static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start, - u8 end, phys_addr_t addr) +static int setup_mcfg_map(struct acpi_pci_root_info *ci) { - int result; - struct device *dev = &info->bridge->dev; + int result, seg; + struct pci_root_info *info; + struct acpi_pci_root *root = ci->root; + struct device *dev = &ci->bridge->dev; - info->start_bus = start; - info->end_bus = end; + info = container_of(ci, struct pci_root_info, common); + info->start_bus = (u8)root->secondary.start; + info->end_bus = (u8)root->secondary.end; info->mcfg_added = false; + seg = info->sd.domain; /* return success if MMCFG is not in use */ if (raw_pci_ext_ops && raw_pci_ext_ops != &pci_mmcfg) @@ -195,7 +197,8 @@ static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start, if (!(pci_probe & PCI_PROBE_MMCONF)) return check_segment(seg, dev, "MMCONFIG is disabled,"); - result = pci_mmconfig_insert(dev, seg, start, end, addr); + result = pci_mmconfig_insert(dev, seg, info->start_bus, info->end_bus, + root->mcfg_addr); if (result == 0) { /* enable MMCFG if it hasn't been enabled yet */ if (raw_pci_ext_ops == NULL) @@ -208,134 +211,55 @@ static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start, return 0; } -static void teardown_mcfg_map(struct pci_root_info *info) +static void teardown_mcfg_map(struct acpi_pci_root_info *ci) { + struct pci_root_info *info; + + info = container_of(ci, struct pci_root_info, common); if (info->mcfg_added) { - pci_mmconfig_delete(info->segment, info->start_bus, - info->end_bus); + pci_mmconfig_delete(info->sd.domain, + info->start_bus, info->end_bus); info->mcfg_added = false; } } #else -static int setup_mcfg_map(struct pci_root_info *info, - u16 seg, u8 start, u8 end, - phys_addr_t addr) +static int setup_mcfg_map(struct acpi_pci_root_info *ci) { return 0; } -static void teardown_mcfg_map(struct pci_root_info *info) + +static void teardown_mcfg_map(struct acpi_pci_root_info *ci) { } #endif -static void validate_resources(struct device *dev, struct list_head *crs_res, - unsigned long type) +static int pci_acpi_root_get_node(struct acpi_pci_root *root) { - LIST_HEAD(list); - struct resource *res1, *res2, *root = NULL; - struct resource_entry *tmp, *entry, *entry2; - - BUG_ON((type & (IORESOURCE_MEM | IORESOURCE_IO)) == 0); - root = (type & IORESOURCE_MEM) ? &iomem_resource : &ioport_resource; - - list_splice_init(crs_res, &list); - resource_list_for_each_entry_safe(entry, tmp, &list) { - bool free = false; - resource_size_t end; - - res1 = entry->res; - if (!(res1->flags & type)) - goto next; - - /* Exclude non-addressable range or non-addressable portion */ - end = min(res1->end, root->end); - if (end <= res1->start) { - dev_info(dev, "host bridge window %pR (ignored, not CPU addressable)\n", - res1); - free = true; - goto next; - } else if (res1->end != end) { - dev_info(dev, "host bridge window %pR ([%#llx-%#llx] ignored, not CPU addressable)\n", - res1, (unsigned long long)end + 1, - (unsigned long long)res1->end); - res1->end = end; - } - - resource_list_for_each_entry(entry2, crs_res) { - res2 = entry2->res; - if (!(res2->flags & type)) - continue; - - /* - * I don't like throwing away windows because then - * our resources no longer match the ACPI _CRS, but - * the kernel resource tree doesn't allow overlaps. - */ - if (resource_overlaps(res1, res2)) { - res2->start = min(res1->start, res2->start); - res2->end = max(res1->end, res2->end); - dev_info(dev, "host bridge window expanded to %pR; %pR ignored\n", - res2, res1); - free = true; - goto next; - } - } + int busnum = root->secondary.start; + struct acpi_device *device = root->device; + int node = acpi_get_node(device->handle); -next: - resource_list_del(entry); - if (free) - resource_list_free_entry(entry); - else - resource_list_add_tail(entry, crs_res); + if (node == NUMA_NO_NODE) { + node = x86_pci_root_bus_node(busnum); + if (node != 0 && node != NUMA_NO_NODE) + dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n", + node); } + if (node != NUMA_NO_NODE && !node_online(node)) + node = NUMA_NO_NODE; + + return node; } -static void add_resources(struct pci_root_info *info, - struct list_head *resources, - struct list_head *crs_res) +static int pci_acpi_root_init_info(struct acpi_pci_root_info *ci) { - struct resource_entry *entry, *tmp; - struct resource *res, *conflict, *root = NULL; - - validate_resources(&info->bridge->dev, crs_res, IORESOURCE_MEM); - validate_resources(&info->bridge->dev, crs_res, IORESOURCE_IO); - - resource_list_for_each_entry_safe(entry, tmp, crs_res) { - res = entry->res; - if (res->flags & IORESOURCE_MEM) - root = &iomem_resource; - else if (res->flags & IORESOURCE_IO) - root = &ioport_resource; - else - BUG_ON(res); - - conflict = insert_resource_conflict(root, res); - if (conflict) { - dev_info(&info->bridge->dev, - "ignoring host bridge window %pR (conflicts with %s %pR)\n", - res, conflict->name, conflict); - resource_list_destroy_entry(entry); - } - } - - list_splice_tail(crs_res, resources); + return setup_mcfg_map(ci); } -static void release_pci_root_info(struct pci_host_bridge *bridge) +static void pci_acpi_root_release_info(struct acpi_pci_root_info *ci) { - struct resource *res; - struct resource_entry *entry; - struct pci_root_info *info = bridge->release_data; - - resource_list_for_each_entry(entry, &bridge->windows) { - res = entry->res; - if (res->parent && - (res->flags & (IORESOURCE_MEM | IORESOURCE_IO))) - release_resource(res); - } - - teardown_mcfg_map(info); - kfree(info); + teardown_mcfg_map(ci); + kfree(container_of(ci, struct pci_root_info, common)); } /* @@ -358,50 +282,47 @@ static bool resource_is_pcicfg_ioport(struct resource *res) res->start == 0xCF8 && res->end == 0xCFF; } -static void probe_pci_root_info(struct pci_root_info *info, - struct acpi_device *device, - int busnum, int domain, - struct list_head *list) +static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci) { - int ret; + struct acpi_device *device = ci->bridge; + int busnum = ci->root->secondary.start; struct resource_entry *entry, *tmp; + int status; - sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum); - info->bridge = device; - ret = acpi_dev_get_resources(device, list, - acpi_dev_filter_resource_type_cb, - (void *)(IORESOURCE_IO | IORESOURCE_MEM)); - if (ret < 0) - dev_warn(&device->dev, - "failed to parse _CRS method, error code %d\n", ret); - else if (ret == 0) - dev_dbg(&device->dev, - "no IO and memory resources present in _CRS\n"); - else - resource_list_for_each_entry_safe(entry, tmp, list) { - if ((entry->res->flags & IORESOURCE_DISABLED) || - resource_is_pcicfg_ioport(entry->res)) + status = acpi_pci_probe_root_resources(ci); + if (pci_use_crs) { + resource_list_for_each_entry_safe(entry, tmp, &ci->resources) + if (resource_is_pcicfg_ioport(entry->res)) resource_list_destroy_entry(entry); - else - entry->res->name = info->name; - } + return status; + } + + resource_list_for_each_entry_safe(entry, tmp, &ci->resources) { + dev_printk(KERN_DEBUG, &device->dev, + "host bridge window %pR (ignored)\n", entry->res); + resource_list_destroy_entry(entry); + } + x86_pci_root_bus_resources(busnum, &ci->resources); + + return 0; } +static struct acpi_pci_root_ops acpi_pci_root_ops = { + .pci_ops = &pci_root_ops, + .init_info = pci_acpi_root_init_info, + .release_info = pci_acpi_root_release_info, + .prepare_resources = pci_acpi_root_prepare_resources, +}; + struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) { - struct acpi_device *device = root->device; - struct pci_root_info *info; int domain = root->segment; int busnum = root->secondary.start; - struct resource_entry *res_entry; - LIST_HEAD(crs_res); - LIST_HEAD(resources); + int node = pci_acpi_root_get_node(root); struct pci_bus *bus; - struct pci_sysdata *sd; - int node; if (pci_ignore_seg) - domain = 0; + root->segment = domain = 0; if (domain && !pci_domains_supported) { printk(KERN_WARNING "pci_bus %04x:%02x: " @@ -410,71 +331,33 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) return NULL; } - node = acpi_get_node(device->handle); - if (node == NUMA_NO_NODE) { - node = x86_pci_root_bus_node(busnum); - if (node != 0 && node != NUMA_NO_NODE) - dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n", - node); - } - - if (node != NUMA_NO_NODE && !node_online(node)) - node = NUMA_NO_NODE; - - info = kzalloc_node(sizeof(*info), GFP_KERNEL, node); - if (!info) { - printk(KERN_WARNING "pci_bus %04x:%02x: " - "ignored (out of memory)\n", domain, busnum); - return NULL; - } - - sd = &info->sd; - sd->domain = domain; - sd->node = node; - sd->companion = device; - bus = pci_find_bus(domain, busnum); if (bus) { /* * If the desired bus has been scanned already, replace * its bus->sysdata. */ - memcpy(bus->sysdata, sd, sizeof(*sd)); - kfree(info); - } else { - /* insert busn res at first */ - pci_add_resource(&resources, &root->secondary); + struct pci_sysdata sd = { + .domain = domain, + .node = node, + .companion = root->device + }; - /* - * _CRS with no apertures is normal, so only fall back to - * defaults or native bridge info if we're ignoring _CRS. - */ - probe_pci_root_info(info, device, busnum, domain, &crs_res); - if (pci_use_crs) { - add_resources(info, &resources, &crs_res); - } else { - resource_list_for_each_entry(res_entry, &crs_res) - dev_printk(KERN_DEBUG, &device->dev, - "host bridge window %pR (ignored)\n", - res_entry->res); - resource_list_free(&crs_res); - x86_pci_root_bus_resources(busnum, &resources); - } - - if (!setup_mcfg_map(info, domain, (u8)root->secondary.start, - (u8)root->secondary.end, root->mcfg_addr)) - bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, - sd, &resources); - - if (bus) { - pci_scan_child_bus(bus); - pci_set_host_bridge_release( - to_pci_host_bridge(bus->bridge), - release_pci_root_info, info); - } else { - resource_list_free(&resources); - teardown_mcfg_map(info); - kfree(info); + memcpy(bus->sysdata, &sd, sizeof(sd)); + } else { + struct pci_root_info *info; + + info = kzalloc_node(sizeof(*info), GFP_KERNEL, node); + if (!info) + dev_err(&root->device->dev, + "pci_bus %04x:%02x: ignored (out of memory)\n", + domain, busnum); + else { + info->sd.domain = domain; + info->sd.node = node; + info->sd.companion = root->device; + bus = acpi_pci_root_create(root, &acpi_pci_root_ops, + &info->common, &info->sd); } } @@ -487,9 +370,6 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) pcie_bus_configure_settings(child); } - if (bus && node != NUMA_NO_NODE) - dev_printk(KERN_DEBUG, &bus->dev, "on NUMA node %d\n", node); - return bus; } diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index 7bcf06a7cd12..6eb3c8af96e2 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -50,18 +50,9 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources) if (!found) pci_add_resource(resources, &info->busn); - list_for_each_entry(root_res, &info->resources, list) { - struct resource *res; - struct resource *root; + list_for_each_entry(root_res, &info->resources, list) + pci_add_resource(resources, &root_res->res); - res = &root_res->res; - pci_add_resource(resources, res); - if (res->flags & IORESOURCE_IO) - root = &ioport_resource; - else - root = &iomem_resource; - insert_resource(root, res); - } return; default_resources: diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index dc78a4a9a466..2879efc73a96 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -641,6 +641,43 @@ unsigned int pcibios_assign_all_busses(void) return (pci_probe & PCI_ASSIGN_ALL_BUSSES) ? 1 : 0; } +#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS) +static LIST_HEAD(dma_domain_list); +static DEFINE_SPINLOCK(dma_domain_list_lock); + +void add_dma_domain(struct dma_domain *domain) +{ + spin_lock(&dma_domain_list_lock); + list_add(&domain->node, &dma_domain_list); + spin_unlock(&dma_domain_list_lock); +} +EXPORT_SYMBOL_GPL(add_dma_domain); + +void del_dma_domain(struct dma_domain *domain) +{ + spin_lock(&dma_domain_list_lock); + list_del(&domain->node); + spin_unlock(&dma_domain_list_lock); +} +EXPORT_SYMBOL_GPL(del_dma_domain); + +static void set_dma_domain_ops(struct pci_dev *pdev) +{ + struct dma_domain *domain; + + spin_lock(&dma_domain_list_lock); + list_for_each_entry(domain, &dma_domain_list, node) { + if (pci_domain_nr(pdev->bus) == domain->domain_nr) { + pdev->dev.archdata.dma_ops = domain->dma_ops; + break; + } + } + spin_unlock(&dma_domain_list_lock); +} +#else +static void set_dma_domain_ops(struct pci_dev *pdev) {} +#endif + int pcibios_add_device(struct pci_dev *dev) { struct setup_data *data; @@ -670,11 +707,20 @@ int pcibios_add_device(struct pci_dev *dev) pa_data = data->next; iounmap(data); } + set_dma_domain_ops(dev); return 0; } int pcibios_alloc_irq(struct pci_dev *dev) { + /* + * If the PCI device was already claimed by core code and has + * MSI enabled, probing of the pcibios IRQ will overwrite + * dev->irq. So bail out if MSI is already enabled. + */ + if (pci_dev_msi_enabled(dev)) + return -EBUSY; + return pcibios_enable_irq(dev); } diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 5b662c0faf8c..ea6f3802c17b 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -54,7 +54,7 @@ void pcibios_scan_specific_bus(int busn) } EXPORT_SYMBOL_GPL(pcibios_scan_specific_bus); -int __init pci_subsys_init(void) +static int __init pci_subsys_init(void) { /* * The init function returns an non zero value when diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 9b83b9051ae7..9770e55e768f 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -180,6 +180,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus, unsigned long result = 0; unsigned long flags; unsigned long bx = (bus << 8) | devfn; + u16 number = 0, mask = 0; WARN_ON(seg); if (!value || (bus > 255) || (devfn > 255) || (reg > 255)) @@ -189,53 +190,35 @@ static int pci_bios_read(unsigned int seg, unsigned int bus, switch (len) { case 1: - __asm__("lcall *(%%esi); cld\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" - : "=c" (*value), - "=a" (result) - : "1" (PCIBIOS_READ_CONFIG_BYTE), - "b" (bx), - "D" ((long)reg), - "S" (&pci_indirect)); - /* - * Zero-extend the result beyond 8 bits, do not trust the - * BIOS having done it: - */ - *value &= 0xff; + number = PCIBIOS_READ_CONFIG_BYTE; + mask = 0xff; break; case 2: - __asm__("lcall *(%%esi); cld\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" - : "=c" (*value), - "=a" (result) - : "1" (PCIBIOS_READ_CONFIG_WORD), - "b" (bx), - "D" ((long)reg), - "S" (&pci_indirect)); - /* - * Zero-extend the result beyond 16 bits, do not trust the - * BIOS having done it: - */ - *value &= 0xffff; + number = PCIBIOS_READ_CONFIG_WORD; + mask = 0xffff; break; case 4: - __asm__("lcall *(%%esi); cld\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" - : "=c" (*value), - "=a" (result) - : "1" (PCIBIOS_READ_CONFIG_DWORD), - "b" (bx), - "D" ((long)reg), - "S" (&pci_indirect)); + number = PCIBIOS_READ_CONFIG_DWORD; break; } + __asm__("lcall *(%%esi); cld\n\t" + "jc 1f\n\t" + "xor %%ah, %%ah\n" + "1:" + : "=c" (*value), + "=a" (result) + : "1" (number), + "b" (bx), + "D" ((long)reg), + "S" (&pci_indirect)); + /* + * Zero-extend the result beyond 8 or 16 bits, do not trust the + * BIOS having done it: + */ + if (mask) + *value &= mask; + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return (int)((result & 0xff00) >> 8); @@ -247,6 +230,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus, unsigned long result = 0; unsigned long flags; unsigned long bx = (bus << 8) | devfn; + u16 number = 0; WARN_ON(seg); if ((bus > 255) || (devfn > 255) || (reg > 255)) @@ -256,43 +240,27 @@ static int pci_bios_write(unsigned int seg, unsigned int bus, switch (len) { case 1: - __asm__("lcall *(%%esi); cld\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" - : "=a" (result) - : "0" (PCIBIOS_WRITE_CONFIG_BYTE), - "c" (value), - "b" (bx), - "D" ((long)reg), - "S" (&pci_indirect)); + number = PCIBIOS_WRITE_CONFIG_BYTE; break; case 2: - __asm__("lcall *(%%esi); cld\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" - : "=a" (result) - : "0" (PCIBIOS_WRITE_CONFIG_WORD), - "c" (value), - "b" (bx), - "D" ((long)reg), - "S" (&pci_indirect)); + number = PCIBIOS_WRITE_CONFIG_WORD; break; case 4: - __asm__("lcall *(%%esi); cld\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" - : "=a" (result) - : "0" (PCIBIOS_WRITE_CONFIG_DWORD), - "c" (value), - "b" (bx), - "D" ((long)reg), - "S" (&pci_indirect)); + number = PCIBIOS_WRITE_CONFIG_DWORD; break; } + __asm__("lcall *(%%esi); cld\n\t" + "jc 1f\n\t" + "xor %%ah, %%ah\n" + "1:" + : "=a" (result) + : "0" (number), + "c" (value), + "b" (bx), + "D" ((long)reg), + "S" (&pci_indirect)); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return (int)((result & 0xff00) >> 8); diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c new file mode 100644 index 000000000000..d57e48016f15 --- /dev/null +++ b/arch/x86/pci/vmd.c @@ -0,0 +1,723 @@ +/* + * Volume Management Device driver + * Copyright (c) 2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/device.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/msi.h> +#include <linux/pci.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> + +#include <asm/irqdomain.h> +#include <asm/device.h> +#include <asm/msi.h> +#include <asm/msidef.h> + +#define VMD_CFGBAR 0 +#define VMD_MEMBAR1 2 +#define VMD_MEMBAR2 4 + +/* + * Lock for manipulating VMD IRQ lists. + */ +static DEFINE_RAW_SPINLOCK(list_lock); + +/** + * struct vmd_irq - private data to map driver IRQ to the VMD shared vector + * @node: list item for parent traversal. + * @rcu: RCU callback item for freeing. + * @irq: back pointer to parent. + * @virq: the virtual IRQ value provided to the requesting driver. + * + * Every MSI/MSI-X IRQ requested for a device in a VMD domain will be mapped to + * a VMD IRQ using this structure. + */ +struct vmd_irq { + struct list_head node; + struct rcu_head rcu; + struct vmd_irq_list *irq; + unsigned int virq; +}; + +/** + * struct vmd_irq_list - list of driver requested IRQs mapping to a VMD vector + * @irq_list: the list of irq's the VMD one demuxes to. + * @vmd_vector: the h/w IRQ assigned to the VMD. + * @index: index into the VMD MSI-X table; used for message routing. + * @count: number of child IRQs assigned to this vector; used to track + * sharing. + */ +struct vmd_irq_list { + struct list_head irq_list; + struct vmd_dev *vmd; + unsigned int vmd_vector; + unsigned int index; + unsigned int count; +}; + +struct vmd_dev { + struct pci_dev *dev; + + spinlock_t cfg_lock; + char __iomem *cfgbar; + + int msix_count; + struct msix_entry *msix_entries; + struct vmd_irq_list *irqs; + + struct pci_sysdata sysdata; + struct resource resources[3]; + struct irq_domain *irq_domain; + struct pci_bus *bus; + +#ifdef CONFIG_X86_DEV_DMA_OPS + struct dma_map_ops dma_ops; + struct dma_domain dma_domain; +#endif +}; + +static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus) +{ + return container_of(bus->sysdata, struct vmd_dev, sysdata); +} + +/* + * Drivers managing a device in a VMD domain allocate their own IRQs as before, + * but the MSI entry for the hardware it's driving will be programmed with a + * destination ID for the VMD MSI-X table. The VMD muxes interrupts in its + * domain into one of its own, and the VMD driver de-muxes these for the + * handlers sharing that VMD IRQ. The vmd irq_domain provides the operations + * and irq_chip to set this up. + */ +static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct vmd_irq *vmdirq = data->chip_data; + struct vmd_irq_list *irq = vmdirq->irq; + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_DEST_ID(irq->index); + msg->data = 0; +} + +/* + * We rely on MSI_FLAG_USE_DEF_CHIP_OPS to set the IRQ mask/unmask ops. + */ +static void vmd_irq_enable(struct irq_data *data) +{ + struct vmd_irq *vmdirq = data->chip_data; + + raw_spin_lock(&list_lock); + list_add_tail_rcu(&vmdirq->node, &vmdirq->irq->irq_list); + raw_spin_unlock(&list_lock); + + data->chip->irq_unmask(data); +} + +static void vmd_irq_disable(struct irq_data *data) +{ + struct vmd_irq *vmdirq = data->chip_data; + + data->chip->irq_mask(data); + + raw_spin_lock(&list_lock); + list_del_rcu(&vmdirq->node); + raw_spin_unlock(&list_lock); +} + +/* + * XXX: Stubbed until we develop acceptable way to not create conflicts with + * other devices sharing the same vector. + */ +static int vmd_irq_set_affinity(struct irq_data *data, + const struct cpumask *dest, bool force) +{ + return -EINVAL; +} + +static struct irq_chip vmd_msi_controller = { + .name = "VMD-MSI", + .irq_enable = vmd_irq_enable, + .irq_disable = vmd_irq_disable, + .irq_compose_msi_msg = vmd_compose_msi_msg, + .irq_set_affinity = vmd_irq_set_affinity, +}; + +static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) +{ + return 0; +} + +/* + * XXX: We can be even smarter selecting the best IRQ once we solve the + * affinity problem. + */ +static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd) +{ + int i, best = 0; + + raw_spin_lock(&list_lock); + for (i = 1; i < vmd->msix_count; i++) + if (vmd->irqs[i].count < vmd->irqs[best].count) + best = i; + vmd->irqs[best].count++; + raw_spin_unlock(&list_lock); + + return &vmd->irqs[best]; +} + +static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info, + unsigned int virq, irq_hw_number_t hwirq, + msi_alloc_info_t *arg) +{ + struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(arg->desc)->bus); + struct vmd_irq *vmdirq = kzalloc(sizeof(*vmdirq), GFP_KERNEL); + + if (!vmdirq) + return -ENOMEM; + + INIT_LIST_HEAD(&vmdirq->node); + vmdirq->irq = vmd_next_irq(vmd); + vmdirq->virq = virq; + + irq_domain_set_info(domain, virq, vmdirq->irq->vmd_vector, info->chip, + vmdirq, handle_simple_irq, vmd, NULL); + return 0; +} + +static void vmd_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq) +{ + struct vmd_irq *vmdirq = irq_get_chip_data(virq); + + /* XXX: Potential optimization to rebalance */ + raw_spin_lock(&list_lock); + vmdirq->irq->count--; + raw_spin_unlock(&list_lock); + + kfree_rcu(vmdirq, rcu); +} + +static int vmd_msi_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct vmd_dev *vmd = vmd_from_bus(pdev->bus); + + if (nvec > vmd->msix_count) + return vmd->msix_count; + + memset(arg, 0, sizeof(*arg)); + return 0; +} + +static void vmd_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) +{ + arg->desc = desc; +} + +static struct msi_domain_ops vmd_msi_domain_ops = { + .get_hwirq = vmd_get_hwirq, + .msi_init = vmd_msi_init, + .msi_free = vmd_msi_free, + .msi_prepare = vmd_msi_prepare, + .set_desc = vmd_set_desc, +}; + +static struct msi_domain_info vmd_msi_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSIX, + .ops = &vmd_msi_domain_ops, + .chip = &vmd_msi_controller, +}; + +#ifdef CONFIG_X86_DEV_DMA_OPS +/* + * VMD replaces the requester ID with its own. DMA mappings for devices in a + * VMD domain need to be mapped for the VMD, not the device requiring + * the mapping. + */ +static struct device *to_vmd_dev(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct vmd_dev *vmd = vmd_from_bus(pdev->bus); + + return &vmd->dev->dev; +} + +static struct dma_map_ops *vmd_dma_ops(struct device *dev) +{ + return to_vmd_dev(dev)->archdata.dma_ops; +} + +static void *vmd_alloc(struct device *dev, size_t size, dma_addr_t *addr, + gfp_t flag, struct dma_attrs *attrs) +{ + return vmd_dma_ops(dev)->alloc(to_vmd_dev(dev), size, addr, flag, + attrs); +} + +static void vmd_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t addr, struct dma_attrs *attrs) +{ + return vmd_dma_ops(dev)->free(to_vmd_dev(dev), size, vaddr, addr, + attrs); +} + +static int vmd_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t addr, size_t size, + struct dma_attrs *attrs) +{ + return vmd_dma_ops(dev)->mmap(to_vmd_dev(dev), vma, cpu_addr, addr, + size, attrs); +} + +static int vmd_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t addr, size_t size, + struct dma_attrs *attrs) +{ + return vmd_dma_ops(dev)->get_sgtable(to_vmd_dev(dev), sgt, cpu_addr, + addr, size, attrs); +} + +static dma_addr_t vmd_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + return vmd_dma_ops(dev)->map_page(to_vmd_dev(dev), page, offset, size, + dir, attrs); +} + +static void vmd_unmap_page(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + vmd_dma_ops(dev)->unmap_page(to_vmd_dev(dev), addr, size, dir, attrs); +} + +static int vmd_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + return vmd_dma_ops(dev)->map_sg(to_vmd_dev(dev), sg, nents, dir, attrs); +} + +static void vmd_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + vmd_dma_ops(dev)->unmap_sg(to_vmd_dev(dev), sg, nents, dir, attrs); +} + +static void vmd_sync_single_for_cpu(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) +{ + vmd_dma_ops(dev)->sync_single_for_cpu(to_vmd_dev(dev), addr, size, dir); +} + +static void vmd_sync_single_for_device(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) +{ + vmd_dma_ops(dev)->sync_single_for_device(to_vmd_dev(dev), addr, size, + dir); +} + +static void vmd_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir) +{ + vmd_dma_ops(dev)->sync_sg_for_cpu(to_vmd_dev(dev), sg, nents, dir); +} + +static void vmd_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir) +{ + vmd_dma_ops(dev)->sync_sg_for_device(to_vmd_dev(dev), sg, nents, dir); +} + +static int vmd_mapping_error(struct device *dev, dma_addr_t addr) +{ + return vmd_dma_ops(dev)->mapping_error(to_vmd_dev(dev), addr); +} + +static int vmd_dma_supported(struct device *dev, u64 mask) +{ + return vmd_dma_ops(dev)->dma_supported(to_vmd_dev(dev), mask); +} + +#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK +static u64 vmd_get_required_mask(struct device *dev) +{ + return vmd_dma_ops(dev)->get_required_mask(to_vmd_dev(dev)); +} +#endif + +static void vmd_teardown_dma_ops(struct vmd_dev *vmd) +{ + struct dma_domain *domain = &vmd->dma_domain; + + if (vmd->dev->dev.archdata.dma_ops) + del_dma_domain(domain); +} + +#define ASSIGN_VMD_DMA_OPS(source, dest, fn) \ + do { \ + if (source->fn) \ + dest->fn = vmd_##fn; \ + } while (0) + +static void vmd_setup_dma_ops(struct vmd_dev *vmd) +{ + const struct dma_map_ops *source = vmd->dev->dev.archdata.dma_ops; + struct dma_map_ops *dest = &vmd->dma_ops; + struct dma_domain *domain = &vmd->dma_domain; + + domain->domain_nr = vmd->sysdata.domain; + domain->dma_ops = dest; + + if (!source) + return; + ASSIGN_VMD_DMA_OPS(source, dest, alloc); + ASSIGN_VMD_DMA_OPS(source, dest, free); + ASSIGN_VMD_DMA_OPS(source, dest, mmap); + ASSIGN_VMD_DMA_OPS(source, dest, get_sgtable); + ASSIGN_VMD_DMA_OPS(source, dest, map_page); + ASSIGN_VMD_DMA_OPS(source, dest, unmap_page); + ASSIGN_VMD_DMA_OPS(source, dest, map_sg); + ASSIGN_VMD_DMA_OPS(source, dest, unmap_sg); + ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_cpu); + ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_device); + ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_cpu); + ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_device); + ASSIGN_VMD_DMA_OPS(source, dest, mapping_error); + ASSIGN_VMD_DMA_OPS(source, dest, dma_supported); +#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK + ASSIGN_VMD_DMA_OPS(source, dest, get_required_mask); +#endif + add_dma_domain(domain); +} +#undef ASSIGN_VMD_DMA_OPS +#else +static void vmd_teardown_dma_ops(struct vmd_dev *vmd) {} +static void vmd_setup_dma_ops(struct vmd_dev *vmd) {} +#endif + +static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus, + unsigned int devfn, int reg, int len) +{ + char __iomem *addr = vmd->cfgbar + + (bus->number << 20) + (devfn << 12) + reg; + + if ((addr - vmd->cfgbar) + len >= + resource_size(&vmd->dev->resource[VMD_CFGBAR])) + return NULL; + + return addr; +} + +/* + * CPU may deadlock if config space is not serialized on some versions of this + * hardware, so all config space access is done under a spinlock. + */ +static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg, + int len, u32 *value) +{ + struct vmd_dev *vmd = vmd_from_bus(bus); + char __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len); + unsigned long flags; + int ret = 0; + + if (!addr) + return -EFAULT; + + spin_lock_irqsave(&vmd->cfg_lock, flags); + switch (len) { + case 1: + *value = readb(addr); + break; + case 2: + *value = readw(addr); + break; + case 4: + *value = readl(addr); + break; + default: + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&vmd->cfg_lock, flags); + return ret; +} + +/* + * VMD h/w converts non-posted config writes to posted memory writes. The + * read-back in this function forces the completion so it returns only after + * the config space was written, as expected. + */ +static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg, + int len, u32 value) +{ + struct vmd_dev *vmd = vmd_from_bus(bus); + char __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len); + unsigned long flags; + int ret = 0; + + if (!addr) + return -EFAULT; + + spin_lock_irqsave(&vmd->cfg_lock, flags); + switch (len) { + case 1: + writeb(value, addr); + readb(addr); + break; + case 2: + writew(value, addr); + readw(addr); + break; + case 4: + writel(value, addr); + readl(addr); + break; + default: + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&vmd->cfg_lock, flags); + return ret; +} + +static struct pci_ops vmd_ops = { + .read = vmd_pci_read, + .write = vmd_pci_write, +}; + +/* + * VMD domains start at 0x1000 to not clash with ACPI _SEG domains. + */ +static int vmd_find_free_domain(void) +{ + int domain = 0xffff; + struct pci_bus *bus = NULL; + + while ((bus = pci_find_next_bus(bus)) != NULL) + domain = max_t(int, domain, pci_domain_nr(bus)); + return domain + 1; +} + +static int vmd_enable_domain(struct vmd_dev *vmd) +{ + struct pci_sysdata *sd = &vmd->sysdata; + struct resource *res; + u32 upper_bits; + unsigned long flags; + LIST_HEAD(resources); + + res = &vmd->dev->resource[VMD_CFGBAR]; + vmd->resources[0] = (struct resource) { + .name = "VMD CFGBAR", + .start = res->start, + .end = (resource_size(res) >> 20) - 1, + .flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED, + }; + + res = &vmd->dev->resource[VMD_MEMBAR1]; + upper_bits = upper_32_bits(res->end); + flags = res->flags & ~IORESOURCE_SIZEALIGN; + if (!upper_bits) + flags &= ~IORESOURCE_MEM_64; + vmd->resources[1] = (struct resource) { + .name = "VMD MEMBAR1", + .start = res->start, + .end = res->end, + .flags = flags, + }; + + res = &vmd->dev->resource[VMD_MEMBAR2]; + upper_bits = upper_32_bits(res->end); + flags = res->flags & ~IORESOURCE_SIZEALIGN; + if (!upper_bits) + flags &= ~IORESOURCE_MEM_64; + vmd->resources[2] = (struct resource) { + .name = "VMD MEMBAR2", + .start = res->start + 0x2000, + .end = res->end, + .flags = flags, + }; + + sd->domain = vmd_find_free_domain(); + if (sd->domain < 0) + return sd->domain; + + sd->node = pcibus_to_node(vmd->dev->bus); + + vmd->irq_domain = pci_msi_create_irq_domain(NULL, &vmd_msi_domain_info, + NULL); + if (!vmd->irq_domain) + return -ENODEV; + + pci_add_resource(&resources, &vmd->resources[0]); + pci_add_resource(&resources, &vmd->resources[1]); + pci_add_resource(&resources, &vmd->resources[2]); + vmd->bus = pci_create_root_bus(&vmd->dev->dev, 0, &vmd_ops, sd, + &resources); + if (!vmd->bus) { + pci_free_resource_list(&resources); + irq_domain_remove(vmd->irq_domain); + return -ENODEV; + } + + vmd_setup_dma_ops(vmd); + dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain); + pci_rescan_bus(vmd->bus); + + WARN(sysfs_create_link(&vmd->dev->dev.kobj, &vmd->bus->dev.kobj, + "domain"), "Can't create symlink to domain\n"); + return 0; +} + +static irqreturn_t vmd_irq(int irq, void *data) +{ + struct vmd_irq_list *irqs = data; + struct vmd_irq *vmdirq; + + rcu_read_lock(); + list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node) + generic_handle_irq(vmdirq->virq); + rcu_read_unlock(); + + return IRQ_HANDLED; +} + +static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) +{ + struct vmd_dev *vmd; + int i, err; + + if (resource_size(&dev->resource[VMD_CFGBAR]) < (1 << 20)) + return -ENOMEM; + + vmd = devm_kzalloc(&dev->dev, sizeof(*vmd), GFP_KERNEL); + if (!vmd) + return -ENOMEM; + + vmd->dev = dev; + err = pcim_enable_device(dev); + if (err < 0) + return err; + + vmd->cfgbar = pcim_iomap(dev, VMD_CFGBAR, 0); + if (!vmd->cfgbar) + return -ENOMEM; + + pci_set_master(dev); + if (dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(64)) && + dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32))) + return -ENODEV; + + vmd->msix_count = pci_msix_vec_count(dev); + if (vmd->msix_count < 0) + return -ENODEV; + + vmd->irqs = devm_kcalloc(&dev->dev, vmd->msix_count, sizeof(*vmd->irqs), + GFP_KERNEL); + if (!vmd->irqs) + return -ENOMEM; + + vmd->msix_entries = devm_kcalloc(&dev->dev, vmd->msix_count, + sizeof(*vmd->msix_entries), + GFP_KERNEL); + if (!vmd->msix_entries) + return -ENOMEM; + for (i = 0; i < vmd->msix_count; i++) + vmd->msix_entries[i].entry = i; + + vmd->msix_count = pci_enable_msix_range(vmd->dev, vmd->msix_entries, 1, + vmd->msix_count); + if (vmd->msix_count < 0) + return vmd->msix_count; + + for (i = 0; i < vmd->msix_count; i++) { + INIT_LIST_HEAD(&vmd->irqs[i].irq_list); + vmd->irqs[i].vmd_vector = vmd->msix_entries[i].vector; + vmd->irqs[i].index = i; + + err = devm_request_irq(&dev->dev, vmd->irqs[i].vmd_vector, + vmd_irq, 0, "vmd", &vmd->irqs[i]); + if (err) + return err; + } + + spin_lock_init(&vmd->cfg_lock); + pci_set_drvdata(dev, vmd); + err = vmd_enable_domain(vmd); + if (err) + return err; + + dev_info(&vmd->dev->dev, "Bound to PCI domain %04x\n", + vmd->sysdata.domain); + return 0; +} + +static void vmd_remove(struct pci_dev *dev) +{ + struct vmd_dev *vmd = pci_get_drvdata(dev); + + pci_set_drvdata(dev, NULL); + sysfs_remove_link(&vmd->dev->dev.kobj, "domain"); + pci_stop_root_bus(vmd->bus); + pci_remove_root_bus(vmd->bus); + vmd_teardown_dma_ops(vmd); + irq_domain_remove(vmd->irq_domain); +} + +#ifdef CONFIG_PM +static int vmd_suspend(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + pci_save_state(pdev); + return 0; +} + +static int vmd_resume(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + pci_restore_state(pdev); + return 0; +} +#endif +static SIMPLE_DEV_PM_OPS(vmd_dev_pm_ops, vmd_suspend, vmd_resume); + +static const struct pci_device_id vmd_ids[] = { + {PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x201d),}, + {0,} +}; +MODULE_DEVICE_TABLE(pci, vmd_ids); + +static struct pci_driver vmd_drv = { + .name = "vmd", + .id_table = vmd_ids, + .probe = vmd_probe, + .remove = vmd_remove, + .driver = { + .pm = &vmd_dev_pm_ops, + }, +}; +module_pci_driver(vmd_drv); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL v2"); +MODULE_VERSION("0.6"); diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c index 5ca8ead91579..81c769e80614 100644 --- a/arch/x86/platform/atom/punit_atom_debug.c +++ b/arch/x86/platform/atom/punit_atom_debug.c @@ -25,8 +25,6 @@ #include <asm/cpu_device_id.h> #include <asm/iosf_mbi.h> -/* Side band Interface port */ -#define PUNIT_PORT 0x04 /* Power gate status reg */ #define PWRGT_STATUS 0x61 /* Subsystem config/status Video processor */ @@ -85,9 +83,8 @@ static int punit_dev_state_show(struct seq_file *seq_file, void *unused) seq_puts(seq_file, "\n\nPUNIT NORTH COMPLEX DEVICES :\n"); while (punit_devp->name) { - status = iosf_mbi_read(PUNIT_PORT, BT_MBI_PMC_READ, - punit_devp->reg, - &punit_pwr_status); + status = iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_REG_READ, + punit_devp->reg, &punit_pwr_status); if (status) { seq_printf(seq_file, "%9s : Read Failed\n", punit_devp->name); diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index d7f997f7c26d..ea48449b2e63 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -50,11 +50,16 @@ void __init efi_bgrt_init(void) bgrt_tab->version); return; } - if (bgrt_tab->status != 1) { - pr_err("Ignoring BGRT: invalid status %u (expected 1)\n", + if (bgrt_tab->status & 0xfe) { + pr_err("Ignoring BGRT: reserved status bits are non-zero %u\n", bgrt_tab->status); return; } + if (bgrt_tab->status != 1) { + pr_debug("Ignoring BGRT: invalid status %u (expected 1)\n", + bgrt_tab->status); + return; + } if (bgrt_tab->image_type != 0) { pr_err("Ignoring BGRT: invalid image type %u (expected 0)\n", bgrt_tab->image_type); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 6a28ded74211..ad285404ea7f 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -194,7 +194,7 @@ static void __init do_add_efi_memmap(void) int __init efi_memblock_x86_reserve_range(void) { struct efi_info *e = &boot_params.efi_info; - unsigned long pmap; + phys_addr_t pmap; if (efi_enabled(EFI_PARAVIRT)) return 0; @@ -209,7 +209,7 @@ int __init efi_memblock_x86_reserve_range(void) #else pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); #endif - memmap.phys_map = (void *)pmap; + memmap.phys_map = pmap; memmap.nr_map = e->efi_memmap_size / e->efi_memdesc_size; memmap.desc_size = e->efi_memdesc_size; @@ -222,7 +222,7 @@ int __init efi_memblock_x86_reserve_range(void) return 0; } -static void __init print_efi_memmap(void) +void __init efi_print_memmap(void) { #ifdef EFI_DEBUG efi_memory_desc_t *md; @@ -524,7 +524,7 @@ void __init efi_init(void) return; if (efi_enabled(EFI_DBG)) - print_efi_memmap(); + efi_print_memmap(); efi_esrt_init(); } @@ -1017,24 +1017,6 @@ u32 efi_mem_type(unsigned long phys_addr) return 0; } -u64 efi_mem_attributes(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - void *p; - - if (!efi_enabled(EFI_MEMMAP)) - return 0; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if ((md->phys_addr <= phys_addr) && - (phys_addr < (md->phys_addr + - (md->num_pages << EFI_PAGE_SHIFT)))) - return md->attribute; - } - return 0; -} - static int __init arch_parse_efi_cmdline(char *str) { if (!str) { @@ -1044,8 +1026,6 @@ static int __init arch_parse_efi_cmdline(char *str) if (parse_option_str(str, "old_map")) set_bit(EFI_OLD_MEMMAP, &efi.flags); - if (parse_option_str(str, "debug")) - set_bit(EFI_DBG, &efi.flags); return 0; } diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 1c7380da65ff..2d66db8f80f9 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -8,6 +8,7 @@ #include <linux/memblock.h> #include <linux/bootmem.h> #include <linux/acpi.h> +#include <linux/dmi.h> #include <asm/efi.h> #include <asm/uv/uv.h> @@ -248,6 +249,16 @@ out: return ret; } +static const struct dmi_system_id sgi_uv1_dmi[] = { + { NULL, "SGI UV1", + { DMI_MATCH(DMI_PRODUCT_NAME, "Stoutland Platform"), + DMI_MATCH(DMI_PRODUCT_VERSION, "1.0"), + DMI_MATCH(DMI_BIOS_VENDOR, "SGI.COM"), + } + }, + { } /* NULL entry stops DMI scanning */ +}; + void __init efi_apply_memmap_quirks(void) { /* @@ -260,10 +271,8 @@ void __init efi_apply_memmap_quirks(void) efi_unmap_memmap(); } - /* - * UV doesn't support the new EFI pagetable mapping yet. - */ - if (is_uv_system()) + /* UV2+ BIOS has a fix for this issue. UV1 still needs the quirk. */ + if (dmi_check_system(sgi_uv1_dmi)) set_bit(EFI_OLD_MEMMAP, &efi.flags); } diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 01d54ea766c1..90bb997ed0a2 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -61,7 +61,7 @@ enum intel_mid_timer_options intel_mid_timer_options; /* intel_mid_ops to store sub arch ops */ -struct intel_mid_ops *intel_mid_ops; +static struct intel_mid_ops *intel_mid_ops; /* getter function for sub arch ops*/ static void *(*get_intel_mid_ops[])(void) = INTEL_MID_OPS_INIT; enum intel_mid_cpu_type __intel_mid_cpu_chip; @@ -138,7 +138,7 @@ static void intel_mid_arch_setup(void) intel_mid_ops = get_intel_mid_ops[__intel_mid_cpu_chip](); else { intel_mid_ops = get_intel_mid_ops[INTEL_MID_CPU_CHIP_PENWELL](); - pr_info("ARCH: Unknown SoC, assuming PENWELL!\n"); + pr_info("ARCH: Unknown SoC, assuming Penwell!\n"); } out: @@ -214,12 +214,10 @@ static inline int __init setup_x86_intel_mid_timer(char *arg) else if (strcmp("lapic_and_apbt", arg) == 0) intel_mid_timer_options = INTEL_MID_TIMER_LAPIC_APBT; else { - pr_warn("X86 INTEL_MID timer option %s not recognised" - " use x86_intel_mid_timer=apbt_only or lapic_and_apbt\n", - arg); + pr_warn("X86 INTEL_MID timer option %s not recognised use x86_intel_mid_timer=apbt_only or lapic_and_apbt\n", + arg); return -EINVAL; } return 0; } __setup("x86_intel_mid_timer=", setup_x86_intel_mid_timer); - diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index ce992e8cc065..5ee360a951ce 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -197,10 +197,9 @@ static int __init sfi_parse_gpio(struct sfi_table_header *table) num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry); pentry = (struct sfi_gpio_table_entry *)sb->pentry; - gpio_table = kmalloc(num * sizeof(*pentry), GFP_KERNEL); + gpio_table = kmemdup(pentry, num * sizeof(*pentry), GFP_KERNEL); if (!gpio_table) return -1; - memcpy(gpio_table, pentry, num * sizeof(*pentry)); gpio_num_entry = num; pr_debug("GPIO pin info:\n"); diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index 0ee619f9fcb7..c61b6c332e97 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -111,23 +111,19 @@ static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr) u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base; int ret; - ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, - reg++, &imr->addr_lo); + ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->addr_lo); if (ret) return ret; - ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, - reg++, &imr->addr_hi); + ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->addr_hi); if (ret) return ret; - ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, - reg++, &imr->rmask); + ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->rmask); if (ret) return ret; - return iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, - reg++, &imr->wmask); + return iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->wmask); } /** @@ -151,31 +147,27 @@ static int imr_write(struct imr_device *idev, u32 imr_id, local_irq_save(flags); - ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, reg++, - imr->addr_lo); + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->addr_lo); if (ret) goto failed; - ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, - reg++, imr->addr_hi); + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->addr_hi); if (ret) goto failed; - ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, - reg++, imr->rmask); + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->rmask); if (ret) goto failed; - ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, - reg++, imr->wmask); + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->wmask); if (ret) goto failed; /* Lock bit must be set separately to addr_lo address bits. */ if (lock) { imr->addr_lo |= IMR_LOCK; - ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, - reg - IMR_NUM_REGS, imr->addr_lo); + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, + reg - IMR_NUM_REGS, imr->addr_lo); if (ret) goto failed; } @@ -228,11 +220,12 @@ static int imr_dbgfs_state_show(struct seq_file *s, void *unused) if (imr_is_enabled(&imr)) { base = imr_to_phys(imr.addr_lo); end = imr_to_phys(imr.addr_hi) + IMR_MASK; + size = end - base + 1; } else { base = 0; end = 0; + size = 0; } - size = end - base; seq_printf(s, "imr%02i: base=%pa, end=%pa, size=0x%08zx " "rmask=0x%08x, wmask=0x%08x, %s, %s\n", i, &base, &end, size, imr.rmask, imr.wmask, @@ -587,6 +580,7 @@ static void __init imr_fixup_memmap(struct imr_device *idev) { phys_addr_t base = virt_to_phys(&_text); size_t size = virt_to_phys(&__end_rodata) - base; + unsigned long start, end; int i; int ret; @@ -594,18 +588,24 @@ static void __init imr_fixup_memmap(struct imr_device *idev) for (i = 0; i < idev->max_imr; i++) imr_clear(i); + start = (unsigned long)_text; + end = (unsigned long)__end_rodata - 1; + /* * Setup a locked IMR around the physical extent of the kernel * from the beginning of the .text secton to the end of the * .rodata section as one physically contiguous block. + * + * We don't round up @size since it is already PAGE_SIZE aligned. + * See vmlinux.lds.S for details. */ ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, true); if (ret < 0) { - pr_err("unable to setup IMR for kernel: (%p - %p)\n", - &_text, &__end_rodata); + pr_err("unable to setup IMR for kernel: %zu KiB (%lx - %lx)\n", + size / 1024, start, end); } else { - pr_info("protecting kernel .text - .rodata: %zu KiB (%p - %p)\n", - size / 1024, &_text, &__end_rodata); + pr_info("protecting kernel .text - .rodata: %zu KiB (%lx - %lx)\n", + size / 1024, start, end); } } diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 5c9f63fa6abf..8dd80050d705 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -28,6 +28,7 @@ #include <linux/nmi.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/clocksource.h> #include <asm/apic.h> #include <asm/current.h> @@ -376,38 +377,42 @@ static void uv_nmi_wait(int master) atomic_read(&uv_nmi_cpus_in_nmi), num_online_cpus()); } +/* Dump Instruction Pointer header */ static void uv_nmi_dump_cpu_ip_hdr(void) { - printk(KERN_DEFAULT - "\nUV: %4s %6s %-32s %s (Note: PID 0 not listed)\n", + pr_info("\nUV: %4s %6s %-32s %s (Note: PID 0 not listed)\n", "CPU", "PID", "COMMAND", "IP"); } +/* Dump Instruction Pointer info */ static void uv_nmi_dump_cpu_ip(int cpu, struct pt_regs *regs) { - printk(KERN_DEFAULT "UV: %4d %6d %-32.32s ", - cpu, current->pid, current->comm); - + pr_info("UV: %4d %6d %-32.32s ", cpu, current->pid, current->comm); printk_address(regs->ip); } -/* Dump this cpu's state */ +/* + * Dump this CPU's state. If action was set to "kdump" and the crash_kexec + * failed, then we provide "dump" as an alternate action. Action "dump" now + * also includes the show "ips" (instruction pointers) action whereas the + * action "ips" only displays instruction pointers for the non-idle CPU's. + * This is an abbreviated form of the "ps" command. + */ static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs) { const char *dots = " ................................. "; - if (uv_nmi_action_is("ips")) { - if (cpu == 0) - uv_nmi_dump_cpu_ip_hdr(); + if (cpu == 0) + uv_nmi_dump_cpu_ip_hdr(); - if (current->pid != 0) - uv_nmi_dump_cpu_ip(cpu, regs); + if (current->pid != 0 || !uv_nmi_action_is("ips")) + uv_nmi_dump_cpu_ip(cpu, regs); - } else if (uv_nmi_action_is("dump")) { - printk(KERN_DEFAULT - "UV:%sNMI process trace for CPU %d\n", dots, cpu); + if (uv_nmi_action_is("dump")) { + pr_info("UV:%sNMI process trace for CPU %d\n", dots, cpu); show_regs(regs); } + this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE); } @@ -469,8 +474,7 @@ static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master) uv_nmi_trigger_dump(tcpu); } if (ignored) - printk(KERN_DEFAULT "UV: %d CPUs ignored NMI\n", - ignored); + pr_alert("UV: %d CPUs ignored NMI\n", ignored); console_loglevel = saved_console_loglevel; pr_alert("UV: process trace complete\n"); @@ -492,8 +496,9 @@ static void uv_nmi_touch_watchdogs(void) touch_nmi_watchdog(); } -#if defined(CONFIG_KEXEC_CORE) static atomic_t uv_nmi_kexec_failed; + +#if defined(CONFIG_KEXEC_CORE) static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) { /* Call crash to dump system state */ @@ -502,10 +507,9 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) crash_kexec(regs); pr_emerg("UV: crash_kexec unexpectedly returned, "); + atomic_set(&uv_nmi_kexec_failed, 1); if (!kexec_crash_image) { pr_cont("crash kernel not loaded\n"); - atomic_set(&uv_nmi_kexec_failed, 1); - uv_nmi_sync_exit(1); return; } pr_cont("kexec busy, stalling cpus while waiting\n"); @@ -514,9 +518,6 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) /* If crash exec fails the slaves should return, otherwise stall */ while (atomic_read(&uv_nmi_kexec_failed) == 0) mdelay(10); - - /* Crash kernel most likely not loaded, return in an orderly fashion */ - uv_nmi_sync_exit(0); } #else /* !CONFIG_KEXEC_CORE */ @@ -524,6 +525,7 @@ static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) { if (master) pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n"); + atomic_set(&uv_nmi_kexec_failed, 1); } #endif /* !CONFIG_KEXEC_CORE */ @@ -613,9 +615,14 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) master = (atomic_read(&uv_nmi_cpu) == cpu); /* If NMI action is "kdump", then attempt to do it */ - if (uv_nmi_action_is("kdump")) + if (uv_nmi_action_is("kdump")) { uv_nmi_kdump(cpu, master, regs); + /* Unexpected return, revert action to "dump" */ + if (master) + strncpy(uv_nmi_action, "dump", strlen(uv_nmi_action)); + } + /* Pause as all cpus enter the NMI handler */ uv_nmi_wait(master); @@ -640,6 +647,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) atomic_set(&uv_nmi_cpus_in_nmi, -1); atomic_set(&uv_nmi_cpu, -1); atomic_set(&uv_in_nmi, 0); + atomic_set(&uv_nmi_kexec_failed, 0); } uv_nmi_touch_watchdogs(); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 9ab52791fed5..d5f64996394a 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -23,6 +23,7 @@ #include <asm/debugreg.h> #include <asm/cpu.h> #include <asm/mmu_context.h> +#include <linux/dmi.h> #ifdef CONFIG_X86_32 __visible unsigned long saved_context_ebx; @@ -32,6 +33,29 @@ __visible unsigned long saved_context_eflags; #endif struct saved_context saved_context; +static void msr_save_context(struct saved_context *ctxt) +{ + struct saved_msr *msr = ctxt->saved_msrs.array; + struct saved_msr *end = msr + ctxt->saved_msrs.num; + + while (msr < end) { + msr->valid = !rdmsrl_safe(msr->info.msr_no, &msr->info.reg.q); + msr++; + } +} + +static void msr_restore_context(struct saved_context *ctxt) +{ + struct saved_msr *msr = ctxt->saved_msrs.array; + struct saved_msr *end = msr + ctxt->saved_msrs.num; + + while (msr < end) { + if (msr->valid) + wrmsrl(msr->info.msr_no, msr->info.reg.q); + msr++; + } +} + /** * __save_processor_state - save CPU registers before creating a * hibernation image and before restoring the memory state from it @@ -111,6 +135,7 @@ static void __save_processor_state(struct saved_context *ctxt) #endif ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE, &ctxt->misc_enable); + msr_save_context(ctxt); } /* Needed by apm.c */ @@ -229,6 +254,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) x86_platform.restore_sched_clock_state(); mtrr_bp_restore(); perf_restore_debug_store(); + msr_restore_context(ctxt); } /* Needed by apm.c */ @@ -320,3 +346,69 @@ static int __init bsp_pm_check_init(void) } core_initcall(bsp_pm_check_init); + +static int msr_init_context(const u32 *msr_id, const int total_num) +{ + int i = 0; + struct saved_msr *msr_array; + + if (saved_context.saved_msrs.array || saved_context.saved_msrs.num > 0) { + pr_err("x86/pm: MSR quirk already applied, please check your DMI match table.\n"); + return -EINVAL; + } + + msr_array = kmalloc_array(total_num, sizeof(struct saved_msr), GFP_KERNEL); + if (!msr_array) { + pr_err("x86/pm: Can not allocate memory to save/restore MSRs during suspend.\n"); + return -ENOMEM; + } + + for (i = 0; i < total_num; i++) { + msr_array[i].info.msr_no = msr_id[i]; + msr_array[i].valid = false; + msr_array[i].info.reg.q = 0; + } + saved_context.saved_msrs.num = total_num; + saved_context.saved_msrs.array = msr_array; + + return 0; +} + +/* + * The following section is a quirk framework for problematic BIOSen: + * Sometimes MSRs are modified by the BIOSen after suspended to + * RAM, this might cause unexpected behavior after wakeup. + * Thus we save/restore these specified MSRs across suspend/resume + * in order to work around it. + * + * For any further problematic BIOSen/platforms, + * please add your own function similar to msr_initialize_bdw. + */ +static int msr_initialize_bdw(const struct dmi_system_id *d) +{ + /* Add any extra MSR ids into this array. */ + u32 bdw_msr_id[] = { MSR_IA32_THERM_CONTROL }; + + pr_info("x86/pm: %s detected, MSR saving is needed during suspending.\n", d->ident); + return msr_init_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id)); +} + +static struct dmi_system_id msr_save_dmi_table[] = { + { + .callback = msr_initialize_bdw, + .ident = "BROADWELL BDX_EP", + .matches = { + DMI_MATCH(DMI_PRODUCT_NAME, "GRANTLEY"), + DMI_MATCH(DMI_PRODUCT_VERSION, "E63448-400"), + }, + }, + {} +}; + +static int pm_check_save_msr(void) +{ + dmi_check_system(msr_save_dmi_table); + return 0; +} + +device_initcall(pm_check_save_msr); diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index 10fea5fc821e..df280da34825 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -1,11 +1,9 @@ config AMD_MCE_INJ tristate "Simple MCE injection interface for AMD processors" - depends on RAS && EDAC_DECODE_MCE && DEBUG_FS + depends on RAS && EDAC_DECODE_MCE && DEBUG_FS && AMD_NB default n help This is a simple debugfs interface to inject MCEs and test different aspects of the MCE handling code. WARNING: Do not even assume this interface is staying stable! - - diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index 17e35b5bf779..55d38cfa46c2 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -17,7 +17,11 @@ #include <linux/cpu.h> #include <linux/string.h> #include <linux/uaccess.h> +#include <linux/pci.h> + #include <asm/mce.h> +#include <asm/amd_nb.h> +#include <asm/irq_vectors.h> #include "../kernel/cpu/mcheck/mce-internal.h" @@ -30,16 +34,21 @@ static struct dentry *dfs_inj; static u8 n_banks; #define MAX_FLAG_OPT_SIZE 3 +#define NBCFG 0x44 enum injection_type { SW_INJ = 0, /* SW injection, simply decode the error */ HW_INJ, /* Trigger a #MC */ + DFR_INT_INJ, /* Trigger Deferred error interrupt */ + THR_INT_INJ, /* Trigger threshold interrupt */ N_INJ_TYPES, }; static const char * const flags_options[] = { [SW_INJ] = "sw", [HW_INJ] = "hw", + [DFR_INT_INJ] = "df", + [THR_INT_INJ] = "th", NULL }; @@ -129,12 +138,9 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf, { char buf[MAX_FLAG_OPT_SIZE], *__buf; int err; - size_t ret; if (cnt > MAX_FLAG_OPT_SIZE) - cnt = MAX_FLAG_OPT_SIZE; - - ret = cnt; + return -EINVAL; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -150,9 +156,9 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf, return err; } - *ppos += ret; + *ppos += cnt; - return ret; + return cnt; } static const struct file_operations flags_fops = { @@ -185,6 +191,55 @@ static void trigger_mce(void *info) asm volatile("int $18"); } +static void trigger_dfr_int(void *info) +{ + asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR)); +} + +static void trigger_thr_int(void *info) +{ + asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR)); +} + +static u32 get_nbc_for_node(int node_id) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + u32 cores_per_node; + + cores_per_node = c->x86_max_cores / amd_get_nodes_per_socket(); + + return cores_per_node * node_id; +} + +static void toggle_nb_mca_mst_cpu(u16 nid) +{ + struct pci_dev *F3 = node_to_amd_nb(nid)->misc; + u32 val; + int err; + + if (!F3) + return; + + err = pci_read_config_dword(F3, NBCFG, &val); + if (err) { + pr_err("%s: Error reading F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); + return; + } + + if (val & BIT(27)) + return; + + pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n", + __func__); + + val |= BIT(27); + err = pci_write_config_dword(F3, NBCFG, val); + if (err) + pr_err("%s: Error writing F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); +} + static void do_inject(void) { u64 mcg_status = 0; @@ -205,6 +260,26 @@ static void do_inject(void) if (!(i_mce.status & MCI_STATUS_PCC)) mcg_status |= MCG_STATUS_RIPV; + /* + * Ensure necessary status bits for deferred errors: + * - MCx_STATUS[Deferred]: make sure it is a deferred error + * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC + */ + if (inj_type == DFR_INT_INJ) { + i_mce.status |= MCI_STATUS_DEFERRED; + i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); + } + + /* + * For multi node CPUs, logging and reporting of bank 4 errors happens + * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for + * Fam10h and later BKDGs. + */ + if (static_cpu_has(X86_FEATURE_AMD_DCM) && b == 4) { + toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); + cpu = get_nbc_for_node(amd_get_nb_id(cpu)); + } + get_online_cpus(); if (!cpu_online(cpu)) goto err; @@ -225,7 +300,16 @@ static void do_inject(void) toggle_hw_mce_inject(cpu, false); - smp_call_function_single(cpu, trigger_mce, NULL, 0); + switch (inj_type) { + case DFR_INT_INJ: + smp_call_function_single(cpu, trigger_dfr_int, NULL, 0); + break; + case THR_INT_INJ: + smp_call_function_single(cpu, trigger_thr_int, NULL, 0); + break; + default: + smp_call_function_single(cpu, trigger_mce, NULL, 0); + } err: put_online_cpus(); @@ -290,6 +374,11 @@ static const char readme_msg[] = "\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" "\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" "\t before injecting.\n" +"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n" +"\t error APIC interrupt handler to handle the error if the feature is \n" +"\t is present in hardware. \n" +"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" +"\t APIC interrupt handler to handle the error. \n" "\n"; static ssize_t diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 2730d775ef9a..3e75fcf6b836 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -70,3 +70,4 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \ -I$(srctree)/arch/x86/boot KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n +UBSAN_SANITIZE := n diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index a8fecc226946..3ee2bb6b440b 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -17,7 +17,7 @@ obj-y = bug.o bugs_$(BITS).o delay.o fault.o ksyms.o ldt.o \ ifeq ($(CONFIG_X86_32),y) obj-y += checksum_32.o -obj-$(CONFIG_BINFMT_ELF) += elfcore.o +obj-$(CONFIG_ELF_CORE) += elfcore.o subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 755481f14d90..174781a404ff 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -36,13 +36,6 @@ #endif /* CONFIG_X86_PPRO_FENCE */ #define dma_wmb() barrier() -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() - -#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) - -#define read_barrier_depends() do { } while (0) -#define smp_read_barrier_depends() do { } while (0) +#include <asm-generic/barrier.h> #endif diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h index 9fe77b7b5a0e..11ab90dc5f14 100644 --- a/arch/x86/um/asm/syscall.h +++ b/arch/x86/um/asm/syscall.h @@ -1,8 +1,13 @@ #ifndef __UM_ASM_SYSCALL_H #define __UM_ASM_SYSCALL_H +#include <asm/syscall-generic.h> #include <uapi/linux/audit.h> +typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); + static inline int syscall_get_arch(void) { #ifdef CONFIG_X86_32 diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index a29756f2d940..47c78d5e5c32 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -68,6 +68,7 @@ static const int reg_offsets[] = { [EFL] = HOST_EFLAGS, [UESP] = HOST_SP, [SS] = HOST_SS, + [ORIG_EAX] = HOST_ORIG_AX, }; int putreg(struct task_struct *child, int regno, unsigned long value) @@ -83,6 +84,7 @@ int putreg(struct task_struct *child, int regno, unsigned long value) case EAX: case EIP: case UESP: + case ORIG_EAX: break; case FS: if (value && (value & 3) != 3) @@ -108,9 +110,6 @@ int putreg(struct task_struct *child, int regno, unsigned long value) value &= FLAG_MASK; child->thread.regs.regs.gp[HOST_EFLAGS] |= value; return 0; - case ORIG_EAX: - child->thread.regs.regs.syscall = value; - return 0; default : panic("Bad register in putreg() : %d\n", regno); } @@ -143,8 +142,6 @@ unsigned long getreg(struct task_struct *child, int regno) regno >>= 2; switch (regno) { - case ORIG_EAX: - return child->thread.regs.regs.syscall; case FS: case GS: case DS: @@ -163,6 +160,7 @@ unsigned long getreg(struct task_struct *child, int regno) case EDI: case EBP: case EFL: + case ORIG_EAX: break; default: panic("Bad register in getreg() : %d\n", regno); diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 06934a8a4872..14fcd01ed992 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -211,7 +211,7 @@ static int copy_sc_from_user(struct pt_regs *regs, if (err) return 1; - err = convert_fxsr_from_user(&fpx, sc.fpstate); + err = convert_fxsr_from_user(&fpx, (void *)sc.fpstate); if (err) return 1; @@ -227,7 +227,7 @@ static int copy_sc_from_user(struct pt_regs *regs, { struct user_i387_struct fp; - err = copy_from_user(&fp, sc.fpstate, + err = copy_from_user(&fp, (void *)sc.fpstate, sizeof(struct user_i387_struct)); if (err) return 1; @@ -291,7 +291,7 @@ static int copy_sc_to_user(struct sigcontext __user *to, #endif #undef PUTREG sc.oldmask = mask; - sc.fpstate = to_fp; + sc.fpstate = (unsigned long)to_fp; err = copy_to_user(to, &sc, sizeof(struct sigcontext)); if (err) @@ -468,12 +468,10 @@ long sys_sigreturn(void) struct sigframe __user *frame = (struct sigframe __user *)(sp - 8); sigset_t set; struct sigcontext __user *sc = &frame->sc; - unsigned long __user *oldmask = &sc->oldmask; - unsigned long __user *extramask = frame->extramask; int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long); - if (copy_from_user(&set.sig[0], oldmask, sizeof(set.sig[0])) || - copy_from_user(&set.sig[1], extramask, sig_size)) + if (copy_from_user(&set.sig[0], &sc->oldmask, sizeof(set.sig[0])) || + copy_from_user(&set.sig[1], frame->extramask, sig_size)) goto segfault; set_current_blocked(&set); @@ -505,6 +503,7 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, { struct rt_sigframe __user *frame; int err = 0, sig = ksig->sig; + unsigned long fp_to; frame = (struct rt_sigframe __user *) round_down(stack_top - sizeof(struct rt_sigframe), 16); @@ -526,7 +525,10 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, err |= __save_altstack(&frame->uc.uc_stack, PT_REGS_SP(regs)); err |= copy_sc_to_user(&frame->uc.uc_mcontext, &frame->fpstate, regs, set->sig[0]); - err |= __put_user(&frame->fpstate, &frame->uc.uc_mcontext.fpstate); + + fp_to = (unsigned long)&frame->fpstate; + + err |= __put_user(fp_to, &frame->uc.uc_mcontext.fpstate); if (sizeof(*set) == 16) { err |= __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); err |= __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); diff --git a/arch/x86/um/stub_32.S b/arch/x86/um/stub_32.S index b972649d3a18..98816804e131 100644 --- a/arch/x86/um/stub_32.S +++ b/arch/x86/um/stub_32.S @@ -1,6 +1,5 @@ #include <as-layout.h> - .globl syscall_stub .section .__syscall_stub, "ax" .globl batch_syscall_stub diff --git a/arch/x86/um/stub_64.S b/arch/x86/um/stub_64.S index 7160b20172d0..ba914b3b8cc4 100644 --- a/arch/x86/um/stub_64.S +++ b/arch/x86/um/stub_64.S @@ -1,25 +1,9 @@ #include <as-layout.h> - .globl syscall_stub .section .__syscall_stub, "ax" -syscall_stub: - syscall - /* We don't have 64-bit constants, so this constructs the address - * we need. - */ - movq $(STUB_DATA >> 32), %rbx - salq $32, %rbx - movq $(STUB_DATA & 0xffffffff), %rcx - or %rcx, %rbx - movq %rax, (%rbx) - int3 - .globl batch_syscall_stub batch_syscall_stub: - mov $(STUB_DATA >> 32), %rbx - sal $32, %rbx - mov $(STUB_DATA & 0xffffffff), %rax - or %rax, %rbx + mov $(STUB_DATA), %rbx /* load pointer to first operation */ mov %rbx, %rsp add $0x10, %rsp diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index bd16d6c370ec..439c0994b696 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -7,6 +7,7 @@ #include <linux/sys.h> #include <linux/cache.h> #include <generated/user_constants.h> +#include <asm/syscall.h> #define __NO_STUBS @@ -24,15 +25,13 @@ #define old_mmap sys_old_mmap -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include <asm/syscalls_32.h> #undef __SYSCALL_I386 #define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, -typedef asmlinkage void (*sys_call_ptr_t)(void); - -extern asmlinkage void sys_ni_syscall(void); +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { /* diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index a75d8700472a..b74ea6c2c0e7 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -7,6 +7,7 @@ #include <linux/sys.h> #include <linux/cache.h> #include <generated/user_constants.h> +#include <asm/syscall.h> #define __NO_STUBS @@ -37,15 +38,13 @@ #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) #define __SYSCALL_X32(nr, sym, compat) /* Not supported */ -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include <asm/syscalls_64.h> #undef __SYSCALL_64 #define __SYSCALL_64(nr, sym, compat) [ nr ] = sym, -typedef void (*sys_call_ptr_t)(void); - -extern void sys_ni_syscall(void); +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { /* diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index acda713ab5be..abf4901c917b 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -64,7 +64,7 @@ static u32 xen_apic_read(u32 reg) if (reg != APIC_ID) return 0; - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (ret) return 0; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 993b7a71386d..d09e4c9d7cc5 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -75,6 +75,7 @@ #include <asm/mwait.h> #include <asm/pci_x86.h> #include <asm/pat.h> +#include <asm/cpu.h> #ifdef CONFIG_ACPI #include <linux/acpi.h> @@ -414,7 +415,7 @@ static bool __init xen_check_mwait(void) set_xen_guest_handle(op.u.set_pminfo.pdc, buf); - if ((HYPERVISOR_dom0_op(&op) == 0) && + if ((HYPERVISOR_platform_op(&op) == 0) && (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { cpuid_leaf5_ecx_val = cx; cpuid_leaf5_edx_val = dx; @@ -1191,7 +1192,7 @@ static const struct pv_info xen_info __initconst = { #ifdef CONFIG_X86_64 .extra_user_64bit_cs = FLAT_USER_CS64, #endif - + .features = 0, .name = "Xen", }; @@ -1228,10 +1229,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .iret = xen_iret, #ifdef CONFIG_X86_64 - .usergs_sysret32 = xen_sysret32, .usergs_sysret64 = xen_sysret64, -#else - .irq_enable_sysexit = xen_sysexit, #endif .load_tr_desc = paravirt_nop, @@ -1264,12 +1262,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .end_context_switch = xen_end_context_switch, }; -static const struct pv_apic_ops xen_apic_ops __initconst = { -#ifdef CONFIG_X86_LOCAL_APIC - .startup_ipi_hook = paravirt_nop, -#endif -}; - static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; @@ -1373,7 +1365,7 @@ static void __init xen_boot_params_init_edd(void) info->params.length = sizeof(info->params); set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params, &info->params); - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (ret) break; @@ -1391,7 +1383,7 @@ static void __init xen_boot_params_init_edd(void) op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE; for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) { op.u.firmware_info.index = nr; - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (ret) break; mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature; @@ -1534,8 +1526,9 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; + if (xen_initial_domain()) + pv_info.features |= PV_SUPPORTED_RTC; pv_init_ops = xen_init_ops; - pv_apic_ops = xen_apic_ops; if (!xen_pvh_domain()) { pv_cpu_ops = xen_cpu_ops; @@ -1697,7 +1690,7 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; - if (HYPERVISOR_dom0_op(&op) == 0) + if (HYPERVISOR_platform_op(&op) == 0) boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; /* Make sure ACS will be enabled */ @@ -1885,8 +1878,10 @@ EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); static void xen_set_cpu_features(struct cpuinfo_x86 *c) { - if (xen_pv_domain()) + if (xen_pv_domain()) { clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + set_cpu_cap(c, X86_FEATURE_XENPV); + } } const struct hypervisor_x86 x86_hyper_xen = { @@ -1899,3 +1894,17 @@ const struct hypervisor_x86 x86_hyper_xen = { .set_cpu_features = xen_set_cpu_features, }; EXPORT_SYMBOL(x86_hyper_xen); + +#ifdef CONFIG_HOTPLUG_CPU +void xen_arch_register_cpu(int num) +{ + arch_register_cpu(num); +} +EXPORT_SYMBOL(xen_arch_register_cpu); + +void xen_arch_unregister_cpu(int num) +{ + arch_unregister_cpu(num); +} +EXPORT_SYMBOL(xen_arch_unregister_cpu); +#endif diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 1580e7a5a4cf..e079500b17f3 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -133,7 +133,7 @@ static int __init xlated_setup_gnttab_pages(void) kfree(pages); return -ENOMEM; } - rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */); + rc = alloc_xenballooned_pages(nr_grant_frames, pages); if (rc) { pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__, nr_grant_frames, rc); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 9c479fe40459..c913ca4f6958 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2436,7 +2436,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .flush_tlb_others = xen_flush_tlb_others, .pte_update = paravirt_nop, - .pte_update_defer = paravirt_nop, .pgd_alloc = xen_pgd_alloc, .pgd_free = xen_pgd_free, @@ -2495,14 +2494,9 @@ void __init xen_init_mmu_ops(void) { x86_init.paging.pagetable_init = xen_pagetable_init; - /* Optimization - we can use the HVM one but it has no idea which - * VCPUs are descheduled - which means that it will needlessly IPI - * them. Xen knows so let it do the job. - */ - if (xen_feature(XENFEAT_auto_translated_physmap)) { - pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; + if (xen_feature(XENFEAT_auto_translated_physmap)) return; - } + pv_mmu_ops = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); @@ -2888,6 +2882,7 @@ static int do_remap_gfn(struct vm_area_struct *vma, addr += range; if (err_ptr) err_ptr += batch; + cond_resched(); } out: diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 660b3cfef234..cab9f766bb06 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -530,7 +530,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) * the new pages are installed with cmpxchg; if we lose the race then * simply free the page we allocated and use the one that's there. */ -static bool alloc_p2m(unsigned long pfn) +int xen_alloc_p2m_entry(unsigned long pfn) { unsigned topidx; unsigned long *top_mfn_p, *mid_mfn; @@ -540,6 +540,9 @@ static bool alloc_p2m(unsigned long pfn) unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); unsigned long p2m_pfn; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + ptep = lookup_address(addr, &level); BUG_ON(!ptep || level != PG_LEVEL_4K); pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); @@ -548,7 +551,7 @@ static bool alloc_p2m(unsigned long pfn) /* PMD level is missing, allocate a new one */ ptep = alloc_p2m_pmd(addr, pte_pg); if (!ptep) - return false; + return -ENOMEM; } if (p2m_top_mfn && pfn < MAX_P2M_PFN) { @@ -566,7 +569,7 @@ static bool alloc_p2m(unsigned long pfn) mid_mfn = alloc_p2m_page(); if (!mid_mfn) - return false; + return -ENOMEM; p2m_mid_mfn_init(mid_mfn, p2m_missing); @@ -592,7 +595,7 @@ static bool alloc_p2m(unsigned long pfn) p2m = alloc_p2m_page(); if (!p2m) - return false; + return -ENOMEM; if (p2m_pfn == PFN_DOWN(__pa(p2m_missing))) p2m_init(p2m); @@ -625,8 +628,9 @@ static bool alloc_p2m(unsigned long pfn) HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn; } - return true; + return 0; } +EXPORT_SYMBOL(xen_alloc_p2m_entry); unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) @@ -688,7 +692,10 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - if (!alloc_p2m(pfn)) + int ret; + + ret = xen_alloc_p2m_entry(pfn); + if (ret < 0) return false; return __set_phys_to_machine(pfn, mfn); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1c30e4ab1022..7ab29518a3b9 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -212,7 +212,7 @@ static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn) e_pfn = PFN_DOWN(entry->addr + entry->size); /* We only care about E820 after this */ - if (e_pfn < *min_pfn) + if (e_pfn <= *min_pfn) continue; s_pfn = PFN_UP(entry->addr); @@ -829,6 +829,8 @@ char * __init xen_memory_setup(void) addr = xen_e820_map[0].addr; size = xen_e820_map[0].size; while (i < xen_e820_map_entries) { + bool discard = false; + chunk_size = size; type = xen_e820_map[i].type; @@ -843,10 +845,11 @@ char * __init xen_memory_setup(void) xen_add_extra_mem(pfn_s, n_pfns); xen_max_p2m_pfn = pfn_s + n_pfns; } else - type = E820_UNUSABLE; + discard = true; } - xen_align_and_add_e820_region(addr, chunk_size, type); + if (!discard) + xen_align_and_add_e820_region(addr, chunk_size, type); addr += chunk_size; size -= chunk_size; @@ -965,17 +968,8 @@ char * __init xen_auto_xlated_memory_setup(void) static void __init fiddle_vdso(void) { #ifdef CONFIG_X86_32 - /* - * This could be called before selected_vdso32 is initialized, so - * just fiddle with both possible images. vdso_image_32_syscall - * can't be selected, since it only exists on 64-bit systems. - */ - u32 *mask; - mask = vdso_image_32_int80.data + - vdso_image_32_int80.sym_VDSO32_NOTE_MASK; - *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; - mask = vdso_image_32_sysenter.data + - vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK; + u32 *mask = vdso_image_32.data + + vdso_image_32.sym_VDSO32_NOTE_MASK; *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; #endif } diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index feddabdab448..7f664c416faf 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -1,6 +1,7 @@ #include <linux/types.h> #include <linux/tick.h> +#include <xen/xen.h> #include <xen/interface/xen.h> #include <xen/grant_table.h> #include <xen/events.h> @@ -33,7 +34,8 @@ static void xen_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; - xen_hvm_init_shared_info(); + if (!suspend_cancelled) + xen_hvm_init_shared_info(); xen_callback_vector(); xen_unplug_emulated_devices(); if (xen_feature(XENFEAT_hvm_safe_pvclock)) { @@ -68,26 +70,16 @@ static void xen_pv_post_suspend(int suspend_cancelled) void xen_arch_pre_suspend(void) { - int cpu; - - for_each_online_cpu(cpu) - xen_pmu_finish(cpu); - if (xen_pv_domain()) xen_pv_pre_suspend(); } void xen_arch_post_suspend(int cancelled) { - int cpu; - if (xen_pv_domain()) xen_pv_post_suspend(cancelled); else xen_hvm_post_suspend(cancelled); - - for_each_online_cpu(cpu) - xen_pmu_init(cpu); } static void xen_vcpu_notify_restore(void *data) @@ -106,10 +98,20 @@ static void xen_vcpu_notify_suspend(void *data) void xen_arch_resume(void) { + int cpu; + on_each_cpu(xen_vcpu_notify_restore, NULL, 1); + + for_each_online_cpu(cpu) + xen_pmu_init(cpu); } void xen_arch_suspend(void) { + int cpu; + + for_each_online_cpu(cpu) + xen_pmu_finish(cpu); + on_each_cpu(xen_vcpu_notify_suspend, NULL, 1); } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index f1ba6a092854..a0a4e554c6f1 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -16,6 +16,7 @@ #include <linux/gfp.h> #include <linux/slab.h> #include <linux/pvclock_gtod.h> +#include <linux/timekeeper_internal.h> #include <asm/pvclock.h> #include <asm/xen/hypervisor.h> @@ -32,86 +33,12 @@ #define TIMER_SLOP 100000 #define NS_PER_TICK (1000000000LL / HZ) -/* runstate info updated by Xen */ -static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); - /* snapshots of runstate info */ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot); /* unused ns of stolen time */ static DEFINE_PER_CPU(u64, xen_residual_stolen); -/* return an consistent snapshot of 64-bit time/counter value */ -static u64 get64(const u64 *p) -{ - u64 ret; - - if (BITS_PER_LONG < 64) { - u32 *p32 = (u32 *)p; - u32 h, l; - - /* - * Read high then low, and then make sure high is - * still the same; this will only loop if low wraps - * and carries into high. - * XXX some clean way to make this endian-proof? - */ - do { - h = p32[1]; - barrier(); - l = p32[0]; - barrier(); - } while (p32[1] != h); - - ret = (((u64)h) << 32) | l; - } else - ret = *p; - - return ret; -} - -/* - * Runstate accounting - */ -static void get_runstate_snapshot(struct vcpu_runstate_info *res) -{ - u64 state_time; - struct vcpu_runstate_info *state; - - BUG_ON(preemptible()); - - state = this_cpu_ptr(&xen_runstate); - - /* - * The runstate info is always updated by the hypervisor on - * the current CPU, so there's no need to use anything - * stronger than a compiler barrier when fetching it. - */ - do { - state_time = get64(&state->state_entry_time); - barrier(); - *res = *state; - barrier(); - } while (get64(&state->state_entry_time) != state_time); -} - -/* return true when a vcpu could run but has no real cpu to run on */ -bool xen_vcpu_stolen(int vcpu) -{ - return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable; -} - -void xen_setup_runstate_info(int cpu) -{ - struct vcpu_register_runstate_memory_area area; - - area.addr.v = &per_cpu(xen_runstate, cpu); - - if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, - cpu, &area)) - BUG(); -} - static void do_stolen_accounting(void) { struct vcpu_runstate_info state; @@ -119,7 +46,7 @@ static void do_stolen_accounting(void) s64 runnable, offline, stolen; cputime_t ticks; - get_runstate_snapshot(&state); + xen_get_runstate_snapshot(&state); WARN_ON(state.state != RUNSTATE_running); @@ -194,26 +121,46 @@ static int xen_pvclock_gtod_notify(struct notifier_block *nb, unsigned long was_set, void *priv) { /* Protected by the calling core code serialization */ - static struct timespec next_sync; + static struct timespec64 next_sync; struct xen_platform_op op; - struct timespec now; + struct timespec64 now; + struct timekeeper *tk = priv; + static bool settime64_supported = true; + int ret; - now = __current_kernel_time(); + now.tv_sec = tk->xtime_sec; + now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); /* * We only take the expensive HV call when the clock was set * or when the 11 minutes RTC synchronization time elapsed. */ - if (!was_set && timespec_compare(&now, &next_sync) < 0) + if (!was_set && timespec64_compare(&now, &next_sync) < 0) return NOTIFY_OK; - op.cmd = XENPF_settime; - op.u.settime.secs = now.tv_sec; - op.u.settime.nsecs = now.tv_nsec; - op.u.settime.system_time = xen_clocksource_read(); +again: + if (settime64_supported) { + op.cmd = XENPF_settime64; + op.u.settime64.mbz = 0; + op.u.settime64.secs = now.tv_sec; + op.u.settime64.nsecs = now.tv_nsec; + op.u.settime64.system_time = xen_clocksource_read(); + } else { + op.cmd = XENPF_settime32; + op.u.settime32.secs = now.tv_sec; + op.u.settime32.nsecs = now.tv_nsec; + op.u.settime32.system_time = xen_clocksource_read(); + } + + ret = HYPERVISOR_platform_op(&op); - (void)HYPERVISOR_dom0_op(&op); + if (ret == -ENOSYS && settime64_supported) { + settime64_supported = false; + goto again; + } + if (ret < 0) + return NOTIFY_BAD; /* * Move the next drift compensation time 11 minutes diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index fd92a64d748e..feb6d40a0860 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -35,20 +35,6 @@ check_events: ret /* - * We can't use sysexit directly, because we're not running in ring0. - * But we can easily fake it up using iret. Assuming xen_sysexit is - * jumped to with a standard stack frame, we can just strip it back to - * a standard iret frame and use iret. - */ -ENTRY(xen_sysexit) - movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ - orl $X86_EFLAGS_IF, PT_EFLAGS(%esp) - lea PT_EIP(%esp), %esp - - jmp xen_iret -ENDPROC(xen_sysexit) - -/* * This is run where a normal iret would be run, with the same stack setup: * 8: eflags * 4: cs diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index f22667abf7b9..cc8acc410ddb 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -68,25 +68,6 @@ ENTRY(xen_sysret64) ENDPATCH(xen_sysret64) RELOC(xen_sysret64, 1b+1) -ENTRY(xen_sysret32) - /* - * We're already on the usermode stack at this point, but - * still with the kernel gs, so we can easily switch back - */ - movq %rsp, PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - - pushq $__USER32_DS - pushq PER_CPU_VAR(rsp_scratch) - pushq %r11 - pushq $__USER32_CS - pushq %rcx - - pushq $0 -1: jmp hypercall_iret -ENDPATCH(xen_sysret32) -RELOC(xen_sysret32, 1b+1) - /* * Xen handles syscall callbacks much like ordinary exceptions, which * means we have: diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 1399423f3418..4140b070f2e9 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -139,9 +139,6 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long); /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); -#ifdef CONFIG_X86_32 -__visible void xen_sysexit(void); -#endif __visible void xen_sysret32(void); __visible void xen_sysret64(void); __visible void xen_adjust_exception_frame(void); |