diff options
author | Arnd Bergmann <arnd@arndb.de> | 2012-10-04 22:57:00 +0200 |
---|---|---|
committer | Arnd Bergmann <arnd@arndb.de> | 2012-10-04 22:57:51 +0200 |
commit | c37d6154c0b9163c27e53cc1d0be3867b4abd760 (patch) | |
tree | 7a24522c56d1cb284dff1d3c225bbdaba0901bb5 /arch/x86 | |
parent | e7a570ff7dff9af6e54ff5e580a61ec7652137a0 (diff) | |
parent | 8a1ab3155c2ac7fbe5f2038d6e26efeb607a1498 (diff) | |
download | blackbird-op-linux-c37d6154c0b9163c27e53cc1d0be3867b4abd760.tar.gz blackbird-op-linux-c37d6154c0b9163c27e53cc1d0be3867b4abd760.zip |
Merge branch 'disintegrate-asm-generic' of git://git.infradead.org/users/dhowells/linux-headers into asm-generic
Patches from David Howells <dhowells@redhat.com>:
This is to complete part of the UAPI disintegration for which the
preparatory patches were pulled recently.
Note that there are some fixup patches which are at the base of the
branch aimed at you, plus all arches get the asm-generic branch merged in too.
* 'disintegrate-asm-generic' of git://git.infradead.org/users/dhowells/linux-headers:
UAPI: (Scripted) Disintegrate include/asm-generic
UAPI: Fix conditional header installation handling (notably kvm_para.h on m68k)
c6x: remove c6x signal.h
UAPI: Split compound conditionals containing __KERNEL__ in Arm64
UAPI: Fix the guards on various asm/unistd.h files
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Diffstat (limited to 'arch/x86')
151 files changed, 3641 insertions, 2074 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 50a1d1f9b6d3..7f9a395c5254 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -7,11 +7,13 @@ config 64BIT Say no to build a 32-bit kernel - formerly known as i386 config X86_32 - def_bool !64BIT + def_bool y + depends on !64BIT select CLKSRC_I8253 config X86_64 - def_bool 64BIT + def_bool y + depends on 64BIT select X86_DEV_DMA_OPS ### Arch settings @@ -36,6 +38,7 @@ config X86 select HAVE_KRETPROBES select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FENTRY if X86_64 select HAVE_C_RECORDMCOUNT select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER @@ -60,6 +63,8 @@ config X86 select HAVE_MIXED_BREAKPOINTS_REGS select PERF_EVENTS select HAVE_PERF_EVENTS_NMI + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP select ANON_INODES select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 select HAVE_CMPXCHG_LOCAL if !M386 @@ -97,9 +102,12 @@ config X86 select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER + select HAVE_RCU_USER_QS if X86_64 + select HAVE_IRQ_TIME_ACCOUNTING config INSTRUCTION_DECODER - def_bool (KPROBES || PERF_EVENTS || UPROBES) + def_bool y + depends on KPROBES || PERF_EVENTS || UPROBES config OUTPUT_FORMAT string @@ -127,13 +135,15 @@ config SBUS bool config NEED_DMA_MAP_STATE - def_bool (X86_64 || INTEL_IOMMU || DMA_API_DEBUG) + def_bool y + depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG config NEED_SG_DMA_LENGTH def_bool y config GENERIC_ISA_DMA - def_bool ISA_DMA_API + def_bool y + depends on ISA_DMA_API config GENERIC_BUG def_bool y @@ -150,13 +160,16 @@ config GENERIC_GPIO bool config ARCH_MAY_HAVE_PC_FDC - def_bool ISA_DMA_API + def_bool y + depends on ISA_DMA_API config RWSEM_GENERIC_SPINLOCK - def_bool !X86_XADD + def_bool y + depends on !X86_XADD config RWSEM_XCHGADD_ALGORITHM - def_bool X86_XADD + def_bool y + depends on X86_XADD config GENERIC_CALIBRATE_DELAY def_bool y @@ -752,7 +765,8 @@ config SWIOTLB If unsure, say Y. config IOMMU_HELPER - def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU) + def_bool y + depends on CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU config MAXSMP bool "Enable Maximum number of SMP Processors and NUMA Nodes" @@ -796,17 +810,6 @@ config SCHED_MC making when dealing with multi-core CPU chips at a cost of slightly increased overhead in some places. If unsure say N here. -config IRQ_TIME_ACCOUNTING - bool "Fine granularity task level IRQ time accounting" - default n - ---help--- - Select this option to enable fine granularity task irq time - accounting. This is done by reading a timestamp on each - transitions between softirq and hardirq state, so there can be a - small performance impact. - - If in doubt, say N here. - source "kernel/Kconfig.preempt" config X86_UP_APIC @@ -871,6 +874,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS config X86_MCE bool "Machine Check / overheating reporting" + default y ---help--- Machine Check support allows the processor to notify the kernel if it detects a problem (e.g. overheating, data corruption). @@ -982,25 +986,25 @@ config X86_REBOOTFIXUPS Say N otherwise. config MICROCODE - tristate "/dev/cpu/microcode - microcode support" + tristate "CPU microcode loading support" select FW_LOADER ---help--- + If you say Y here, you will be able to update the microcode on certain Intel and AMD processors. The Intel support is for the - IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, - Pentium 4, Xeon etc. The AMD support is for family 0x10 and - 0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra. - You will obviously need the actual microcode binary data itself - which is not shipped with the Linux kernel. + IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, + Xeon etc. The AMD support is for families 0x10 and later. You will + obviously need the actual microcode binary data itself which is not + shipped with the Linux kernel. This option selects the general module only, you need to select at least one vendor specific module as well. - To compile this driver as a module, choose M here: the - module will be called microcode. + To compile this driver as a module, choose M here: the module + will be called microcode. config MICROCODE_INTEL - bool "Intel microcode patch loading support" + bool "Intel microcode loading support" depends on MICROCODE default MICROCODE select FW_LOADER @@ -1013,7 +1017,7 @@ config MICROCODE_INTEL <http://www.urbanmyth.org/microcode/>. config MICROCODE_AMD - bool "AMD microcode patch loading support" + bool "AMD microcode loading support" depends on MICROCODE select FW_LOADER ---help--- @@ -1159,10 +1163,12 @@ config X86_PAE consumes more pagetable space per process. config ARCH_PHYS_ADDR_T_64BIT - def_bool X86_64 || X86_PAE + def_bool y + depends on X86_64 || X86_PAE config ARCH_DMA_ADDR_T_64BIT - def_bool X86_64 || HIGHMEM64G + def_bool y + depends on X86_64 || HIGHMEM64G config DIRECT_GBPAGES bool "Enable 1GB pages for kernel pagetables" if EXPERT @@ -1285,8 +1291,8 @@ config ARCH_SELECT_MEMORY_MODEL depends on ARCH_SPARSEMEM_ENABLE config ARCH_MEMORY_PROBE - def_bool X86_64 - depends on MEMORY_HOTPLUG + def_bool y + depends on X86_64 && MEMORY_HOTPLUG config ARCH_PROC_KCORE_TEXT def_bool y @@ -1487,6 +1493,17 @@ config ARCH_RANDOM If supported, this is a high bandwidth, cryptographically secure hardware random number generator. +config X86_SMAP + def_bool y + prompt "Supervisor Mode Access Prevention" if EXPERT + ---help--- + Supervisor Mode Access Prevention (SMAP) is a security + feature in newer Intel processors. There is a small + performance cost if this enabled and turned on; there is + also a small increase in the kernel size if this is enabled. + + If unsure, say Y. + config EFI bool "EFI runtime service support" depends on ACPI @@ -1975,7 +1992,6 @@ config PCI_MMCONFIG config PCI_CNB20LE_QUIRK bool "Read CNB20LE Host Bridge Windows" if EXPERT - default n depends on PCI && EXPERIMENTAL help Read the PCI windows out of the CNB20LE host bridge. This allows @@ -2186,18 +2202,18 @@ config COMPAT depends on IA32_EMULATION || X86_X32 select ARCH_WANT_OLD_COMPAT_IPC +if COMPAT config COMPAT_FOR_U64_ALIGNMENT - def_bool COMPAT - depends on X86_64 + def_bool y config SYSVIPC_COMPAT def_bool y - depends on COMPAT && SYSVIPC + depends on SYSVIPC config KEYS_COMPAT - bool - depends on COMPAT && KEYS - default y + def_bool y + depends on KEYS +endif endmenu diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 706e12e9984b..f3b86d0df44e 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -306,7 +306,8 @@ config X86_INTERNODE_CACHE_SHIFT default X86_L1_CACHE_SHIFT config X86_CMPXCHG - def_bool X86_64 || (X86_32 && !M386) + def_bool y + depends on X86_64 || (X86_32 && !M386) config X86_L1_CACHE_SHIFT int @@ -317,7 +318,7 @@ config X86_L1_CACHE_SHIFT config X86_XADD def_bool y - depends on X86_64 || !M386 + depends on !M386 config X86_PPRO_FENCE bool "PentiumPro memory ordering errata workaround" diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index f7535bedc33f..ce03476d8c8f 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -37,7 +37,7 @@ setup-y += video-bios.o targets += $(setup-y) hostprogs-y := mkcpustr tools/build -HOST_EXTRACFLAGS += -I$(srctree)/tools/include $(LINUXINCLUDE) \ +HOST_EXTRACFLAGS += -I$(srctree)/tools/include $(USERINCLUDE) \ -D__EXPORTED_HEADERS__ $(obj)/cpu.o: $(obj)/cpustr.h @@ -52,7 +52,7 @@ $(obj)/cpustr.h: $(obj)/mkcpustr FORCE # How to compile the 16-bit code. Note we always compile for -march=i386, # that way we can complain to the user if the CPU is insufficient. -KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ +KBUILD_CFLAGS := $(USERINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes \ -march=i386 -mregparm=3 \ diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index e398bb5d63bb..8a84501acb1b 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -28,6 +28,9 @@ VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \ $(obj)/piggy.o +$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone +$(obj)/efi_stub_$(BITS).o: KBUILD_CLFAGS += -fshort-wchar -mno-red-zone + ifeq ($(CONFIG_EFI_STUB), y) VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o endif diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index b3e0227df2c9..c760e073963e 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -276,8 +276,9 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, nr_gops = size / sizeof(void *); for (i = 0; i < nr_gops; i++) { struct efi_graphics_output_mode_info *info; - efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; - void *pciio; + efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; + bool conout_found = false; + void *dummy; void *h = gop_handle[i]; status = efi_call_phys3(sys_table->boottime->handle_protocol, @@ -285,19 +286,21 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, if (status != EFI_SUCCESS) continue; - efi_call_phys3(sys_table->boottime->handle_protocol, - h, &pciio_proto, &pciio); + status = efi_call_phys3(sys_table->boottime->handle_protocol, + h, &conout_proto, &dummy); + + if (status == EFI_SUCCESS) + conout_found = true; status = efi_call_phys4(gop->query_mode, gop, gop->mode->mode, &size, &info); - if (status == EFI_SUCCESS && (!first_gop || pciio)) { + if (status == EFI_SUCCESS && (!first_gop || conout_found)) { /* - * Apple provide GOPs that are not backed by - * real hardware (they're used to handle - * multiple displays). The workaround is to - * search for a GOP implementing the PCIIO - * protocol, and if one isn't found, to just - * fallback to the first GOP. + * Systems that use the UEFI Console Splitter may + * provide multiple GOP devices, not all of which are + * backed by real hardware. The workaround is to search + * for a GOP implementing the ConOut protocol, and if + * one isn't found, to just fall back to the first GOP. */ width = info->horizontal_resolution; height = info->vertical_resolution; @@ -308,10 +311,10 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, pixels_per_scan_line = info->pixels_per_scan_line; /* - * Once we've found a GOP supporting PCIIO, + * Once we've found a GOP supporting ConOut, * don't bother looking any further. */ - if (pciio) + if (conout_found) break; first_gop = gop; @@ -328,7 +331,6 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, si->lfb_width = width; si->lfb_height = height; si->lfb_base = fb_base; - si->lfb_size = fb_size; si->pages = 1; if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { @@ -376,6 +378,10 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, si->rsvd_pos = 0; } + si->lfb_size = si->lfb_linelength * si->lfb_height; + + si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; + free_handle: efi_call_phys1(sys_table->boottime->free_pool, gop_handle); return status; diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index 3b6e15627c55..e5b0a8f91c5f 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -14,6 +14,10 @@ #define EFI_PAGE_SIZE (1UL << EFI_PAGE_SHIFT) #define EFI_READ_CHUNK_SIZE (1024 * 1024) +#define EFI_CONSOLE_OUT_DEVICE_GUID \ + EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, 0x9a, 0x46, 0x0, 0x90, 0x27, \ + 0x3f, 0xc1, 0x4d) + #define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0 #define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1 #define PIXEL_BIT_MASK 2 diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index b4e15dd6786a..2a017441b8b2 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -32,10 +32,6 @@ SYSSEG = 0x1000 /* historical load address >> 4 */ #define SVGA_MODE ASK_VGA #endif -#ifndef RAMDISK -#define RAMDISK 0 -#endif - #ifndef ROOT_RDONLY #define ROOT_RDONLY 1 #endif diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index 919257f526f2..4579eff0ef4d 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -15,6 +15,8 @@ #include <stdio.h> +#include "../include/asm/required-features.h" +#include "../include/asm/cpufeature.h" #include "../kernel/cpu/capflags.c" int main(void) diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 119db67dcb03..5598547281a7 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -8,6 +8,8 @@ CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y CONFIG_CGROUP_FREEZER=y @@ -34,8 +36,6 @@ CONFIG_SGI_PARTITION=y CONFIG_SUN_PARTITION=y CONFIG_KARMA_PARTITION=y CONFIG_EFI_PARTITION=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y CONFIG_SMP=y CONFIG_X86_GENERIC=y CONFIG_HPET_TIMER=y @@ -144,8 +144,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEBUG_DEVRES=y CONFIG_CONNECTOR=y CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=16384 CONFIG_BLK_DEV_SD=y CONFIG_BLK_DEV_SR=y CONFIG_BLK_DEV_SR_VENDOR=y @@ -231,8 +229,6 @@ CONFIG_SND_HRTIMER=y CONFIG_SND_HDA_INTEL=y CONFIG_SND_HDA_HWDEP=y CONFIG_HIDRAW=y -CONFIG_HID_PID=y -CONFIG_USB_HIDDEV=y CONFIG_HID_GYRATION=y CONFIG_LOGITECH_FF=y CONFIG_HID_NTRIG=y @@ -243,11 +239,11 @@ CONFIG_HID_SAMSUNG=y CONFIG_HID_SONY=y CONFIG_HID_SUNPLUS=y CONFIG_HID_TOPSEED=y +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y CONFIG_USB=y CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y -CONFIG_USB_DEVICEFS=y -# CONFIG_USB_DEVICE_CLASS is not set CONFIG_USB_MON=y CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_TT_NEWSCHED is not set @@ -262,10 +258,9 @@ CONFIG_RTC_CLASS=y CONFIG_DMADEVICES=y CONFIG_EEEPC_LAPTOP=y CONFIG_EFI_VARS=y -CONFIG_EXT3_FS=y -# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set -CONFIG_EXT3_FS_POSIX_ACL=y -CONFIG_EXT3_FS_SECURITY=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y CONFIG_QUOTA=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -280,7 +275,6 @@ CONFIG_PROC_KCORE=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y CONFIG_NFS_FS=y -CONFIG_NFS_V3=y CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y @@ -299,13 +293,11 @@ CONFIG_DEBUG_KERNEL=y CONFIG_SCHEDSTATS=y CONFIG_TIMER_STATS=y CONFIG_DEBUG_STACK_USAGE=y -CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_PROVIDE_OHCI1394_DMA_INIT=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set -CONFIG_DEBUG_NX_TEST=m CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_OPTIMIZE_INLINING=y CONFIG_KEYS_DEBUG_PROC_KEYS=y @@ -316,4 +308,3 @@ CONFIG_SECURITY_SELINUX_BOOTPARAM=y CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_CRYPTO_AES_586=y # CONFIG_CRYPTO_ANSI_CPRNG is not set -CONFIG_CRC_T10DIF=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 76eb2903809f..671524d0f6c0 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -8,6 +8,8 @@ CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y CONFIG_CGROUP_FREEZER=y @@ -34,8 +36,6 @@ CONFIG_SGI_PARTITION=y CONFIG_SUN_PARTITION=y CONFIG_KARMA_PARTITION=y CONFIG_EFI_PARTITION=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y CONFIG_SMP=y CONFIG_CALGARY_IOMMU=y CONFIG_NR_CPUS=64 @@ -144,8 +144,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEBUG_DEVRES=y CONFIG_CONNECTOR=y CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=16384 CONFIG_BLK_DEV_SD=y CONFIG_BLK_DEV_SR=y CONFIG_BLK_DEV_SR_VENDOR=y @@ -227,8 +225,6 @@ CONFIG_SND_HRTIMER=y CONFIG_SND_HDA_INTEL=y CONFIG_SND_HDA_HWDEP=y CONFIG_HIDRAW=y -CONFIG_HID_PID=y -CONFIG_USB_HIDDEV=y CONFIG_HID_GYRATION=y CONFIG_LOGITECH_FF=y CONFIG_HID_NTRIG=y @@ -239,11 +235,11 @@ CONFIG_HID_SAMSUNG=y CONFIG_HID_SONY=y CONFIG_HID_SUNPLUS=y CONFIG_HID_TOPSEED=y +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y CONFIG_USB=y CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y -CONFIG_USB_DEVICEFS=y -# CONFIG_USB_DEVICE_CLASS is not set CONFIG_USB_MON=y CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_TT_NEWSCHED is not set @@ -262,10 +258,9 @@ CONFIG_AMD_IOMMU_STATS=y CONFIG_INTEL_IOMMU=y # CONFIG_INTEL_IOMMU_DEFAULT_ON is not set CONFIG_EFI_VARS=y -CONFIG_EXT3_FS=y -# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set -CONFIG_EXT3_FS_POSIX_ACL=y -CONFIG_EXT3_FS_SECURITY=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y CONFIG_QUOTA=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -280,7 +275,6 @@ CONFIG_PROC_KCORE=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y CONFIG_NFS_FS=y -CONFIG_NFS_V3=y CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y @@ -298,13 +292,11 @@ CONFIG_DEBUG_KERNEL=y CONFIG_SCHEDSTATS=y CONFIG_TIMER_STATS=y CONFIG_DEBUG_STACK_USAGE=y -CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_PROVIDE_OHCI1394_DMA_INIT=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set -CONFIG_DEBUG_NX_TEST=m CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_OPTIMIZE_INLINING=y CONFIG_KEYS_DEBUG_PROC_KEYS=y @@ -314,4 +306,3 @@ CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y CONFIG_SECURITY_SELINUX_DISABLE=y # CONFIG_CRYPTO_ANSI_CPRNG is not set -CONFIG_CRC_T10DIF=y diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 673ac9b63d6b..efc6a958b71d 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -32,6 +32,7 @@ #include <asm/sigframe.h> #include <asm/sighandling.h> #include <asm/sys_ia32.h> +#include <asm/smap.h> #define FIX_EFLAGS __FIX_EFLAGS @@ -162,7 +163,8 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, } seg = get_fs(); set_fs(KERNEL_DS); - ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->sp); + ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL), + (stack_t __force __user *) &uoss, regs->sp); set_fs(seg); if (ret >= 0 && uoss_ptr) { if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(stack_ia32_t))) @@ -250,11 +252,12 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, get_user_ex(tmp, &sc->fpstate); buf = compat_ptr(tmp); - err |= restore_i387_xstate_ia32(buf); get_user_ex(*pax, &sc->ax); } get_user_catch(err); + err |= restore_xstate_sig(buf, 1); + return err; } @@ -361,7 +364,7 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, */ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, - void **fpstate) + void __user **fpstate) { unsigned long sp; @@ -381,9 +384,12 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, sp = (unsigned long) ka->sa.sa_restorer; if (used_math()) { - sp = sp - sig_xstate_ia32_size; - *fpstate = (struct _fpstate_ia32 *) sp; - if (save_i387_xstate_ia32(*fpstate) < 0) + unsigned long fx_aligned, math_size; + + sp = alloc_mathframe(sp, 1, &fx_aligned, &math_size); + *fpstate = (struct _fpstate_ia32 __user *) sp; + if (save_xstate_sig(*fpstate, (void __user *)fx_aligned, + math_size) < 0) return (void __user *) -1L; } @@ -448,7 +454,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, * These are actually not used anymore, but left because some * gdb versions depend on them as a marker. */ - put_user_ex(*((u64 *)&code), (u64 *)frame->retcode); + put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode); } put_user_catch(err); if (err) @@ -502,7 +508,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, put_user_ex(sig, &frame->sig); put_user_ex(ptr_to_compat(&frame->info), &frame->pinfo); put_user_ex(ptr_to_compat(&frame->uc), &frame->puc); - err |= copy_siginfo_to_user32(&frame->info, info); /* Create the ucontext. */ if (cpu_has_xsave) @@ -514,9 +519,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, put_user_ex(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; @@ -529,9 +531,14 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, * Not actually used anymore, but left because some gdb * versions need it. */ - put_user_ex(*((u64 *)&code), (u64 *)frame->retcode); + put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode); } put_user_catch(err); + err |= copy_siginfo_to_user32(&frame->info, info); + err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + if (err) return -EFAULT; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 20e5f7ba0e6b..9c289504e680 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -14,6 +14,7 @@ #include <asm/segment.h> #include <asm/irqflags.h> #include <asm/asm.h> +#include <asm/smap.h> #include <linux/linkage.h> #include <linux/err.h> @@ -146,8 +147,10 @@ ENTRY(ia32_sysenter_target) SAVE_ARGS 0,1,0 /* no need to do an access_ok check here because rbp has been 32bit zero extended */ + ASM_STAC 1: movl (%rbp),%ebp _ASM_EXTABLE(1b,ia32_badarg) + ASM_CLAC orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) CFI_REMEMBER_STATE @@ -301,8 +304,10 @@ ENTRY(ia32_cstar_target) /* no need to do an access_ok check here because r8 has been 32bit zero extended */ /* hardware stack frame is complete now */ + ASM_STAC 1: movl (%r8),%r9d _ASM_EXTABLE(1b,ia32_badarg) + ASM_CLAC orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) CFI_REMEMBER_STATE @@ -365,6 +370,7 @@ cstar_tracesys: END(ia32_cstar_target) ia32_badarg: + ASM_CLAC movq $-EFAULT,%rax jmp ia32_sysret CFI_ENDPROC diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 4540bece0946..c5b938d92eab 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -287,7 +287,7 @@ asmlinkage long sys32_sigaction(int sig, struct old_sigaction32 __user *act, return ret; } -asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, +asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int __user *stat_addr, int options) { return compat_sys_wait4(pid, stat_addr, options, NULL); diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 952bd0100c5c..372231c22a47 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -1,3 +1,6 @@ +#ifndef _ASM_X86_ALTERNATIVE_ASM_H +#define _ASM_X86_ALTERNATIVE_ASM_H + #ifdef __ASSEMBLY__ #include <asm/asm.h> @@ -5,10 +8,10 @@ #ifdef CONFIG_SMP .macro LOCK_PREFIX 672: lock - .section .smp_locks,"a" + .pushsection .smp_locks,"a" .balign 4 .long 672b - . - .previous + .popsection .endm #else .macro LOCK_PREFIX @@ -24,3 +27,5 @@ .endm #endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_ALTERNATIVE_ASM_H */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 70780689599a..58ed6d96a6ac 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -29,10 +29,10 @@ #ifdef CONFIG_SMP #define LOCK_PREFIX_HERE \ - ".section .smp_locks,\"a\"\n" \ - ".balign 4\n" \ - ".long 671f - .\n" /* offset */ \ - ".previous\n" \ + ".pushsection .smp_locks,\"a\"\n" \ + ".balign 4\n" \ + ".long 671f - .\n" /* offset */ \ + ".popsection\n" \ "671:" #define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; " @@ -60,7 +60,7 @@ extern void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end); extern void alternatives_smp_module_del(struct module *mod); -extern void alternatives_smp_switch(int smp); +extern void alternatives_enable_smp(void); extern int alternatives_text_reserved(void *start, void *end); extern bool skip_smp_alternatives; #else @@ -68,7 +68,7 @@ static inline void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end) {} static inline void alternatives_smp_module_del(struct module *mod) {} -static inline void alternatives_smp_switch(int smp) {} +static inline void alternatives_enable_smp(void) {} static inline int alternatives_text_reserved(void *start, void *end) { return 0; @@ -99,30 +99,30 @@ static inline int alternatives_text_reserved(void *start, void *end) /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, newinstr, feature) \ OLDINSTR(oldinstr) \ - ".section .altinstructions,\"a\"\n" \ + ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(feature, 1) \ - ".previous\n" \ - ".section .discard,\"aw\",@progbits\n" \ + ".popsection\n" \ + ".pushsection .discard,\"aw\",@progbits\n" \ DISCARD_ENTRY(1) \ - ".previous\n" \ - ".section .altinstr_replacement, \"ax\"\n" \ + ".popsection\n" \ + ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ - ".previous" + ".popsection" #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ OLDINSTR(oldinstr) \ - ".section .altinstructions,\"a\"\n" \ + ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(feature1, 1) \ ALTINSTR_ENTRY(feature2, 2) \ - ".previous\n" \ - ".section .discard,\"aw\",@progbits\n" \ + ".popsection\n" \ + ".pushsection .discard,\"aw\",@progbits\n" \ DISCARD_ENTRY(1) \ DISCARD_ENTRY(2) \ - ".previous\n" \ - ".section .altinstr_replacement, \"ax\"\n" \ + ".popsection\n" \ + ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ - ".previous" + ".popsection" /* * This must be included *after* the definition of ALTERNATIVE due to diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 58cb6d4085f7..250b8774c158 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -309,9 +309,9 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2) #define smp_mb__after_atomic_inc() barrier() #ifdef CONFIG_X86_32 -# include "atomic64_32.h" +# include <asm/atomic64_32.h> #else -# include "atomic64_64.h" +# include <asm/atomic64_64.h> #endif #endif /* _ASM_X86_ATOMIC_H */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 72f5009deb5a..6dfd0195bb55 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -355,7 +355,7 @@ static int test_bit(int nr, const volatile unsigned long *addr); */ static inline unsigned long __ffs(unsigned long word) { - asm("bsf %1,%0" + asm("rep; bsf %1,%0" : "=r" (word) : "rm" (word)); return word; @@ -369,7 +369,7 @@ static inline unsigned long __ffs(unsigned long word) */ static inline unsigned long ffz(unsigned long word) { - asm("bsf %1,%0" + asm("rep; bsf %1,%0" : "=r" (word) : "r" (~word)); return word; @@ -417,10 +417,9 @@ static inline int ffs(int x) * We cannot do this on 32 bits because at the very least some * 486 CPUs did not behave this way. */ - long tmp = -1; asm("bsfl %1,%0" : "=r" (r) - : "rm" (x), "0" (tmp)); + : "rm" (x), "0" (-1)); #elif defined(CONFIG_X86_CMOV) asm("bsfl %1,%0\n\t" "cmovzl %2,%0" @@ -459,10 +458,9 @@ static inline int fls(int x) * We cannot do this on 32 bits because at the very least some * 486 CPUs did not behave this way. */ - long tmp = -1; asm("bsrl %1,%0" : "=r" (r) - : "rm" (x), "0" (tmp)); + : "rm" (x), "0" (-1)); #elif defined(CONFIG_X86_CMOV) asm("bsrl %1,%0\n\t" "cmovzl %2,%0" @@ -490,13 +488,13 @@ static inline int fls(int x) #ifdef CONFIG_X86_64 static __always_inline int fls64(__u64 x) { - long bitpos = -1; + int bitpos = -1; /* * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its * value is written to set it to the same as before. */ - asm("bsrq %1,%0" + asm("bsrq %1,%q0" : "+r" (bitpos) : "rm" (x)); return bitpos + 1; diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index a9e3a740f697..0fa675033912 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -46,41 +46,39 @@ For 32-bit we have the following conventions - kernel is built with */ -#include "dwarf2.h" +#include <asm/dwarf2.h> /* - * 64-bit system call stack frame layout defines and helpers, for - * assembly code (note that the seemingly unnecessary parentheses - * are to prevent cpp from inserting spaces in expressions that get - * passed to macros): + * 64-bit system call stack frame layout defines and helpers, + * for assembly code: */ -#define R15 (0) -#define R14 (8) -#define R13 (16) -#define R12 (24) -#define RBP (32) -#define RBX (40) +#define R15 0 +#define R14 8 +#define R13 16 +#define R12 24 +#define RBP 32 +#define RBX 40 /* arguments: interrupts/non tracing syscalls only save up to here: */ -#define R11 (48) -#define R10 (56) -#define R9 (64) -#define R8 (72) -#define RAX (80) -#define RCX (88) -#define RDX (96) -#define RSI (104) -#define RDI (112) -#define ORIG_RAX (120) /* + error_code */ +#define R11 48 +#define R10 56 +#define R9 64 +#define R8 72 +#define RAX 80 +#define RCX 88 +#define RDX 96 +#define RSI 104 +#define RDI 112 +#define ORIG_RAX 120 /* + error_code */ /* end of arguments */ /* cpu exception frame or undefined in case of fast syscall: */ -#define RIP (128) -#define CS (136) -#define EFLAGS (144) -#define RSP (152) -#define SS (160) +#define RIP 128 +#define CS 136 +#define EFLAGS 144 +#define RSP 152 +#define SS 160 #define ARGOFFSET R11 #define SWFRAME ORIG_RAX diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h index 848850fd7d62..5f5bb0f97361 100644 --- a/arch/x86/include/asm/checksum.h +++ b/arch/x86/include/asm/checksum.h @@ -1,5 +1,5 @@ #ifdef CONFIG_X86_32 -# include "checksum_32.h" +# include <asm/checksum_32.h> #else -# include "checksum_64.h" +# include <asm/checksum_64.h> #endif diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 99480e55973d..8d871eaddb66 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -138,9 +138,9 @@ extern void __add_wrong_size(void) __raw_cmpxchg((ptr), (old), (new), (size), "") #ifdef CONFIG_X86_32 -# include "cmpxchg_32.h" +# include <asm/cmpxchg_32.h> #else -# include "cmpxchg_64.h" +# include <asm/cmpxchg_64.h> #endif #ifdef __HAVE_ARCH_CMPXCHG diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 6b7ee5ff6820..8c297aa53eef 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -4,7 +4,9 @@ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H +#ifndef _ASM_X86_REQUIRED_FEATURES_H #include <asm/required-features.h> +#endif #define NCAPINTS 10 /* N 32-bit words worth of info */ @@ -97,6 +99,7 @@ #define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */ #define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */ +#define X86_FEATURE_EAGER_FPU (3*32+29) /* "eagerfpu" Non lazy FPU restore */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ @@ -209,6 +212,7 @@ #define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ #define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) @@ -299,12 +303,14 @@ extern const char * const x86_power_flags[32]; #define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) +#define cpu_has_xsaveopt boot_cpu_has(X86_FEATURE_XSAVEOPT) #define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) #define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) #define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE) #define cpu_has_cx8 boot_cpu_has(X86_FEATURE_CX8) #define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16) +#define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 75f4c6d6a331..831dbb9c6c02 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -12,6 +12,7 @@ #include <linux/kernel_stat.h> #include <linux/regset.h> +#include <linux/compat.h> #include <linux/slab.h> #include <asm/asm.h> #include <asm/cpufeature.h> @@ -20,43 +21,76 @@ #include <asm/user.h> #include <asm/uaccess.h> #include <asm/xsave.h> +#include <asm/smap.h> -extern unsigned int sig_xstate_size; +#ifdef CONFIG_X86_64 +# include <asm/sigcontext32.h> +# include <asm/user32.h> +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + compat_sigset_t *set, struct pt_regs *regs); +int ia32_setup_frame(int sig, struct k_sigaction *ka, + compat_sigset_t *set, struct pt_regs *regs); +#else +# define user_i387_ia32_struct user_i387_struct +# define user32_fxsr_struct user_fxsr_struct +# define ia32_setup_frame __setup_frame +# define ia32_setup_rt_frame __setup_rt_frame +#endif + +extern unsigned int mxcsr_feature_mask; extern void fpu_init(void); +extern void eager_fpu_init(void); DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); +extern void convert_from_fxsr(struct user_i387_ia32_struct *env, + struct task_struct *tsk); +extern void convert_to_fxsr(struct task_struct *tsk, + const struct user_i387_ia32_struct *env); + extern user_regset_active_fn fpregs_active, xfpregs_active; extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, xstateregs_get; extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, xstateregs_set; - /* * xstateregs_active == fpregs_active. Please refer to the comment * at the definition of fpregs_active. */ #define xstateregs_active fpregs_active -extern struct _fpx_sw_bytes fx_sw_reserved; -#ifdef CONFIG_IA32_EMULATION -extern unsigned int sig_xstate_ia32_size; -extern struct _fpx_sw_bytes fx_sw_reserved_ia32; -struct _fpstate_ia32; -struct _xstate_ia32; -extern int save_i387_xstate_ia32(void __user *buf); -extern int restore_i387_xstate_ia32(void __user *buf); -#endif - #ifdef CONFIG_MATH_EMULATION +# define HAVE_HWFP (boot_cpu_data.hard_math) extern void finit_soft_fpu(struct i387_soft_struct *soft); #else +# define HAVE_HWFP 1 static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} #endif +static inline int is_ia32_compat_frame(void) +{ + return config_enabled(CONFIG_IA32_EMULATION) && + test_thread_flag(TIF_IA32); +} + +static inline int is_ia32_frame(void) +{ + return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(); +} + +static inline int is_x32_frame(void) +{ + return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); +} + #define X87_FSW_ES (1 << 7) /* Exception Summary */ +static __always_inline __pure bool use_eager_fpu(void) +{ + return static_cpu_has(X86_FEATURE_EAGER_FPU); +} + static __always_inline __pure bool use_xsaveopt(void) { return static_cpu_has(X86_FEATURE_XSAVEOPT); @@ -72,6 +106,13 @@ static __always_inline __pure bool use_fxsr(void) return static_cpu_has(X86_FEATURE_FXSR); } +static inline void fx_finit(struct i387_fxsave_struct *fx) +{ + memset(fx, 0, xstate_size); + fx->cwd = 0x37f; + fx->mxcsr = MXCSR_DEFAULT; +} + extern void __sanitize_i387_state(struct task_struct *); static inline void sanitize_i387_state(struct task_struct *tsk) @@ -81,131 +122,121 @@ static inline void sanitize_i387_state(struct task_struct *tsk) __sanitize_i387_state(tsk); } -#ifdef CONFIG_X86_64 -static inline int fxrstor_checking(struct i387_fxsave_struct *fx) -{ - int err; - - /* See comment in fxsave() below. */ -#ifdef CONFIG_AS_FXSAVEQ - asm volatile("1: fxrstorq %[fx]\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err) - : [fx] "m" (*fx), "0" (0)); -#else - asm volatile("1: rex64/fxrstor (%[fx])\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err) - : [fx] "R" (fx), "m" (*fx), "0" (0)); -#endif - return err; +#define user_insn(insn, output, input...) \ +({ \ + int err; \ + asm volatile(ASM_STAC "\n" \ + "1:" #insn "\n\t" \ + "2: " ASM_CLAC "\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define check_insn(insn, output, input...) \ +({ \ + int err; \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +static inline int fsave_user(struct i387_fsave_struct __user *fx) +{ + return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); } static inline int fxsave_user(struct i387_fxsave_struct __user *fx) { - int err; + if (config_enabled(CONFIG_X86_32)) + return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); - /* - * Clear the bytes not touched by the fxsave and reserved - * for the SW usage. - */ - err = __clear_user(&fx->sw_reserved, - sizeof(struct _fpx_sw_bytes)); - if (unlikely(err)) - return -EFAULT; - - /* See comment in fxsave() below. */ -#ifdef CONFIG_AS_FXSAVEQ - asm volatile("1: fxsaveq %[fx]\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err), [fx] "=m" (*fx) - : "0" (0)); -#else - asm volatile("1: rex64/fxsave (%[fx])\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err), "=m" (*fx) - : [fx] "R" (fx), "0" (0)); -#endif - if (unlikely(err) && - __clear_user(fx, sizeof(struct i387_fxsave_struct))) - err = -EFAULT; - /* No need to clear here because the caller clears USED_MATH */ - return err; + /* See comment in fpu_fxsave() below. */ + return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx)); } -static inline void fpu_fxsave(struct fpu *fpu) +static inline int fxrstor_checking(struct i387_fxsave_struct *fx) { - /* Using "rex64; fxsave %0" is broken because, if the memory operand - uses any extended registers for addressing, a second REX prefix - will be generated (to the assembler, rex64 followed by semicolon - is a separate instruction), and hence the 64-bitness is lost. */ + if (config_enabled(CONFIG_X86_32)) + return check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + return check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -#ifdef CONFIG_AS_FXSAVEQ - /* Using "fxsaveq %0" would be the ideal choice, but is only supported - starting with gas 2.16. */ - __asm__ __volatile__("fxsaveq %0" - : "=m" (fpu->state->fxsave)); -#else - /* Using, as a workaround, the properly prefixed form below isn't - accepted by any binutils version so far released, complaining that - the same type of prefix is used twice if an extended register is - needed for addressing (fix submitted to mainline 2005-11-21). - asm volatile("rex64/fxsave %0" - : "=m" (fpu->state->fxsave)); - This, however, we can work around by forcing the compiler to select - an addressing mode that doesn't require extended registers. */ - asm volatile("rex64/fxsave (%[fx])" - : "=m" (fpu->state->fxsave) - : [fx] "R" (&fpu->state->fxsave)); -#endif + /* See comment in fpu_fxsave() below. */ + return check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), + "m" (*fx)); } -#else /* CONFIG_X86_32 */ +static inline int fxrstor_user(struct i387_fxsave_struct __user *fx) +{ + if (config_enabled(CONFIG_X86_32)) + return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -/* perform fxrstor iff the processor has extended states, otherwise frstor */ -static inline int fxrstor_checking(struct i387_fxsave_struct *fx) + /* See comment in fpu_fxsave() below. */ + return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), + "m" (*fx)); +} + +static inline int frstor_checking(struct i387_fsave_struct *fx) { - /* - * The "nop" is needed to make the instructions the same - * length. - */ - alternative_input( - "nop ; frstor %1", - "fxrstor %1", - X86_FEATURE_FXSR, - "m" (*fx)); + return check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} - return 0; +static inline int frstor_user(struct i387_fsave_struct __user *fx) +{ + return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline void fpu_fxsave(struct fpu *fpu) { - asm volatile("fxsave %[fx]" - : [fx] "=m" (fpu->state->fxsave)); + if (config_enabled(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + asm volatile("fxsaveq %0" : "=m" (fpu->state->fxsave)); + else { + /* Using "rex64; fxsave %0" is broken because, if the memory + * operand uses any extended registers for addressing, a second + * REX prefix will be generated (to the assembler, rex64 + * followed by semicolon is a separate instruction), and hence + * the 64-bitness is lost. + * + * Using "fxsaveq %0" would be the ideal choice, but is only + * supported starting with gas 2.16. + * + * Using, as a workaround, the properly prefixed form below + * isn't accepted by any binutils version so far released, + * complaining that the same type of prefix is used twice if + * an extended register is needed for addressing (fix submitted + * to mainline 2005-11-21). + * + * asm volatile("rex64/fxsave %0" : "=m" (fpu->state->fxsave)); + * + * This, however, we can work around by forcing the compiler to + * select an addressing mode that doesn't require extended + * registers. + */ + asm volatile( "rex64/fxsave (%[fx])" + : "=m" (fpu->state->fxsave) + : [fx] "R" (&fpu->state->fxsave)); + } } -#endif /* CONFIG_X86_64 */ - /* * These must be called with preempt disabled. Returns * 'true' if the FPU state is still intact. @@ -248,17 +279,14 @@ static inline int __save_init_fpu(struct task_struct *tsk) return fpu_save_init(&tsk->thread.fpu); } -static inline int fpu_fxrstor_checking(struct fpu *fpu) -{ - return fxrstor_checking(&fpu->state->fxsave); -} - static inline int fpu_restore_checking(struct fpu *fpu) { if (use_xsave()) - return fpu_xrstor_checking(fpu); + return fpu_xrstor_checking(&fpu->state->xsave); + else if (use_fxsr()) + return fxrstor_checking(&fpu->state->fxsave); else - return fpu_fxrstor_checking(fpu); + return frstor_checking(&fpu->state->fsave); } static inline int restore_fpu_checking(struct task_struct *tsk) @@ -310,15 +338,52 @@ static inline void __thread_set_has_fpu(struct task_struct *tsk) static inline void __thread_fpu_end(struct task_struct *tsk) { __thread_clear_has_fpu(tsk); - stts(); + if (!use_eager_fpu()) + stts(); } static inline void __thread_fpu_begin(struct task_struct *tsk) { - clts(); + if (!use_eager_fpu()) + clts(); __thread_set_has_fpu(tsk); } +static inline void __drop_fpu(struct task_struct *tsk) +{ + if (__thread_has_fpu(tsk)) { + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + __thread_fpu_end(tsk); + } +} + +static inline void drop_fpu(struct task_struct *tsk) +{ + /* + * Forget coprocessor state.. + */ + preempt_disable(); + tsk->fpu_counter = 0; + __drop_fpu(tsk); + clear_used_math(); + preempt_enable(); +} + +static inline void drop_init_fpu(struct task_struct *tsk) +{ + if (!use_eager_fpu()) + drop_fpu(tsk); + else { + if (use_xsave()) + xrstor_state(init_xstate_buf, -1); + else + fxrstor_checking(&init_xstate_buf->i387); + } +} + /* * FPU state switching for scheduling. * @@ -352,7 +417,12 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta { fpu_switch_t fpu; - fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; + /* + * If the task has used the math, pre-load the FPU on xsave processors + * or if the past 5 consecutive context-switches used math. + */ + fpu.preload = tsk_used_math(new) && (use_eager_fpu() || + new->fpu_counter > 5); if (__thread_has_fpu(old)) { if (!__save_init_fpu(old)) cpu = ~0; @@ -364,14 +434,14 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta new->fpu_counter++; __thread_set_has_fpu(new); prefetch(new->thread.fpu.state); - } else + } else if (!use_eager_fpu()) stts(); } else { old->fpu_counter = 0; old->thread.fpu.last_cpu = ~0; if (fpu.preload) { new->fpu_counter++; - if (fpu_lazy_restore(new, cpu)) + if (!use_eager_fpu() && fpu_lazy_restore(new, cpu)) fpu.preload = 0; else prefetch(new->thread.fpu.state); @@ -391,44 +461,40 @@ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) { if (fpu.preload) { if (unlikely(restore_fpu_checking(new))) - __thread_fpu_end(new); + drop_init_fpu(new); } } /* * Signal frame handlers... */ -extern int save_i387_xstate(void __user *buf); -extern int restore_i387_xstate(void __user *buf); +extern int save_xstate_sig(void __user *buf, void __user *fx, int size); +extern int __restore_xstate_sig(void __user *buf, void __user *fx, int size); -static inline void __clear_fpu(struct task_struct *tsk) +static inline int xstate_sigframe_size(void) { - if (__thread_has_fpu(tsk)) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - __thread_fpu_end(tsk); + return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; +} + +static inline int restore_xstate_sig(void __user *buf, int ia32_frame) +{ + void __user *buf_fx = buf; + int size = xstate_sigframe_size(); + + if (ia32_frame && use_fxsr()) { + buf_fx = buf + sizeof(struct i387_fsave_struct); + size += sizeof(struct i387_fsave_struct); } + + return __restore_xstate_sig(buf, buf_fx, size); } /* - * The actual user_fpu_begin/end() functions - * need to be preemption-safe. + * Need to be preemption-safe. * - * NOTE! user_fpu_end() must be used only after you - * have saved the FP state, and user_fpu_begin() must - * be used only immediately before restoring it. - * These functions do not do any save/restore on - * their own. + * NOTE! user_fpu_begin() must be used only immediately before restoring + * it. This function does not do any save/restore on their own. */ -static inline void user_fpu_end(void) -{ - preempt_disable(); - __thread_fpu_end(current); - preempt_enable(); -} - static inline void user_fpu_begin(void) { preempt_disable(); @@ -437,25 +503,32 @@ static inline void user_fpu_begin(void) preempt_enable(); } +static inline void __save_fpu(struct task_struct *tsk) +{ + if (use_xsave()) + xsave_state(&tsk->thread.fpu.state->xsave, -1); + else + fpu_fxsave(&tsk->thread.fpu); +} + /* * These disable preemption on their own and are safe */ static inline void save_init_fpu(struct task_struct *tsk) { WARN_ON_ONCE(!__thread_has_fpu(tsk)); + + if (use_eager_fpu()) { + __save_fpu(tsk); + return; + } + preempt_disable(); __save_init_fpu(tsk); __thread_fpu_end(tsk); preempt_enable(); } -static inline void clear_fpu(struct task_struct *tsk) -{ - preempt_disable(); - __clear_fpu(tsk); - preempt_enable(); -} - /* * i387 state interaction */ @@ -510,11 +583,34 @@ static inline void fpu_free(struct fpu *fpu) } } -static inline void fpu_copy(struct fpu *dst, struct fpu *src) +static inline void fpu_copy(struct task_struct *dst, struct task_struct *src) { - memcpy(dst->state, src->state, xstate_size); + if (use_eager_fpu()) { + memset(&dst->thread.fpu.state->xsave, 0, xstate_size); + __save_fpu(dst); + } else { + struct fpu *dfpu = &dst->thread.fpu; + struct fpu *sfpu = &src->thread.fpu; + + unlazy_fpu(src); + memcpy(dfpu->state, sfpu->state, xstate_size); + } } -extern void fpu_finit(struct fpu *fpu); +static inline unsigned long +alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, + unsigned long *size) +{ + unsigned long frame_size = xstate_sigframe_size(); + + *buf_fx = sp = round_down(sp - frame_size, 64); + if (ia32_frame && use_fxsr()) { + frame_size += sizeof(struct i387_fsave_struct); + sp -= sizeof(struct i387_fsave_struct); + } + + *size = frame_size; + return sp; +} #endif diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index b0767bc08740..9a25b522d377 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -3,38 +3,54 @@ #ifdef __ASSEMBLY__ - .macro MCOUNT_SAVE_FRAME - /* taken from glibc */ - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + /* skip is set if the stack was already partially adjusted */ + .macro MCOUNT_SAVE_FRAME skip=0 + /* + * We add enough stack to save all regs. + */ + subq $(SS+8-\skip), %rsp + movq %rax, RAX(%rsp) + movq %rcx, RCX(%rsp) + movq %rdx, RDX(%rsp) + movq %rsi, RSI(%rsp) + movq %rdi, RDI(%rsp) + movq %r8, R8(%rsp) + movq %r9, R9(%rsp) + /* Move RIP to its proper location */ + movq SS+8(%rsp), %rdx + movq %rdx, RIP(%rsp) .endm - .macro MCOUNT_RESTORE_FRAME - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + .macro MCOUNT_RESTORE_FRAME skip=0 + movq R9(%rsp), %r9 + movq R8(%rsp), %r8 + movq RDI(%rsp), %rdi + movq RSI(%rsp), %rsi + movq RDX(%rsp), %rdx + movq RCX(%rsp), %rcx + movq RAX(%rsp), %rax + addq $(SS+8-\skip), %rsp .endm #endif #ifdef CONFIG_FUNCTION_TRACER -#define MCOUNT_ADDR ((long)(mcount)) +#ifdef CC_USING_FENTRY +# define MCOUNT_ADDR ((long)(__fentry__)) +#else +# define MCOUNT_ADDR ((long)(mcount)) +#endif #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ +#ifdef CONFIG_DYNAMIC_FTRACE +#define ARCH_SUPPORTS_FTRACE_OPS 1 +#define ARCH_SUPPORTS_FTRACE_SAVE_REGS +#endif + #ifndef __ASSEMBLY__ extern void mcount(void); extern atomic_t modifying_ftrace_code; +extern void __fentry__(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) { diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index 71ecbcba1a4e..f373046e63ec 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -9,10 +9,13 @@ #include <asm/asm.h> #include <asm/errno.h> #include <asm/processor.h> +#include <asm/smap.h> #define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \ - asm volatile("1:\t" insn "\n" \ - "2:\t.section .fixup,\"ax\"\n" \ + asm volatile("\t" ASM_STAC "\n" \ + "1:\t" insn "\n" \ + "2:\t" ASM_CLAC "\n" \ + "\t.section .fixup,\"ax\"\n" \ "3:\tmov\t%3, %1\n" \ "\tjmp\t2b\n" \ "\t.previous\n" \ @@ -21,12 +24,14 @@ : "i" (-EFAULT), "0" (oparg), "1" (0)) #define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \ - asm volatile("1:\tmovl %2, %0\n" \ + asm volatile("\t" ASM_STAC "\n" \ + "1:\tmovl %2, %0\n" \ "\tmovl\t%0, %3\n" \ "\t" insn "\n" \ "2:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" \ "\tjnz\t1b\n" \ - "3:\t.section .fixup,\"ax\"\n" \ + "3:\t" ASM_CLAC "\n" \ + "\t.section .fixup,\"ax\"\n" \ "4:\tmov\t%5, %1\n" \ "\tjmp\t3b\n" \ "\t.previous\n" \ @@ -122,8 +127,10 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; - asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" - "2:\t.section .fixup, \"ax\"\n" + asm volatile("\t" ASM_STAC "\n" + "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" + "2:\t" ASM_CLAC "\n" + "\t.section .fixup, \"ax\"\n" "3:\tmov %3, %0\n" "\tjmp 2b\n" "\t.previous\n" diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index d3895dbf4ddb..81f04cee5f74 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -18,6 +18,10 @@ typedef struct { #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; + /* + * irq_tlb_count is double-counted in irq_call_count, so it must be + * subtracted from irq_call_count when displaying irq_call_count + */ unsigned int irq_tlb_count; #endif #ifdef CONFIG_X86_THERMAL_VECTOR diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 2c392d663dce..434e2106cc87 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -35,8 +35,6 @@ #define HPET_ID_NUMBER_SHIFT 8 #define HPET_ID_VENDOR_SHIFT 16 -#define HPET_ID_VENDOR_8086 0x8086 - #define HPET_CFG_ENABLE 0x001 #define HPET_CFG_LEGACY 0x002 #define HPET_LEGACY_8254 2 diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 257d9cca214f..ed8089d69094 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -19,12 +19,37 @@ struct pt_regs; struct user_i387_struct; extern int init_fpu(struct task_struct *child); +extern void fpu_finit(struct fpu *fpu); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern void math_state_restore(void); extern bool irq_fpu_usable(void); -extern void kernel_fpu_begin(void); -extern void kernel_fpu_end(void); + +/* + * Careful: __kernel_fpu_begin/end() must be called with preempt disabled + * and they don't touch the preempt state on their own. + * If you enable preemption after __kernel_fpu_begin(), preempt notifier + * should call the __kernel_fpu_end() to prevent the kernel/user FPU + * state from getting corrupted. KVM for example uses this model. + * + * All other cases use kernel_fpu_begin/end() which disable preemption + * during kernel FPU usage. + */ +extern void __kernel_fpu_begin(void); +extern void __kernel_fpu_end(void); + +static inline void kernel_fpu_begin(void) +{ + WARN_ON_ONCE(!irq_fpu_usable()); + preempt_disable(); + __kernel_fpu_begin(); +} + +static inline void kernel_fpu_end(void) +{ + __kernel_fpu_end(); + preempt_enable(); +} /* * Some instructions like VIA's padlock instructions generate a spurious diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h index f229b13a5f30..f42a04735a0a 100644 --- a/arch/x86/include/asm/iommu_table.h +++ b/arch/x86/include/asm/iommu_table.h @@ -48,7 +48,7 @@ struct iommu_table_entry { #define __IOMMU_INIT(_detect, _depend, _early_init, _late_init, _finish)\ - static const struct iommu_table_entry const \ + static const struct iommu_table_entry \ __iommu_entry_##_detect __used \ __attribute__ ((unused, __section__(".iommu_table"), \ aligned((sizeof(void *))))) \ @@ -63,10 +63,10 @@ struct iommu_table_entry { * to stop detecting the other IOMMUs after yours has been detected. */ #define IOMMU_INIT_POST(_detect) \ - __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 0) + __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, NULL, NULL, 0) #define IOMMU_INIT_POST_FINISH(detect) \ - __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 1) + __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, NULL, NULL, 1) /* * A more sophisticated version of IOMMU_INIT. This variant requires: diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 547882539157..d3ddd17405d0 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -27,6 +27,7 @@ #include <asm/insn.h> #define __ARCH_WANT_KPROBES_INSN_SLOT +#define ARCH_SUPPORTS_KPROBES_ON_FTRACE struct pt_regs; struct kprobe; diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 246617efd67f..41e08cb6a092 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -9,6 +9,22 @@ #include <linux/types.h> #include <linux/ioctl.h> +#define DE_VECTOR 0 +#define DB_VECTOR 1 +#define BP_VECTOR 3 +#define OF_VECTOR 4 +#define BR_VECTOR 5 +#define UD_VECTOR 6 +#define NM_VECTOR 7 +#define DF_VECTOR 8 +#define TS_VECTOR 10 +#define NP_VECTOR 11 +#define SS_VECTOR 12 +#define GP_VECTOR 13 +#define PF_VECTOR 14 +#define MF_VECTOR 16 +#define MC_VECTOR 18 + /* Select x86 specific features in <linux/kvm.h> */ #define __KVM_HAVE_PIT #define __KVM_HAVE_IOAPIC diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 09155d64cf7e..1eaa6b056670 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -75,22 +75,6 @@ #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) -#define DE_VECTOR 0 -#define DB_VECTOR 1 -#define BP_VECTOR 3 -#define OF_VECTOR 4 -#define BR_VECTOR 5 -#define UD_VECTOR 6 -#define NM_VECTOR 7 -#define DF_VECTOR 8 -#define TS_VECTOR 10 -#define NP_VECTOR 11 -#define SS_VECTOR 12 -#define GP_VECTOR 13 -#define PF_VECTOR 14 -#define MF_VECTOR 16 -#define MC_VECTOR 18 - #define SELECTOR_TI_MASK (1 << 2) #define SELECTOR_RPL_MASK 0x03 diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index a3ac52b29cbf..54d73b1f00a0 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -116,19 +116,9 @@ struct mce_log { /* Software defined banks */ #define MCE_EXTENDED_BANK 128 #define MCE_THERMAL_BANK MCE_EXTENDED_BANK + 0 - -#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */ -#define K8_MCE_THRESHOLD_BANK_0 (MCE_THRESHOLD_BASE + 0 * 9) -#define K8_MCE_THRESHOLD_BANK_1 (MCE_THRESHOLD_BASE + 1 * 9) -#define K8_MCE_THRESHOLD_BANK_2 (MCE_THRESHOLD_BASE + 2 * 9) -#define K8_MCE_THRESHOLD_BANK_3 (MCE_THRESHOLD_BASE + 3 * 9) -#define K8_MCE_THRESHOLD_BANK_4 (MCE_THRESHOLD_BASE + 4 * 9) -#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) -#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) - +#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) #ifdef __KERNEL__ - extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); @@ -171,6 +161,7 @@ DECLARE_PER_CPU(struct device *, mce_device); #ifdef CONFIG_X86_MCE_INTEL extern int mce_cmci_disabled; extern int mce_ignore_ce; +extern int mce_bios_cmci_threshold; void mce_intel_feature_init(struct cpuinfo_x86 *c); void cmci_clear(void); void cmci_reenable(void); diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 4ebe157bf73d..43d921b4752c 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -15,8 +15,8 @@ struct microcode_ops { enum ucode_state (*request_microcode_user) (int cpu, const void __user *buf, size_t size); - enum ucode_state (*request_microcode_fw) (int cpu, - struct device *device); + enum ucode_state (*request_microcode_fw) (int cpu, struct device *, + bool refresh_fw); void (*microcode_fini_cpu) (int cpu); @@ -49,12 +49,6 @@ static inline struct microcode_ops * __init init_intel_microcode(void) #ifdef CONFIG_MICROCODE_AMD extern struct microcode_ops * __init init_amd_microcode(void); extern void __exit exit_amd_microcode(void); - -static inline void get_ucode_data(void *to, const u8 *from, size_t n) -{ - memcpy(to, from, n); -} - #else static inline struct microcode_ops * __init init_amd_microcode(void) { diff --git a/arch/x86/include/asm/mmzone.h b/arch/x86/include/asm/mmzone.h index 64217ea16a36..d497bc425cae 100644 --- a/arch/x86/include/asm/mmzone.h +++ b/arch/x86/include/asm/mmzone.h @@ -1,5 +1,5 @@ #ifdef CONFIG_X86_32 -# include "mmzone_32.h" +# include <asm/mmzone_32.h> #else -# include "mmzone_64.h" +# include <asm/mmzone_64.h> #endif diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 957ec87385af..fbee9714d9ab 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -248,6 +248,9 @@ #define MSR_IA32_PERF_STATUS 0x00000198 #define MSR_IA32_PERF_CTL 0x00000199 +#define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 +#define MSR_AMD_PERF_STATUS 0xc0010063 +#define MSR_AMD_PERF_CTL 0xc0010062 #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 diff --git a/arch/x86/include/asm/mutex.h b/arch/x86/include/asm/mutex.h index a731b9c573a6..7d3a48275394 100644 --- a/arch/x86/include/asm/mutex.h +++ b/arch/x86/include/asm/mutex.h @@ -1,5 +1,5 @@ #ifdef CONFIG_X86_32 -# include "mutex_32.h" +# include <asm/mutex_32.h> #else -# include "mutex_64.h" +# include <asm/mutex_64.h> #endif diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index bfacd2ccf651..49119fcea2dc 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -53,9 +53,9 @@ static inline int numa_cpu_node(int cpu) #endif /* CONFIG_NUMA */ #ifdef CONFIG_X86_32 -# include "numa_32.h" +# include <asm/numa_32.h> #else -# include "numa_64.h" +# include <asm/numa_64.h> #endif #ifdef CONFIG_NUMA diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index df75d07571ce..6e41b9343928 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -141,7 +141,7 @@ void default_restore_msi_irqs(struct pci_dev *dev, int irq); #endif /* __KERNEL__ */ #ifdef CONFIG_X86_64 -#include "pci_64.h" +#include <asm/pci_64.h> #endif /* implement the pci_ DMA API in terms of the generic device dma_ one */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index cb4e43bce98a..4fabcdf1cfa7 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -262,4 +262,6 @@ static inline void perf_check_microcode(void) { } static inline void amd_pmu_disable_virt(void) { } #endif +#define arch_perf_out_copy_user copy_from_user_nmi + #endif /* _ASM_X86_PERF_EVENT_H */ diff --git a/arch/x86/include/asm/perf_regs.h b/arch/x86/include/asm/perf_regs.h new file mode 100644 index 000000000000..3f2207bfd17b --- /dev/null +++ b/arch/x86/include/asm/perf_regs.h @@ -0,0 +1,33 @@ +#ifndef _ASM_X86_PERF_REGS_H +#define _ASM_X86_PERF_REGS_H + +enum perf_event_x86_regs { + PERF_REG_X86_AX, + PERF_REG_X86_BX, + PERF_REG_X86_CX, + PERF_REG_X86_DX, + PERF_REG_X86_SI, + PERF_REG_X86_DI, + PERF_REG_X86_BP, + PERF_REG_X86_SP, + PERF_REG_X86_IP, + PERF_REG_X86_FLAGS, + PERF_REG_X86_CS, + PERF_REG_X86_SS, + PERF_REG_X86_DS, + PERF_REG_X86_ES, + PERF_REG_X86_FS, + PERF_REG_X86_GS, + PERF_REG_X86_R8, + PERF_REG_X86_R9, + PERF_REG_X86_R10, + PERF_REG_X86_R11, + PERF_REG_X86_R12, + PERF_REG_X86_R13, + PERF_REG_X86_R14, + PERF_REG_X86_R15, + + PERF_REG_X86_32_MAX = PERF_REG_X86_GS + 1, + PERF_REG_X86_64_MAX = PERF_REG_X86_R15 + 1, +}; +#endif /* _ASM_X86_PERF_REGS_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 49afb3f41eb6..fc9948465293 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -384,9 +384,9 @@ pte_t *populate_extra_pte(unsigned long vaddr); #endif /* __ASSEMBLY__ */ #ifdef CONFIG_X86_32 -# include "pgtable_32.h" +# include <asm/pgtable_32.h> #else -# include "pgtable_64.h" +# include <asm/pgtable_64.h> #endif #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 013286a10c2c..ec8a1fc9505d 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -174,9 +174,9 @@ #endif #ifdef CONFIG_X86_32 -# include "pgtable_32_types.h" +# include <asm/pgtable_32_types.h> #else -# include "pgtable_64_types.h" +# include <asm/pgtable_64_types.h> #endif #ifndef __ASSEMBLY__ @@ -303,11 +303,9 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte); extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 -extern void native_pagetable_setup_start(pgd_t *base); -extern void native_pagetable_setup_done(pgd_t *base); +extern void native_pagetable_init(void); #else -#define native_pagetable_setup_start x86_init_pgd_noop -#define native_pagetable_setup_done x86_init_pgd_noop +#define native_pagetable_init paging_init #endif struct seq_file; diff --git a/arch/x86/include/asm/posix_types.h b/arch/x86/include/asm/posix_types.h index 7ef7c3020e5c..bad3665c25fc 100644 --- a/arch/x86/include/asm/posix_types.h +++ b/arch/x86/include/asm/posix_types.h @@ -1,15 +1,15 @@ #ifdef __KERNEL__ # ifdef CONFIG_X86_32 -# include "posix_types_32.h" +# include <asm/posix_types_32.h> # else -# include "posix_types_64.h" +# include <asm/posix_types_64.h> # endif #else # ifdef __i386__ -# include "posix_types_32.h" +# include <asm/posix_types_32.h> # elif defined(__ILP32__) -# include "posix_types_x32.h" +# include <asm/posix_types_x32.h> # else -# include "posix_types_64.h" +# include <asm/posix_types_64.h> # endif #endif diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index aea1d1d848c7..680cf09ed100 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -65,6 +65,7 @@ #define X86_CR4_PCIDE 0x00020000 /* enable PCID support */ #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ #define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ +#define X86_CR4_SMAP 0x00200000 /* enable SMAP support */ /* * x86-64 Task Priority Register, CR8 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d048cad9bcad..b98c0d958ebb 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -423,7 +423,6 @@ DECLARE_INIT_PER_CPU(irq_stack_union); DECLARE_PER_CPU(char *, irq_stack_ptr); DECLARE_PER_CPU(unsigned int, irq_count); -extern unsigned long kernel_eflags; extern asmlinkage void ignore_sysret(void); #else /* X86_64 */ #ifdef CONFIG_CC_STACKPROTECTOR @@ -759,6 +758,8 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); } +extern void set_task_blockstep(struct task_struct *task, bool on); + /* * from system description table in BIOS. Mostly for MCA use, but * others may find it useful: diff --git a/arch/x86/include/asm/rcu.h b/arch/x86/include/asm/rcu.h new file mode 100644 index 000000000000..d1ac07a23979 --- /dev/null +++ b/arch/x86/include/asm/rcu.h @@ -0,0 +1,32 @@ +#ifndef _ASM_X86_RCU_H +#define _ASM_X86_RCU_H + +#ifndef __ASSEMBLY__ + +#include <linux/rcupdate.h> +#include <asm/ptrace.h> + +static inline void exception_enter(struct pt_regs *regs) +{ + rcu_user_exit(); +} + +static inline void exception_exit(struct pt_regs *regs) +{ +#ifdef CONFIG_RCU_USER_QS + if (user_mode(regs)) + rcu_user_enter(); +#endif +} + +#else /* __ASSEMBLY__ */ + +#ifdef CONFIG_RCU_USER_QS +# define SCHEDULE_USER call schedule_user +#else +# define SCHEDULE_USER call schedule +#endif + +#endif /* !__ASSEMBLY__ */ + +#endif diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h index c62e58a5a90d..0f3d7f099224 100644 --- a/arch/x86/include/asm/seccomp.h +++ b/arch/x86/include/asm/seccomp.h @@ -1,5 +1,5 @@ #ifdef CONFIG_X86_32 -# include "seccomp_32.h" +# include <asm/seccomp_32.h> #else -# include "seccomp_64.h" +# include <asm/seccomp_64.h> #endif diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 598457cbd0f8..323973f4abf1 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -31,6 +31,10 @@ typedef struct { unsigned long sig[_NSIG_WORDS]; } sigset_t; +#ifndef CONFIG_COMPAT +typedef sigset_t compat_sigset_t; +#endif + #else /* Here we must cater to libcs that poke about in kernel headers. */ diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h new file mode 100644 index 000000000000..8d3120f4e270 --- /dev/null +++ b/arch/x86/include/asm/smap.h @@ -0,0 +1,91 @@ +/* + * Supervisor Mode Access Prevention support + * + * Copyright (C) 2012 Intel Corporation + * Author: H. Peter Anvin <hpa@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#ifndef _ASM_X86_SMAP_H +#define _ASM_X86_SMAP_H + +#include <linux/stringify.h> +#include <asm/nops.h> +#include <asm/cpufeature.h> + +/* "Raw" instruction opcodes */ +#define __ASM_CLAC .byte 0x0f,0x01,0xca +#define __ASM_STAC .byte 0x0f,0x01,0xcb + +#ifdef __ASSEMBLY__ + +#include <asm/alternative-asm.h> + +#ifdef CONFIG_X86_SMAP + +#define ASM_CLAC \ + 661: ASM_NOP3 ; \ + .pushsection .altinstr_replacement, "ax" ; \ + 662: __ASM_CLAC ; \ + .popsection ; \ + .pushsection .altinstructions, "a" ; \ + altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ + .popsection + +#define ASM_STAC \ + 661: ASM_NOP3 ; \ + .pushsection .altinstr_replacement, "ax" ; \ + 662: __ASM_STAC ; \ + .popsection ; \ + .pushsection .altinstructions, "a" ; \ + altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ + .popsection + +#else /* CONFIG_X86_SMAP */ + +#define ASM_CLAC +#define ASM_STAC + +#endif /* CONFIG_X86_SMAP */ + +#else /* __ASSEMBLY__ */ + +#include <asm/alternative.h> + +#ifdef CONFIG_X86_SMAP + +static __always_inline void clac(void) +{ + /* Note: a barrier is implicit in alternative() */ + alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP); +} + +static __always_inline void stac(void) +{ + /* Note: a barrier is implicit in alternative() */ + alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP); +} + +/* These macros can be used in asm() statements */ +#define ASM_CLAC \ + ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP) +#define ASM_STAC \ + ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP) + +#else /* CONFIG_X86_SMAP */ + +static inline void clac(void) { } +static inline void stac(void) { } + +#define ASM_CLAC +#define ASM_STAC + +#endif /* CONFIG_X86_SMAP */ + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_SMAP_H */ diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h index 6dfd6d9373a0..09224d7a5862 100644 --- a/arch/x86/include/asm/string.h +++ b/arch/x86/include/asm/string.h @@ -1,5 +1,5 @@ #ifdef CONFIG_X86_32 -# include "string_32.h" +# include <asm/string_32.h> #else -# include "string_64.h" +# include <asm/string_64.h> #endif diff --git a/arch/x86/include/asm/suspend.h b/arch/x86/include/asm/suspend.h index 9bd521fe4570..2fab6c2c3575 100644 --- a/arch/x86/include/asm/suspend.h +++ b/arch/x86/include/asm/suspend.h @@ -1,5 +1,5 @@ #ifdef CONFIG_X86_32 -# include "suspend_32.h" +# include <asm/suspend_32.h> #else -# include "suspend_64.h" +# include <asm/suspend_64.h> #endif diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index f2b83bc7d784..cdf5674dd23a 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -1,6 +1,135 @@ #ifndef __SVM_H #define __SVM_H +#define SVM_EXIT_READ_CR0 0x000 +#define SVM_EXIT_READ_CR3 0x003 +#define SVM_EXIT_READ_CR4 0x004 +#define SVM_EXIT_READ_CR8 0x008 +#define SVM_EXIT_WRITE_CR0 0x010 +#define SVM_EXIT_WRITE_CR3 0x013 +#define SVM_EXIT_WRITE_CR4 0x014 +#define SVM_EXIT_WRITE_CR8 0x018 +#define SVM_EXIT_READ_DR0 0x020 +#define SVM_EXIT_READ_DR1 0x021 +#define SVM_EXIT_READ_DR2 0x022 +#define SVM_EXIT_READ_DR3 0x023 +#define SVM_EXIT_READ_DR4 0x024 +#define SVM_EXIT_READ_DR5 0x025 +#define SVM_EXIT_READ_DR6 0x026 +#define SVM_EXIT_READ_DR7 0x027 +#define SVM_EXIT_WRITE_DR0 0x030 +#define SVM_EXIT_WRITE_DR1 0x031 +#define SVM_EXIT_WRITE_DR2 0x032 +#define SVM_EXIT_WRITE_DR3 0x033 +#define SVM_EXIT_WRITE_DR4 0x034 +#define SVM_EXIT_WRITE_DR5 0x035 +#define SVM_EXIT_WRITE_DR6 0x036 +#define SVM_EXIT_WRITE_DR7 0x037 +#define SVM_EXIT_EXCP_BASE 0x040 +#define SVM_EXIT_INTR 0x060 +#define SVM_EXIT_NMI 0x061 +#define SVM_EXIT_SMI 0x062 +#define SVM_EXIT_INIT 0x063 +#define SVM_EXIT_VINTR 0x064 +#define SVM_EXIT_CR0_SEL_WRITE 0x065 +#define SVM_EXIT_IDTR_READ 0x066 +#define SVM_EXIT_GDTR_READ 0x067 +#define SVM_EXIT_LDTR_READ 0x068 +#define SVM_EXIT_TR_READ 0x069 +#define SVM_EXIT_IDTR_WRITE 0x06a +#define SVM_EXIT_GDTR_WRITE 0x06b +#define SVM_EXIT_LDTR_WRITE 0x06c +#define SVM_EXIT_TR_WRITE 0x06d +#define SVM_EXIT_RDTSC 0x06e +#define SVM_EXIT_RDPMC 0x06f +#define SVM_EXIT_PUSHF 0x070 +#define SVM_EXIT_POPF 0x071 +#define SVM_EXIT_CPUID 0x072 +#define SVM_EXIT_RSM 0x073 +#define SVM_EXIT_IRET 0x074 +#define SVM_EXIT_SWINT 0x075 +#define SVM_EXIT_INVD 0x076 +#define SVM_EXIT_PAUSE 0x077 +#define SVM_EXIT_HLT 0x078 +#define SVM_EXIT_INVLPG 0x079 +#define SVM_EXIT_INVLPGA 0x07a +#define SVM_EXIT_IOIO 0x07b +#define SVM_EXIT_MSR 0x07c +#define SVM_EXIT_TASK_SWITCH 0x07d +#define SVM_EXIT_FERR_FREEZE 0x07e +#define SVM_EXIT_SHUTDOWN 0x07f +#define SVM_EXIT_VMRUN 0x080 +#define SVM_EXIT_VMMCALL 0x081 +#define SVM_EXIT_VMLOAD 0x082 +#define SVM_EXIT_VMSAVE 0x083 +#define SVM_EXIT_STGI 0x084 +#define SVM_EXIT_CLGI 0x085 +#define SVM_EXIT_SKINIT 0x086 +#define SVM_EXIT_RDTSCP 0x087 +#define SVM_EXIT_ICEBP 0x088 +#define SVM_EXIT_WBINVD 0x089 +#define SVM_EXIT_MONITOR 0x08a +#define SVM_EXIT_MWAIT 0x08b +#define SVM_EXIT_MWAIT_COND 0x08c +#define SVM_EXIT_XSETBV 0x08d +#define SVM_EXIT_NPF 0x400 + +#define SVM_EXIT_ERR -1 + +#define SVM_EXIT_REASONS \ + { SVM_EXIT_READ_CR0, "read_cr0" }, \ + { SVM_EXIT_READ_CR3, "read_cr3" }, \ + { SVM_EXIT_READ_CR4, "read_cr4" }, \ + { SVM_EXIT_READ_CR8, "read_cr8" }, \ + { SVM_EXIT_WRITE_CR0, "write_cr0" }, \ + { SVM_EXIT_WRITE_CR3, "write_cr3" }, \ + { SVM_EXIT_WRITE_CR4, "write_cr4" }, \ + { SVM_EXIT_WRITE_CR8, "write_cr8" }, \ + { SVM_EXIT_READ_DR0, "read_dr0" }, \ + { SVM_EXIT_READ_DR1, "read_dr1" }, \ + { SVM_EXIT_READ_DR2, "read_dr2" }, \ + { SVM_EXIT_READ_DR3, "read_dr3" }, \ + { SVM_EXIT_WRITE_DR0, "write_dr0" }, \ + { SVM_EXIT_WRITE_DR1, "write_dr1" }, \ + { SVM_EXIT_WRITE_DR2, "write_dr2" }, \ + { SVM_EXIT_WRITE_DR3, "write_dr3" }, \ + { SVM_EXIT_WRITE_DR5, "write_dr5" }, \ + { SVM_EXIT_WRITE_DR7, "write_dr7" }, \ + { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \ + { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \ + { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ + { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ + { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ + { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ + { SVM_EXIT_INTR, "interrupt" }, \ + { SVM_EXIT_NMI, "nmi" }, \ + { SVM_EXIT_SMI, "smi" }, \ + { SVM_EXIT_INIT, "init" }, \ + { SVM_EXIT_VINTR, "vintr" }, \ + { SVM_EXIT_CPUID, "cpuid" }, \ + { SVM_EXIT_INVD, "invd" }, \ + { SVM_EXIT_HLT, "hlt" }, \ + { SVM_EXIT_INVLPG, "invlpg" }, \ + { SVM_EXIT_INVLPGA, "invlpga" }, \ + { SVM_EXIT_IOIO, "io" }, \ + { SVM_EXIT_MSR, "msr" }, \ + { SVM_EXIT_TASK_SWITCH, "task_switch" }, \ + { SVM_EXIT_SHUTDOWN, "shutdown" }, \ + { SVM_EXIT_VMRUN, "vmrun" }, \ + { SVM_EXIT_VMMCALL, "hypercall" }, \ + { SVM_EXIT_VMLOAD, "vmload" }, \ + { SVM_EXIT_VMSAVE, "vmsave" }, \ + { SVM_EXIT_STGI, "stgi" }, \ + { SVM_EXIT_CLGI, "clgi" }, \ + { SVM_EXIT_SKINIT, "skinit" }, \ + { SVM_EXIT_WBINVD, "wbinvd" }, \ + { SVM_EXIT_MONITOR, "monitor" }, \ + { SVM_EXIT_MWAIT, "mwait" }, \ + { SVM_EXIT_XSETBV, "xsetbv" }, \ + { SVM_EXIT_NPF, "npf" } + +#ifdef __KERNEL__ + enum { INTERCEPT_INTR, INTERCEPT_NMI, @@ -264,81 +393,6 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_EXITINFO_REG_MASK 0x0F -#define SVM_EXIT_READ_CR0 0x000 -#define SVM_EXIT_READ_CR3 0x003 -#define SVM_EXIT_READ_CR4 0x004 -#define SVM_EXIT_READ_CR8 0x008 -#define SVM_EXIT_WRITE_CR0 0x010 -#define SVM_EXIT_WRITE_CR3 0x013 -#define SVM_EXIT_WRITE_CR4 0x014 -#define SVM_EXIT_WRITE_CR8 0x018 -#define SVM_EXIT_READ_DR0 0x020 -#define SVM_EXIT_READ_DR1 0x021 -#define SVM_EXIT_READ_DR2 0x022 -#define SVM_EXIT_READ_DR3 0x023 -#define SVM_EXIT_READ_DR4 0x024 -#define SVM_EXIT_READ_DR5 0x025 -#define SVM_EXIT_READ_DR6 0x026 -#define SVM_EXIT_READ_DR7 0x027 -#define SVM_EXIT_WRITE_DR0 0x030 -#define SVM_EXIT_WRITE_DR1 0x031 -#define SVM_EXIT_WRITE_DR2 0x032 -#define SVM_EXIT_WRITE_DR3 0x033 -#define SVM_EXIT_WRITE_DR4 0x034 -#define SVM_EXIT_WRITE_DR5 0x035 -#define SVM_EXIT_WRITE_DR6 0x036 -#define SVM_EXIT_WRITE_DR7 0x037 -#define SVM_EXIT_EXCP_BASE 0x040 -#define SVM_EXIT_INTR 0x060 -#define SVM_EXIT_NMI 0x061 -#define SVM_EXIT_SMI 0x062 -#define SVM_EXIT_INIT 0x063 -#define SVM_EXIT_VINTR 0x064 -#define SVM_EXIT_CR0_SEL_WRITE 0x065 -#define SVM_EXIT_IDTR_READ 0x066 -#define SVM_EXIT_GDTR_READ 0x067 -#define SVM_EXIT_LDTR_READ 0x068 -#define SVM_EXIT_TR_READ 0x069 -#define SVM_EXIT_IDTR_WRITE 0x06a -#define SVM_EXIT_GDTR_WRITE 0x06b -#define SVM_EXIT_LDTR_WRITE 0x06c -#define SVM_EXIT_TR_WRITE 0x06d -#define SVM_EXIT_RDTSC 0x06e -#define SVM_EXIT_RDPMC 0x06f -#define SVM_EXIT_PUSHF 0x070 -#define SVM_EXIT_POPF 0x071 -#define SVM_EXIT_CPUID 0x072 -#define SVM_EXIT_RSM 0x073 -#define SVM_EXIT_IRET 0x074 -#define SVM_EXIT_SWINT 0x075 -#define SVM_EXIT_INVD 0x076 -#define SVM_EXIT_PAUSE 0x077 -#define SVM_EXIT_HLT 0x078 -#define SVM_EXIT_INVLPG 0x079 -#define SVM_EXIT_INVLPGA 0x07a -#define SVM_EXIT_IOIO 0x07b -#define SVM_EXIT_MSR 0x07c -#define SVM_EXIT_TASK_SWITCH 0x07d -#define SVM_EXIT_FERR_FREEZE 0x07e -#define SVM_EXIT_SHUTDOWN 0x07f -#define SVM_EXIT_VMRUN 0x080 -#define SVM_EXIT_VMMCALL 0x081 -#define SVM_EXIT_VMLOAD 0x082 -#define SVM_EXIT_VMSAVE 0x083 -#define SVM_EXIT_STGI 0x084 -#define SVM_EXIT_CLGI 0x085 -#define SVM_EXIT_SKINIT 0x086 -#define SVM_EXIT_RDTSCP 0x087 -#define SVM_EXIT_ICEBP 0x088 -#define SVM_EXIT_WBINVD 0x089 -#define SVM_EXIT_MONITOR 0x08a -#define SVM_EXIT_MWAIT 0x08b -#define SVM_EXIT_MWAIT_COND 0x08c -#define SVM_EXIT_XSETBV 0x08d -#define SVM_EXIT_NPF 0x400 - -#define SVM_EXIT_ERR -1 - #define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" @@ -350,3 +404,4 @@ struct __attribute__ ((__packed__)) vmcb { #endif +#endif diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 3fda9db48819..4ca1c611b552 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -40,7 +40,7 @@ asmlinkage long sys32_sigaction(int, struct old_sigaction32 __user *, struct old_sigaction32 __user *); asmlinkage long sys32_alarm(unsigned int); -asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int); +asmlinkage long sys32_waitpid(compat_pid_t, unsigned int __user *, int); asmlinkage long sys32_sysfs(int, u32, u32); asmlinkage long sys32_sched_rr_get_interval(compat_pid_t, diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 89f794f007ec..c535d847e3b5 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -89,6 +89,7 @@ struct thread_info { #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ #define TIF_FORK 18 /* ret_from_fork */ +#define TIF_NOHZ 19 /* in adaptive nohz mode */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ @@ -114,6 +115,7 @@ struct thread_info { #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) +#define _TIF_NOHZ (1 << TIF_NOHZ) #define _TIF_DEBUG (1 << TIF_DEBUG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) @@ -126,12 +128,13 @@ struct thread_info { /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ - _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) + _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_NOHZ) /* work to do in syscall_trace_leave() */ #define _TIF_WORK_SYSCALL_EXIT \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ - _TIF_SYSCALL_TRACEPOINT) + _TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ @@ -141,7 +144,8 @@ struct thread_info { /* work to do on any return to user space */ #define _TIF_ALLWORK_MASK \ - ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT) + ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_NOHZ) /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index e1f3a17034fc..7ccf8d131535 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -9,6 +9,7 @@ #include <linux/string.h> #include <asm/asm.h> #include <asm/page.h> +#include <asm/smap.h> #define VERIFY_READ 0 #define VERIFY_WRITE 1 @@ -192,9 +193,10 @@ extern int __get_user_bad(void); #ifdef CONFIG_X86_32 #define __put_user_asm_u64(x, addr, err, errret) \ - asm volatile("1: movl %%eax,0(%2)\n" \ + asm volatile(ASM_STAC "\n" \ + "1: movl %%eax,0(%2)\n" \ "2: movl %%edx,4(%2)\n" \ - "3:\n" \ + "3: " ASM_CLAC "\n" \ ".section .fixup,\"ax\"\n" \ "4: movl %3,%0\n" \ " jmp 3b\n" \ @@ -205,9 +207,10 @@ extern int __get_user_bad(void); : "A" (x), "r" (addr), "i" (errret), "0" (err)) #define __put_user_asm_ex_u64(x, addr) \ - asm volatile("1: movl %%eax,0(%1)\n" \ + asm volatile(ASM_STAC "\n" \ + "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ - "3:\n" \ + "3: " ASM_CLAC "\n" \ _ASM_EXTABLE_EX(1b, 2b) \ _ASM_EXTABLE_EX(2b, 3b) \ : : "A" (x), "r" (addr)) @@ -379,8 +382,9 @@ do { \ } while (0) #define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile("1: mov"itype" %2,%"rtype"1\n" \ - "2:\n" \ + asm volatile(ASM_STAC "\n" \ + "1: mov"itype" %2,%"rtype"1\n" \ + "2: " ASM_CLAC "\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " xor"itype" %"rtype"1,%"rtype"1\n" \ @@ -443,8 +447,9 @@ struct __large_struct { unsigned long buf[100]; }; * aliasing issues. */ #define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile("1: mov"itype" %"rtype"1,%2\n" \ - "2:\n" \ + asm volatile(ASM_STAC "\n" \ + "1: mov"itype" %"rtype"1,%2\n" \ + "2: " ASM_CLAC "\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " jmp 2b\n" \ @@ -463,13 +468,13 @@ struct __large_struct { unsigned long buf[100]; }; * uaccess_try and catch */ #define uaccess_try do { \ - int prev_err = current_thread_info()->uaccess_err; \ current_thread_info()->uaccess_err = 0; \ + stac(); \ barrier(); #define uaccess_catch(err) \ + clac(); \ (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \ - current_thread_info()->uaccess_err = prev_err; \ } while (0) /** @@ -569,6 +574,9 @@ strncpy_from_user(char *dst, const char __user *src, long count); extern __must_check long strlen_user(const char __user *str); extern __must_check long strnlen_user(const char __user *str, long n); +unsigned long __must_check clear_user(void __user *mem, unsigned long len); +unsigned long __must_check __clear_user(void __user *mem, unsigned long len); + /* * movsl can be slow when source and dest are not both 8-byte aligned */ @@ -581,9 +589,9 @@ extern struct movsl_mask { #define ARCH_HAS_NOCACHE_UACCESS 1 #ifdef CONFIG_X86_32 -# include "uaccess_32.h" +# include <asm/uaccess_32.h> #else -# include "uaccess_64.h" +# include <asm/uaccess_64.h> #endif #endif /* _ASM_X86_UACCESS_H */ diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 576e39bca6ad..7f760a9f1f61 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -213,7 +213,4 @@ static inline unsigned long __must_check copy_from_user(void *to, return n; } -unsigned long __must_check clear_user(void __user *mem, unsigned long len); -unsigned long __must_check __clear_user(void __user *mem, unsigned long len); - #endif /* _ASM_X86_UACCESS_32_H */ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index d8def8b3dba0..142810c457dc 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -217,9 +217,6 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) } } -__must_check unsigned long clear_user(void __user *mem, unsigned long len); -__must_check unsigned long __clear_user(void __user *mem, unsigned long len); - static __must_check __always_inline int __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size) { diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index f3971bbcd1de..8ff8be7835ab 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -42,10 +42,11 @@ struct arch_uprobe { }; struct arch_uprobe_task { - unsigned long saved_trap_nr; #ifdef CONFIG_X86_64 unsigned long saved_scratch_register; #endif + unsigned int saved_trap_nr; + unsigned int saved_tf; }; extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr); diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h index 24532c7da3d6..ccab4af1646d 100644 --- a/arch/x86/include/asm/user.h +++ b/arch/x86/include/asm/user.h @@ -2,9 +2,9 @@ #define _ASM_X86_USER_H #ifdef CONFIG_X86_32 -# include "user_32.h" +# include <asm/user_32.h> #else -# include "user_64.h" +# include <asm/user_64.h> #endif #include <asm/types.h> diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index bb0522850b74..fddb53d63915 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -11,7 +11,8 @@ extern const char VDSO32_PRELINK[]; #define VDSO32_SYMBOL(base, name) \ ({ \ extern const char VDSO32_##name[]; \ - (void *)(VDSO32_##name - VDSO32_PRELINK + (unsigned long)(base)); \ + (void __user *)(VDSO32_##name - VDSO32_PRELINK + \ + (unsigned long)(base)); \ }) #endif diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 74fcb963595b..36ec21c36d68 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -25,6 +25,88 @@ * */ +#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 + +#define EXIT_REASON_EXCEPTION_NMI 0 +#define EXIT_REASON_EXTERNAL_INTERRUPT 1 +#define EXIT_REASON_TRIPLE_FAULT 2 + +#define EXIT_REASON_PENDING_INTERRUPT 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMOFF 26 +#define EXIT_REASON_VMON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_IO_INSTRUCTION 30 +#define EXIT_REASON_MSR_READ 31 +#define EXIT_REASON_MSR_WRITE 32 +#define EXIT_REASON_INVALID_STATE 33 +#define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MONITOR_INSTRUCTION 39 +#define EXIT_REASON_PAUSE_INSTRUCTION 40 +#define EXIT_REASON_MCE_DURING_VMENTRY 41 +#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 +#define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_EPT_VIOLATION 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_INVPCID 58 + +#define VMX_EXIT_REASONS \ + { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ + { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ + { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ + { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ + { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ + { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ + { EXIT_REASON_CPUID, "CPUID" }, \ + { EXIT_REASON_HLT, "HLT" }, \ + { EXIT_REASON_INVLPG, "INVLPG" }, \ + { EXIT_REASON_RDPMC, "RDPMC" }, \ + { EXIT_REASON_RDTSC, "RDTSC" }, \ + { EXIT_REASON_VMCALL, "VMCALL" }, \ + { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \ + { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \ + { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \ + { EXIT_REASON_VMPTRST, "VMPTRST" }, \ + { EXIT_REASON_VMREAD, "VMREAD" }, \ + { EXIT_REASON_VMRESUME, "VMRESUME" }, \ + { EXIT_REASON_VMWRITE, "VMWRITE" }, \ + { EXIT_REASON_VMOFF, "VMOFF" }, \ + { EXIT_REASON_VMON, "VMON" }, \ + { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \ + { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \ + { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ + { EXIT_REASON_MSR_READ, "MSR_READ" }, \ + { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ + { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ + { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ + { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \ + { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ + { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ + { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ + { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ + { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ + { EXIT_REASON_WBINVD, "WBINVD" } + +#ifdef __KERNEL__ + #include <linux/types.h> /* @@ -241,49 +323,6 @@ enum vmcs_field { HOST_RIP = 0x00006c16, }; -#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 - -#define EXIT_REASON_EXCEPTION_NMI 0 -#define EXIT_REASON_EXTERNAL_INTERRUPT 1 -#define EXIT_REASON_TRIPLE_FAULT 2 - -#define EXIT_REASON_PENDING_INTERRUPT 7 -#define EXIT_REASON_NMI_WINDOW 8 -#define EXIT_REASON_TASK_SWITCH 9 -#define EXIT_REASON_CPUID 10 -#define EXIT_REASON_HLT 12 -#define EXIT_REASON_INVD 13 -#define EXIT_REASON_INVLPG 14 -#define EXIT_REASON_RDPMC 15 -#define EXIT_REASON_RDTSC 16 -#define EXIT_REASON_VMCALL 18 -#define EXIT_REASON_VMCLEAR 19 -#define EXIT_REASON_VMLAUNCH 20 -#define EXIT_REASON_VMPTRLD 21 -#define EXIT_REASON_VMPTRST 22 -#define EXIT_REASON_VMREAD 23 -#define EXIT_REASON_VMRESUME 24 -#define EXIT_REASON_VMWRITE 25 -#define EXIT_REASON_VMOFF 26 -#define EXIT_REASON_VMON 27 -#define EXIT_REASON_CR_ACCESS 28 -#define EXIT_REASON_DR_ACCESS 29 -#define EXIT_REASON_IO_INSTRUCTION 30 -#define EXIT_REASON_MSR_READ 31 -#define EXIT_REASON_MSR_WRITE 32 -#define EXIT_REASON_INVALID_STATE 33 -#define EXIT_REASON_MWAIT_INSTRUCTION 36 -#define EXIT_REASON_MONITOR_INSTRUCTION 39 -#define EXIT_REASON_PAUSE_INSTRUCTION 40 -#define EXIT_REASON_MCE_DURING_VMENTRY 41 -#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 -#define EXIT_REASON_APIC_ACCESS 44 -#define EXIT_REASON_EPT_VIOLATION 48 -#define EXIT_REASON_EPT_MISCONFIG 49 -#define EXIT_REASON_WBINVD 54 -#define EXIT_REASON_XSETBV 55 -#define EXIT_REASON_INVPCID 58 - /* * Interruption-information format */ @@ -488,3 +527,5 @@ enum vm_instruction_error_number { }; #endif + +#endif diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 38155f667144..57693498519c 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -81,12 +81,13 @@ struct x86_init_mapping { /** * struct x86_init_paging - platform specific paging functions - * @pagetable_setup_start: platform specific pre paging_init() call - * @pagetable_setup_done: platform specific post paging_init() call + * @pagetable_init: platform specific paging initialization call to setup + * the kernel pagetables and prepare accessors functions. + * Callback must call paging_init(). Called once after the + * direct mapping for phys memory is available. */ struct x86_init_paging { - void (*pagetable_setup_start)(pgd_t *base); - void (*pagetable_setup_done)(pgd_t *base); + void (*pagetable_init)(void); }; /** diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index cbf0c9d50b92..1707cfa928fb 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -47,6 +47,10 @@ #endif #ifndef __ASSEMBLY__ +/* Explicitly size integers that represent pfns in the public interface + * with Xen so that on ARM we can have one ABI that works for 32 and 64 + * bit guests. */ +typedef unsigned long xen_pfn_t; /* Guest handles for primitive C types. */ __DEFINE_GUEST_HANDLE(uchar, unsigned char); __DEFINE_GUEST_HANDLE(uint, unsigned int); @@ -57,6 +61,7 @@ DEFINE_GUEST_HANDLE(long); DEFINE_GUEST_HANDLE(void); DEFINE_GUEST_HANDLE(uint64_t); DEFINE_GUEST_HANDLE(uint32_t); +DEFINE_GUEST_HANDLE(xen_pfn_t); #endif #ifndef HYPERVISOR_VIRT_START @@ -116,11 +121,13 @@ struct arch_shared_info { #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_32 -#include "interface_32.h" +#include <asm/xen/interface_32.h> #else -#include "interface_64.h" +#include <asm/xen/interface_64.h> #endif +#include <asm/pvclock-abi.h> + #ifndef __ASSEMBLY__ /* * The following is all CPU context. Note that the fpu_ctxt block is filled diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h index 1be1ab7d6a41..ee52fcac6f72 100644 --- a/arch/x86/include/asm/xen/swiotlb-xen.h +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -5,10 +5,12 @@ extern int xen_swiotlb; extern int __init pci_xen_swiotlb_detect(void); extern void __init pci_xen_swiotlb_init(void); +extern int pci_xen_swiotlb_init_late(void); #else #define xen_swiotlb (0) static inline int __init pci_xen_swiotlb_detect(void) { return 0; } static inline void __init pci_xen_swiotlb_init(void) { } +static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; } #endif #endif /* _ASM_X86_SWIOTLB_XEN_H */ diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h index 7fcf6f3dbcc3..f8fde90bc45e 100644 --- a/arch/x86/include/asm/xor.h +++ b/arch/x86/include/asm/xor.h @@ -3,8 +3,8 @@ # include <asm-generic/xor.h> #else #ifdef CONFIG_X86_32 -# include "xor_32.h" +# include <asm/xor_32.h> #else -# include "xor_64.h" +# include <asm/xor_64.h> #endif #endif diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index 454570891bdc..f79cb7ec0e06 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -534,38 +534,6 @@ static struct xor_block_template xor_block_p5_mmx = { * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) */ -#define XMMS_SAVE \ -do { \ - preempt_disable(); \ - cr0 = read_cr0(); \ - clts(); \ - asm volatile( \ - "movups %%xmm0,(%0) ;\n\t" \ - "movups %%xmm1,0x10(%0) ;\n\t" \ - "movups %%xmm2,0x20(%0) ;\n\t" \ - "movups %%xmm3,0x30(%0) ;\n\t" \ - : \ - : "r" (xmm_save) \ - : "memory"); \ -} while (0) - -#define XMMS_RESTORE \ -do { \ - asm volatile( \ - "sfence ;\n\t" \ - "movups (%0),%%xmm0 ;\n\t" \ - "movups 0x10(%0),%%xmm1 ;\n\t" \ - "movups 0x20(%0),%%xmm2 ;\n\t" \ - "movups 0x30(%0),%%xmm3 ;\n\t" \ - : \ - : "r" (xmm_save) \ - : "memory"); \ - write_cr0(cr0); \ - preempt_enable(); \ -} while (0) - -#define ALIGN16 __attribute__((aligned(16))) - #define OFFS(x) "16*("#x")" #define PF_OFFS(x) "256+16*("#x")" #define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n" @@ -587,10 +555,8 @@ static void xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) { unsigned long lines = bytes >> 8; - char xmm_save[16*4] ALIGN16; - int cr0; - XMMS_SAVE; + kernel_fpu_begin(); asm volatile( #undef BLOCK @@ -633,7 +599,7 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) : : "memory"); - XMMS_RESTORE; + kernel_fpu_end(); } static void @@ -641,10 +607,8 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, unsigned long *p3) { unsigned long lines = bytes >> 8; - char xmm_save[16*4] ALIGN16; - int cr0; - XMMS_SAVE; + kernel_fpu_begin(); asm volatile( #undef BLOCK @@ -694,7 +658,7 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, : : "memory" ); - XMMS_RESTORE; + kernel_fpu_end(); } static void @@ -702,10 +666,8 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, unsigned long *p3, unsigned long *p4) { unsigned long lines = bytes >> 8; - char xmm_save[16*4] ALIGN16; - int cr0; - XMMS_SAVE; + kernel_fpu_begin(); asm volatile( #undef BLOCK @@ -762,7 +724,7 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, : : "memory" ); - XMMS_RESTORE; + kernel_fpu_end(); } static void @@ -770,10 +732,8 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, unsigned long *p3, unsigned long *p4, unsigned long *p5) { unsigned long lines = bytes >> 8; - char xmm_save[16*4] ALIGN16; - int cr0; - XMMS_SAVE; + kernel_fpu_begin(); /* Make sure GCC forgets anything it knows about p4 or p5, such that it won't pass to the asm volatile below a @@ -850,7 +810,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, like assuming they have some legal value. */ asm("" : "=r" (p4), "=r" (p5)); - XMMS_RESTORE; + kernel_fpu_end(); } static struct xor_block_template xor_block_pIII_sse = { @@ -862,7 +822,7 @@ static struct xor_block_template xor_block_pIII_sse = { }; /* Also try the AVX routines */ -#include "xor_avx.h" +#include <asm/xor_avx.h> /* Also try the generic routines. */ #include <asm-generic/xor.h> diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h index b9b2323e90fe..87ac522c4af5 100644 --- a/arch/x86/include/asm/xor_64.h +++ b/arch/x86/include/asm/xor_64.h @@ -34,41 +34,7 @@ * no advantages to be gotten from x86-64 here anyways. */ -typedef struct { - unsigned long a, b; -} __attribute__((aligned(16))) xmm_store_t; - -/* Doesn't use gcc to save the XMM registers, because there is no easy way to - tell it to do a clts before the register saving. */ -#define XMMS_SAVE \ -do { \ - preempt_disable(); \ - asm volatile( \ - "movq %%cr0,%0 ;\n\t" \ - "clts ;\n\t" \ - "movups %%xmm0,(%1) ;\n\t" \ - "movups %%xmm1,0x10(%1) ;\n\t" \ - "movups %%xmm2,0x20(%1) ;\n\t" \ - "movups %%xmm3,0x30(%1) ;\n\t" \ - : "=&r" (cr0) \ - : "r" (xmm_save) \ - : "memory"); \ -} while (0) - -#define XMMS_RESTORE \ -do { \ - asm volatile( \ - "sfence ;\n\t" \ - "movups (%1),%%xmm0 ;\n\t" \ - "movups 0x10(%1),%%xmm1 ;\n\t" \ - "movups 0x20(%1),%%xmm2 ;\n\t" \ - "movups 0x30(%1),%%xmm3 ;\n\t" \ - "movq %0,%%cr0 ;\n\t" \ - : \ - : "r" (cr0), "r" (xmm_save) \ - : "memory"); \ - preempt_enable(); \ -} while (0) +#include <asm/i387.h> #define OFFS(x) "16*("#x")" #define PF_OFFS(x) "256+16*("#x")" @@ -91,10 +57,8 @@ static void xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) { unsigned int lines = bytes >> 8; - unsigned long cr0; - xmm_store_t xmm_save[4]; - XMMS_SAVE; + kernel_fpu_begin(); asm volatile( #undef BLOCK @@ -135,7 +99,7 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) : [inc] "r" (256UL) : "memory"); - XMMS_RESTORE; + kernel_fpu_end(); } static void @@ -143,11 +107,8 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, unsigned long *p3) { unsigned int lines = bytes >> 8; - xmm_store_t xmm_save[4]; - unsigned long cr0; - - XMMS_SAVE; + kernel_fpu_begin(); asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -194,7 +155,7 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) : [inc] "r" (256UL) : "memory"); - XMMS_RESTORE; + kernel_fpu_end(); } static void @@ -202,10 +163,8 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, unsigned long *p3, unsigned long *p4) { unsigned int lines = bytes >> 8; - xmm_store_t xmm_save[4]; - unsigned long cr0; - XMMS_SAVE; + kernel_fpu_begin(); asm volatile( #undef BLOCK @@ -261,7 +220,7 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, : [inc] "r" (256UL) : "memory" ); - XMMS_RESTORE; + kernel_fpu_end(); } static void @@ -269,10 +228,8 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, unsigned long *p3, unsigned long *p4, unsigned long *p5) { unsigned int lines = bytes >> 8; - xmm_store_t xmm_save[4]; - unsigned long cr0; - XMMS_SAVE; + kernel_fpu_begin(); asm volatile( #undef BLOCK @@ -336,7 +293,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, : [inc] "r" (256UL) : "memory"); - XMMS_RESTORE; + kernel_fpu_end(); } static struct xor_block_template xor_block_sse = { @@ -349,7 +306,7 @@ static struct xor_block_template xor_block_sse = { /* Also try the AVX routines */ -#include "xor_avx.h" +#include <asm/xor_avx.h> #undef XOR_TRY_TEMPLATES #define XOR_TRY_TEMPLATES \ diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h index 2510d35f480e..7ea79c5fa1f2 100644 --- a/arch/x86/include/asm/xor_avx.h +++ b/arch/x86/include/asm/xor_avx.h @@ -20,32 +20,6 @@ #include <linux/compiler.h> #include <asm/i387.h> -#define ALIGN32 __aligned(32) - -#define YMM_SAVED_REGS 4 - -#define YMMS_SAVE \ -do { \ - preempt_disable(); \ - cr0 = read_cr0(); \ - clts(); \ - asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \ - asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \ - asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \ - asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \ -} while (0); - -#define YMMS_RESTORE \ -do { \ - asm volatile("sfence" : : : "memory"); \ - asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \ - asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \ - asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \ - asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \ - write_cr0(cr0); \ - preempt_enable(); \ -} while (0); - #define BLOCK4(i) \ BLOCK(32 * i, 0) \ BLOCK(32 * (i + 1), 1) \ @@ -60,10 +34,9 @@ do { \ static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1) { - unsigned long cr0, lines = bytes >> 9; - char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + unsigned long lines = bytes >> 9; - YMMS_SAVE + kernel_fpu_begin(); while (lines--) { #undef BLOCK @@ -82,16 +55,15 @@ do { \ p1 = (unsigned long *)((uintptr_t)p1 + 512); } - YMMS_RESTORE + kernel_fpu_end(); } static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1, unsigned long *p2) { - unsigned long cr0, lines = bytes >> 9; - char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + unsigned long lines = bytes >> 9; - YMMS_SAVE + kernel_fpu_begin(); while (lines--) { #undef BLOCK @@ -113,16 +85,15 @@ do { \ p2 = (unsigned long *)((uintptr_t)p2 + 512); } - YMMS_RESTORE + kernel_fpu_end(); } static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1, unsigned long *p2, unsigned long *p3) { - unsigned long cr0, lines = bytes >> 9; - char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + unsigned long lines = bytes >> 9; - YMMS_SAVE + kernel_fpu_begin(); while (lines--) { #undef BLOCK @@ -147,16 +118,15 @@ do { \ p3 = (unsigned long *)((uintptr_t)p3 + 512); } - YMMS_RESTORE + kernel_fpu_end(); } static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1, unsigned long *p2, unsigned long *p3, unsigned long *p4) { - unsigned long cr0, lines = bytes >> 9; - char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + unsigned long lines = bytes >> 9; - YMMS_SAVE + kernel_fpu_begin(); while (lines--) { #undef BLOCK @@ -184,7 +154,7 @@ do { \ p4 = (unsigned long *)((uintptr_t)p4 + 512); } - YMMS_RESTORE + kernel_fpu_end(); } static struct xor_block_template xor_block_avx = { diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 8a1b6f9b594a..0415cdabb5a6 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -34,17 +34,14 @@ extern unsigned int xstate_size; extern u64 pcntxt_mask; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; +extern struct xsave_struct *init_xstate_buf; extern void xsave_init(void); extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); extern int init_fpu(struct task_struct *child); -extern int check_for_xstate(struct i387_fxsave_struct __user *buf, - void __user *fpstate, - struct _fpx_sw_bytes *sw); -static inline int fpu_xrstor_checking(struct fpu *fpu) +static inline int fpu_xrstor_checking(struct xsave_struct *fx) { - struct xsave_struct *fx = &fpu->state->xsave; int err; asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t" @@ -69,13 +66,13 @@ static inline int xsave_user(struct xsave_struct __user *buf) * Clear the xsave header first, so that reserved fields are * initialized to zero. */ - err = __clear_user(&buf->xsave_hdr, - sizeof(struct xsave_hdr_struct)); + err = __clear_user(&buf->xsave_hdr, sizeof(buf->xsave_hdr)); if (unlikely(err)) return -EFAULT; - __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n" - "2:\n" + __asm__ __volatile__(ASM_STAC "\n" + "1: .byte " REX_PREFIX "0x0f,0xae,0x27\n" + "2: " ASM_CLAC "\n" ".section .fixup,\"ax\"\n" "3: movl $-1,%[err]\n" " jmp 2b\n" @@ -84,9 +81,6 @@ static inline int xsave_user(struct xsave_struct __user *buf) : [err] "=r" (err) : "D" (buf), "a" (-1), "d" (-1), "0" (0) : "memory"); - if (unlikely(err) && __clear_user(buf, xstate_size)) - err = -EFAULT; - /* No need to clear here because the caller clears USED_MATH */ return err; } @@ -97,8 +91,9 @@ static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask) u32 lmask = mask; u32 hmask = mask >> 32; - __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n" - "2:\n" + __asm__ __volatile__(ASM_STAC "\n" + "1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n" + "2: " ASM_CLAC "\n" ".section .fixup,\"ax\"\n" "3: movl $-1,%[err]\n" " jmp 2b\n" diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild new file mode 100644 index 000000000000..83b6e9a0dce4 --- /dev/null +++ b/arch/x86/include/uapi/asm/Kbuild @@ -0,0 +1,6 @@ +# UAPI Header export list +include include/uapi/asm-generic/Kbuild.asm + +genhdr-y += unistd_32.h +genhdr-y += unistd_64.h +genhdr-y += unistd_x32.h diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8215e5652d97..8d7a619718b5 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -100,6 +100,8 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_OF) += devicetree.o obj-$(CONFIG_UPROBES) += uprobes.o +obj-$(CONFIG_PERF_EVENTS) += perf_regs.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index b2297e58c6ed..e651f7a589ac 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -656,7 +656,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) acpi_register_lapic(physid, ACPI_MADT_ENABLED); /* - * If mp_register_lapic successfully generates a new logical cpu + * If acpi_register_lapic successfully generates a new logical cpu * number, then the following will get us exactly what was mapped */ cpumask_andnot(new_map, cpu_present_mask, tmp_map); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 1b8e5a03d942..11676cf65aee 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -43,17 +43,22 @@ int acpi_suspend_lowlevel(void) header->video_mode = saved_video_mode; + header->pmode_behavior = 0; + #ifndef CONFIG_64BIT store_gdt((struct desc_ptr *)&header->pmode_gdt); - if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low, - &header->pmode_efer_high)) - header->pmode_efer_low = header->pmode_efer_high = 0; + if (!rdmsr_safe(MSR_EFER, + &header->pmode_efer_low, + &header->pmode_efer_high)) + header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER); #endif /* !CONFIG_64BIT */ header->pmode_cr0 = read_cr0(); - header->pmode_cr4 = read_cr4_safe(); - header->pmode_behavior = 0; + if (__this_cpu_read(cpu_info.cpuid_level) >= 0) { + header->pmode_cr4 = read_cr4(); + header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4); + } if (!rdmsr_safe(MSR_IA32_MISC_ENABLE, &header->pmode_misc_en_low, &header->pmode_misc_en_high)) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ced4534baed5..ef5ccca79a6c 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -23,19 +23,6 @@ #define MAX_PATCH_LEN (255-1) -#ifdef CONFIG_HOTPLUG_CPU -static int smp_alt_once; - -static int __init bootonly(char *str) -{ - smp_alt_once = 1; - return 1; -} -__setup("smp-alt-boot", bootonly); -#else -#define smp_alt_once 1 -#endif - static int __initdata_or_module debug_alternative; static int __init debug_alt(char *str) @@ -317,7 +304,7 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end, /* turn DS segment override prefix into lock prefix */ if (*ptr == 0x3e) text_poke(ptr, ((unsigned char []){0xf0}), 1); - }; + } mutex_unlock(&text_mutex); } @@ -326,9 +313,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end, { const s32 *poff; - if (noreplace_smp) - return; - mutex_lock(&text_mutex); for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; @@ -338,7 +322,7 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end, /* turn lock prefix into DS segment override prefix */ if (*ptr == 0xf0) text_poke(ptr, ((unsigned char []){0x3E}), 1); - }; + } mutex_unlock(&text_mutex); } @@ -359,7 +343,7 @@ struct smp_alt_module { }; static LIST_HEAD(smp_alt_modules); static DEFINE_MUTEX(smp_alt); -static int smp_mode = 1; /* protected by smp_alt */ +static bool uniproc_patched = false; /* protected by smp_alt */ void __init_or_module alternatives_smp_module_add(struct module *mod, char *name, @@ -368,19 +352,18 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, { struct smp_alt_module *smp; - if (noreplace_smp) - return; + mutex_lock(&smp_alt); + if (!uniproc_patched) + goto unlock; - if (smp_alt_once) { - if (boot_cpu_has(X86_FEATURE_UP)) - alternatives_smp_unlock(locks, locks_end, - text, text_end); - return; - } + if (num_possible_cpus() == 1) + /* Don't bother remembering, we'll never have to undo it. */ + goto smp_unlock; smp = kzalloc(sizeof(*smp), GFP_KERNEL); if (NULL == smp) - return; /* we'll run the (safe but slow) SMP code then ... */ + /* we'll run the (safe but slow) SMP code then ... */ + goto unlock; smp->mod = mod; smp->name = name; @@ -392,11 +375,10 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, __func__, smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); - mutex_lock(&smp_alt); list_add_tail(&smp->next, &smp_alt_modules); - if (boot_cpu_has(X86_FEATURE_UP)) - alternatives_smp_unlock(smp->locks, smp->locks_end, - smp->text, smp->text_end); +smp_unlock: + alternatives_smp_unlock(locks, locks_end, text, text_end); +unlock: mutex_unlock(&smp_alt); } @@ -404,24 +386,18 @@ void __init_or_module alternatives_smp_module_del(struct module *mod) { struct smp_alt_module *item; - if (smp_alt_once || noreplace_smp) - return; - mutex_lock(&smp_alt); list_for_each_entry(item, &smp_alt_modules, next) { if (mod != item->mod) continue; list_del(&item->next); - mutex_unlock(&smp_alt); - DPRINTK("%s: %s\n", __func__, item->name); kfree(item); - return; + break; } mutex_unlock(&smp_alt); } -bool skip_smp_alternatives; -void alternatives_smp_switch(int smp) +void alternatives_enable_smp(void) { struct smp_alt_module *mod; @@ -436,34 +412,21 @@ void alternatives_smp_switch(int smp) pr_info("lockdep: fixing up alternatives\n"); #endif - if (noreplace_smp || smp_alt_once || skip_smp_alternatives) - return; - BUG_ON(!smp && (num_online_cpus() > 1)); + /* Why bother if there are no other CPUs? */ + BUG_ON(num_possible_cpus() == 1); mutex_lock(&smp_alt); - /* - * Avoid unnecessary switches because it forces JIT based VMs to - * throw away all cached translations, which can be quite costly. - */ - if (smp == smp_mode) { - /* nothing */ - } else if (smp) { + if (uniproc_patched) { pr_info("switching to SMP code\n"); + BUG_ON(num_online_cpus() != 1); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_lock(mod->locks, mod->locks_end, mod->text, mod->text_end); - } else { - pr_info("switching to UP code\n"); - set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); - set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); - list_for_each_entry(mod, &smp_alt_modules, next) - alternatives_smp_unlock(mod->locks, mod->locks_end, - mod->text, mod->text_end); + uniproc_patched = false; } - smp_mode = smp; mutex_unlock(&smp_alt); } @@ -540,40 +503,22 @@ void __init alternative_instructions(void) apply_alternatives(__alt_instructions, __alt_instructions_end); - /* switch to patch-once-at-boottime-only mode and free the - * tables in case we know the number of CPUs will never ever - * change */ -#ifdef CONFIG_HOTPLUG_CPU - if (num_possible_cpus() < 2) - smp_alt_once = 1; -#endif - #ifdef CONFIG_SMP - if (smp_alt_once) { - if (1 == num_possible_cpus()) { - pr_info("switching to UP code\n"); - set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); - set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); - - alternatives_smp_unlock(__smp_locks, __smp_locks_end, - _text, _etext); - } - } else { + /* Patch to UP if other cpus not imminent. */ + if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { + uniproc_patched = true; alternatives_smp_module_add(NULL, "core kernel", __smp_locks, __smp_locks_end, _text, _etext); - - /* Only switch to UP mode if we don't immediately boot others */ - if (num_present_cpus() == 1 || setup_max_cpus <= 1) - alternatives_smp_switch(0); } -#endif - apply_paravirt(__parainstructions, __parainstructions_end); - if (smp_alt_once) + if (!uniproc_patched || num_possible_cpus() == 1) free_init_pages("SMP alternatives", (unsigned long)__smp_locks, (unsigned long)__smp_locks_end); +#endif + + apply_paravirt(__parainstructions, __parainstructions_end); restart_nmi(); } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 24deb3082328..b17416e72fbd 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1934,7 +1934,7 @@ void smp_error_interrupt(struct pt_regs *regs) apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); i++; v1 >>= 1; - }; + } apic_printk(APIC_DEBUG, KERN_CONT "\n"); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9d92e19039f0..f7e98a2c0d12 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -737,6 +737,72 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, } #endif +static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) +{ + if (!cpu_has_invlpg) + return; + + tlb_flushall_shift = 5; + + if (c->x86 <= 0x11) + tlb_flushall_shift = 4; +} + +static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c) +{ + u32 ebx, eax, ecx, edx; + u16 mask = 0xfff; + + if (c->x86 < 0xf) + return; + + if (c->extended_cpuid_level < 0x80000006) + return; + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; + tlb_lli_4k[ENTRIES] = ebx & mask; + + /* + * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB + * characteristics from the CPUID function 0x80000005 instead. + */ + if (c->x86 == 0xf) { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + mask = 0xff; + } + + /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!((eax >> 16) & mask)) { + u32 a, b, c, d; + + cpuid(0x80000005, &a, &b, &c, &d); + tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff; + } else { + tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + } + + /* a 4M entry uses two 2M entries */ + tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + + /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!(eax & mask)) { + /* Erratum 658 */ + if (c->x86 == 0x15 && c->x86_model <= 0x1f) { + tlb_lli_2m[ENTRIES] = 1024; + } else { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + tlb_lli_2m[ENTRIES] = eax & 0xff; + } + } else + tlb_lli_2m[ENTRIES] = eax & mask; + + tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + + cpu_set_tlb_flushall_shift(c); +} + static const struct cpu_dev __cpuinitconst amd_cpu_dev = { .c_vendor = "AMD", .c_ident = { "AuthenticAMD" }, @@ -756,6 +822,7 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = { .c_size_cache = amd_size_cache, #endif .c_early_init = early_init_amd, + .c_detect_tlb = cpu_detect_tlb_amd, .c_bsp_init = bsp_init_amd, .c_init = init_amd, .c_x86_vendor = X86_VENDOR_AMD, diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c97bb7b5a9f8..d0e910da16c5 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -165,10 +165,15 @@ void __init check_bugs(void) print_cpu_info(&boot_cpu_data); #endif check_config(); - check_fpu(); check_hlt(); check_popad(); init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); alternative_instructions(); + + /* + * kernel_fpu_begin/end() in check_fpu() relies on the patched + * alternative instructions. + */ + check_fpu(); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a5fbc3c5fccc..7505f7b13e71 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -259,23 +259,36 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) } #endif -static int disable_smep __cpuinitdata; static __init int setup_disable_smep(char *arg) { - disable_smep = 1; + setup_clear_cpu_cap(X86_FEATURE_SMEP); return 1; } __setup("nosmep", setup_disable_smep); -static __cpuinit void setup_smep(struct cpuinfo_x86 *c) +static __always_inline void setup_smep(struct cpuinfo_x86 *c) { - if (cpu_has(c, X86_FEATURE_SMEP)) { - if (unlikely(disable_smep)) { - setup_clear_cpu_cap(X86_FEATURE_SMEP); - clear_in_cr4(X86_CR4_SMEP); - } else - set_in_cr4(X86_CR4_SMEP); - } + if (cpu_has(c, X86_FEATURE_SMEP)) + set_in_cr4(X86_CR4_SMEP); +} + +static __init int setup_disable_smap(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_SMAP); + return 1; +} +__setup("nosmap", setup_disable_smap); + +static __always_inline void setup_smap(struct cpuinfo_x86 *c) +{ + unsigned long eflags; + + /* This should have been cleared long ago */ + raw_local_save_flags(eflags); + BUG_ON(eflags & X86_EFLAGS_AC); + + if (cpu_has(c, X86_FEATURE_SMAP)) + set_in_cr4(X86_CR4_SMAP); } /* @@ -476,7 +489,7 @@ void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c) printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ - "tlb_flushall_shift is 0x%x\n", + "tlb_flushall_shift: %d\n", tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], @@ -712,8 +725,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) c->cpu_index = 0; filter_cpuid_features(c, false); - setup_smep(c); - if (this_cpu->c_bsp_init) this_cpu->c_bsp_init(c); } @@ -798,8 +809,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) c->phys_proc_id = c->initial_apicid; } - setup_smep(c); - get_model_name(c); /* Default name */ detect_nopl(c); @@ -864,6 +873,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); + /* Set up SMEP/SMAP */ + setup_smep(c); + setup_smap(c); + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." @@ -942,8 +955,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif - if (boot_cpu_data.cpuid_level >= 2) - cpu_detect_tlb(&boot_cpu_data); + cpu_detect_tlb(&boot_cpu_data); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -1023,14 +1035,16 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) printk(KERN_CONT "%s ", vendor); if (c->x86_model_id[0]) - printk(KERN_CONT "%s", c->x86_model_id); + printk(KERN_CONT "%s", strim(c->x86_model_id)); else printk(KERN_CONT "%d86", c->x86); + printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model); + if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT " stepping %02x\n", c->x86_mask); + printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask); else - printk(KERN_CONT "\n"); + printk(KERN_CONT ")\n"); print_cpu_msr(c); } @@ -1113,11 +1127,10 @@ void syscall_init(void) /* Flags to clear on syscall */ wrmsrl(MSR_SYSCALL_MASK, - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| + X86_EFLAGS_IOPL|X86_EFLAGS_AC); } -unsigned long kernel_eflags; - /* * Copies of the original ist values from the tss are only accessed during * debugging, no special alignment required. @@ -1297,9 +1310,6 @@ void __cpuinit cpu_init(void) dbg_restore_debug_regs(); fpu_init(); - xsave_init(); - - raw_local_save_flags(kernel_eflags); if (is_uv_system()) uv_cpu_init(); @@ -1352,6 +1362,5 @@ void __cpuinit cpu_init(void) dbg_restore_debug_regs(); fpu_init(); - xsave_init(); } #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 0a4ce2980a5a..198e019a531a 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -648,6 +648,10 @@ static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c) int i, j, n; unsigned int regs[4]; unsigned char *desc = (unsigned char *)regs; + + if (c->cpuid_level < 2) + return; + /* Number of times to iterate */ n = cpuid_eax(2) & 0xFF; diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index fc4beb393577..ddc72f839332 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -78,6 +78,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs) } static cpumask_var_t mce_inject_cpumask; +static DEFINE_MUTEX(mce_inject_mutex); static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) { @@ -194,7 +195,11 @@ static void raise_mce(struct mce *m) put_online_cpus(); } else #endif + { + preempt_disable(); raise_local(); + preempt_enable(); + } } /* Error injection interface */ @@ -225,7 +230,10 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, * so do it a jiffie or two later everywhere. */ schedule_timeout(2); + + mutex_lock(&mce_inject_mutex); raise_mce(&m); + mutex_unlock(&mce_inject_mutex); return usize; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index ed44c8a65858..6a05c1d327a9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -28,6 +28,18 @@ extern int mce_ser; extern struct mce_bank *mce_banks; +#ifdef CONFIG_X86_MCE_INTEL +unsigned long mce_intel_adjust_timer(unsigned long interval); +void mce_intel_cmci_poll(void); +void mce_intel_hcpu_update(unsigned long cpu); +#else +# define mce_intel_adjust_timer mce_adjust_timer_default +static inline void mce_intel_cmci_poll(void) { } +static inline void mce_intel_hcpu_update(unsigned long cpu) { } +#endif + +void mce_timer_kick(unsigned long interval); + #ifdef CONFIG_ACPI_APEI int apei_write_mce(struct mce *m); ssize_t apei_read_mce(struct mce *m, u64 *record_id); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 292d0258311c..29e87d3b2843 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -83,6 +83,7 @@ static int mce_dont_log_ce __read_mostly; int mce_cmci_disabled __read_mostly; int mce_ignore_ce __read_mostly; int mce_ser __read_mostly; +int mce_bios_cmci_threshold __read_mostly; struct mce_bank *mce_banks __read_mostly; @@ -1266,6 +1267,14 @@ static unsigned long check_interval = 5 * 60; /* 5 minutes */ static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); +static unsigned long mce_adjust_timer_default(unsigned long interval) +{ + return interval; +} + +static unsigned long (*mce_adjust_timer)(unsigned long interval) = + mce_adjust_timer_default; + static void mce_timer_fn(unsigned long data) { struct timer_list *t = &__get_cpu_var(mce_timer); @@ -1276,6 +1285,7 @@ static void mce_timer_fn(unsigned long data) if (mce_available(__this_cpu_ptr(&cpu_info))) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); + mce_intel_cmci_poll(); } /* @@ -1283,14 +1293,38 @@ static void mce_timer_fn(unsigned long data) * polling interval, otherwise increase the polling interval. */ iv = __this_cpu_read(mce_next_interval); - if (mce_notify_irq()) + if (mce_notify_irq()) { iv = max(iv / 2, (unsigned long) HZ/100); - else + } else { iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); + iv = mce_adjust_timer(iv); + } __this_cpu_write(mce_next_interval, iv); + /* Might have become 0 after CMCI storm subsided */ + if (iv) { + t->expires = jiffies + iv; + add_timer_on(t, smp_processor_id()); + } +} - t->expires = jiffies + iv; - add_timer_on(t, smp_processor_id()); +/* + * Ensure that the timer is firing in @interval from now. + */ +void mce_timer_kick(unsigned long interval) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned long when = jiffies + interval; + unsigned long iv = __this_cpu_read(mce_next_interval); + + if (timer_pending(t)) { + if (time_before(when, t->expires)) + mod_timer_pinned(t, when); + } else { + t->expires = round_jiffies(when); + add_timer_on(t, smp_processor_id()); + } + if (interval < iv) + __this_cpu_write(mce_next_interval, interval); } /* Must not be called in IRQ context where del_timer_sync() can deadlock */ @@ -1585,6 +1619,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: mce_intel_feature_init(c); + mce_adjust_timer = mce_intel_adjust_timer; break; case X86_VENDOR_AMD: mce_amd_feature_init(c); @@ -1594,23 +1629,28 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) } } -static void __mcheck_cpu_init_timer(void) +static void mce_start_timer(unsigned int cpu, struct timer_list *t) { - struct timer_list *t = &__get_cpu_var(mce_timer); - unsigned long iv = check_interval * HZ; + unsigned long iv = mce_adjust_timer(check_interval * HZ); - setup_timer(t, mce_timer_fn, smp_processor_id()); + __this_cpu_write(mce_next_interval, iv); - if (mce_ignore_ce) + if (mce_ignore_ce || !iv) return; - __this_cpu_write(mce_next_interval, iv); - if (!iv) - return; t->expires = round_jiffies(jiffies + iv); add_timer_on(t, smp_processor_id()); } +static void __mcheck_cpu_init_timer(void) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned int cpu = smp_processor_id(); + + setup_timer(t, mce_timer_fn, cpu); + mce_start_timer(cpu, t); +} + /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) { @@ -1907,6 +1947,7 @@ static struct miscdevice mce_chrdev_device = { * check, or 0 to not wait * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. * mce=nobootlog Don't log MCEs from before booting. + * mce=bios_cmci_threshold Don't program the CMCI threshold */ static int __init mcheck_enable(char *str) { @@ -1926,6 +1967,8 @@ static int __init mcheck_enable(char *str) mce_ignore_ce = 1; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) mce_bootlog = (str[0] == 'b'); + else if (!strcmp(str, "bios_cmci_threshold")) + mce_bios_cmci_threshold = 1; else if (isdigit(str[0])) { get_option(&str, &tolerant); if (*str == ',') { @@ -2166,6 +2209,11 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = { &mce_cmci_disabled }; +static struct dev_ext_attribute dev_attr_bios_cmci_threshold = { + __ATTR(bios_cmci_threshold, 0444, device_show_int, NULL), + &mce_bios_cmci_threshold +}; + static struct device_attribute *mce_device_attrs[] = { &dev_attr_tolerant.attr, &dev_attr_check_interval.attr, @@ -2174,6 +2222,7 @@ static struct device_attribute *mce_device_attrs[] = { &dev_attr_dont_log_ce.attr, &dev_attr_ignore_ce.attr, &dev_attr_cmci_disabled.attr, + &dev_attr_bios_cmci_threshold.attr, NULL }; @@ -2294,38 +2343,33 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) unsigned int cpu = (unsigned long)hcpu; struct timer_list *t = &per_cpu(mce_timer, cpu); - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: - case CPU_ONLINE_FROZEN: mce_device_create(cpu); if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); break; case CPU_DEAD: - case CPU_DEAD_FROZEN: if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); mce_device_remove(cpu); + mce_intel_hcpu_update(cpu); break; case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - del_timer_sync(t); smp_call_function_single(cpu, mce_disable_cpu, &action, 1); + del_timer_sync(t); break; case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - if (!mce_ignore_ce && check_interval) { - t->expires = round_jiffies(jiffies + - per_cpu(mce_next_interval, cpu)); - add_timer_on(t, cpu); - } smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + mce_start_timer(cpu, t); break; - case CPU_POST_DEAD: + } + + if (action == CPU_POST_DEAD) { /* intentionally ignoring frozen here */ cmci_rediscover(cpu); - break; } + return NOTIFY_OK; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 38e49bc95ffc..5f88abf07e9c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -15,6 +15,8 @@ #include <asm/msr.h> #include <asm/mce.h> +#include "mce-internal.h" + /* * Support for Intel Correct Machine Check Interrupts. This allows * the CPU to raise an interrupt when a corrected machine check happened. @@ -30,7 +32,22 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); */ static DEFINE_RAW_SPINLOCK(cmci_discover_lock); -#define CMCI_THRESHOLD 1 +#define CMCI_THRESHOLD 1 +#define CMCI_POLL_INTERVAL (30 * HZ) +#define CMCI_STORM_INTERVAL (1 * HZ) +#define CMCI_STORM_THRESHOLD 15 + +static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); +static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); +static DEFINE_PER_CPU(unsigned int, cmci_storm_state); + +enum { + CMCI_STORM_NONE, + CMCI_STORM_ACTIVE, + CMCI_STORM_SUBSIDED, +}; + +static atomic_t cmci_storm_on_cpus; static int cmci_supported(int *banks) { @@ -53,6 +70,93 @@ static int cmci_supported(int *banks) return !!(cap & MCG_CMCI_P); } +void mce_intel_cmci_poll(void) +{ + if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) + return; + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); +} + +void mce_intel_hcpu_update(unsigned long cpu) +{ + if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) + atomic_dec(&cmci_storm_on_cpus); + + per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; +} + +unsigned long mce_intel_adjust_timer(unsigned long interval) +{ + int r; + + if (interval < CMCI_POLL_INTERVAL) + return interval; + + switch (__this_cpu_read(cmci_storm_state)) { + case CMCI_STORM_ACTIVE: + /* + * We switch back to interrupt mode once the poll timer has + * silenced itself. That means no events recorded and the + * timer interval is back to our poll interval. + */ + __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); + r = atomic_sub_return(1, &cmci_storm_on_cpus); + if (r == 0) + pr_notice("CMCI storm subsided: switching to interrupt mode\n"); + /* FALLTHROUGH */ + + case CMCI_STORM_SUBSIDED: + /* + * We wait for all cpus to go back to SUBSIDED + * state. When that happens we switch back to + * interrupt mode. + */ + if (!atomic_read(&cmci_storm_on_cpus)) { + __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); + cmci_reenable(); + cmci_recheck(); + } + return CMCI_POLL_INTERVAL; + default: + /* + * We have shiny weather. Let the poll do whatever it + * thinks. + */ + return interval; + } +} + +static bool cmci_storm_detect(void) +{ + unsigned int cnt = __this_cpu_read(cmci_storm_cnt); + unsigned long ts = __this_cpu_read(cmci_time_stamp); + unsigned long now = jiffies; + int r; + + if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE) + return true; + + if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { + cnt++; + } else { + cnt = 1; + __this_cpu_write(cmci_time_stamp, now); + } + __this_cpu_write(cmci_storm_cnt, cnt); + + if (cnt <= CMCI_STORM_THRESHOLD) + return false; + + cmci_clear(); + __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); + r = atomic_add_return(1, &cmci_storm_on_cpus); + mce_timer_kick(CMCI_POLL_INTERVAL); + + if (r == 1) + pr_notice("CMCI storm detected: switching to poll mode\n"); + return true; +} + /* * The interrupt handler. This is called on every event. * Just call the poller directly to log any events. @@ -61,33 +165,28 @@ static int cmci_supported(int *banks) */ static void intel_threshold_interrupt(void) { + if (cmci_storm_detect()) + return; machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); mce_notify_irq(); } -static void print_update(char *type, int *hdr, int num) -{ - if (*hdr == 0) - printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); - *hdr = 1; - printk(KERN_CONT " %s:%d", type, num); -} - /* * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks * on this CPU. Use the algorithm recommended in the SDM to discover shared * banks. */ -static void cmci_discover(int banks, int boot) +static void cmci_discover(int banks) { unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); unsigned long flags; - int hdr = 0; int i; + int bios_wrong_thresh = 0; raw_spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; + int bios_zero_thresh = 0; if (test_bit(i, owned)) continue; @@ -96,29 +195,52 @@ static void cmci_discover(int banks, int boot) /* Already owned by someone else? */ if (val & MCI_CTL2_CMCI_EN) { - if (test_and_clear_bit(i, owned) && !boot) - print_update("SHD", &hdr, i); + clear_bit(i, owned); __clear_bit(i, __get_cpu_var(mce_poll_banks)); continue; } - val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; - val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; + if (!mce_bios_cmci_threshold) { + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + val |= CMCI_THRESHOLD; + } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { + /* + * If bios_cmci_threshold boot option was specified + * but the threshold is zero, we'll try to initialize + * it to 1. + */ + bios_zero_thresh = 1; + val |= CMCI_THRESHOLD; + } + + val |= MCI_CTL2_CMCI_EN; wrmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? -- the bank supports CMCI */ if (val & MCI_CTL2_CMCI_EN) { - if (!test_and_set_bit(i, owned) && !boot) - print_update("CMCI", &hdr, i); + set_bit(i, owned); __clear_bit(i, __get_cpu_var(mce_poll_banks)); + /* + * We are able to set thresholds for some banks that + * had a threshold of 0. This means the BIOS has not + * set the thresholds properly or does not work with + * this boot option. Note down now and report later. + */ + if (mce_bios_cmci_threshold && bios_zero_thresh && + (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) + bios_wrong_thresh = 1; } else { WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); } } raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); - if (hdr) - printk(KERN_CONT "\n"); + if (mce_bios_cmci_threshold && bios_wrong_thresh) { + pr_info_once( + "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); + pr_info_once( + "bios_cmci_threshold: Make sure your BIOS supports this boot option\n"); + } } /* @@ -156,7 +278,7 @@ void cmci_clear(void) continue; /* Disable CMCI */ rdmsrl(MSR_IA32_MCx_CTL2(i), val); - val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); + val &= ~MCI_CTL2_CMCI_EN; wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } @@ -186,7 +308,7 @@ void cmci_rediscover(int dying) continue; /* Recheck banks in case CPUs don't all have the same */ if (cmci_supported(&banks)) - cmci_discover(banks, 0); + cmci_discover(banks); } set_cpus_allowed_ptr(current, old); @@ -200,7 +322,7 @@ void cmci_reenable(void) { int banks; if (cmci_supported(&banks)) - cmci_discover(banks, 0); + cmci_discover(banks); } static void intel_init_cmci(void) @@ -211,7 +333,7 @@ static void intel_init_cmci(void) return; mce_threshold_vector = intel_threshold_interrupt; - cmci_discover(banks, 1); + cmci_discover(banks); /* * For CPU #0 this runs with still disabled APIC, but that's * ok because only the vector is set up. We still do another diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl index c7b3fe2d72e0..091972ef49de 100644 --- a/arch/x86/kernel/cpu/mkcapflags.pl +++ b/arch/x86/kernel/cpu/mkcapflags.pl @@ -8,7 +8,10 @@ open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n"; open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n"; -print OUT "#include <asm/cpufeature.h>\n\n"; +print OUT "#ifndef _ASM_X86_CPUFEATURE_H\n"; +print OUT "#include <asm/cpufeature.h>\n"; +print OUT "#endif\n"; +print OUT "\n"; print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; %features = (); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 38e4894165b9..99d96a4978b5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -1950,7 +1950,7 @@ struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cp static struct intel_uncore_box * uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { - static struct intel_uncore_box *box; + struct intel_uncore_box *box; box = *per_cpu_ptr(pmu->box, cpu); if (box) @@ -2347,6 +2347,27 @@ int uncore_pmu_event_init(struct perf_event *event) return ret; } +static ssize_t uncore_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask); + + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} + +static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL); + +static struct attribute *uncore_pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group uncore_pmu_attr_group = { + .attrs = uncore_pmu_attrs, +}; + static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu) { int ret; @@ -2384,8 +2405,8 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) free_percpu(type->pmus[i].box); kfree(type->pmus); type->pmus = NULL; - kfree(type->attr_groups[1]); - type->attr_groups[1] = NULL; + kfree(type->events_group); + type->events_group = NULL; } static void __init uncore_types_exit(struct intel_uncore_type **types) @@ -2437,9 +2458,10 @@ static int __init uncore_type_init(struct intel_uncore_type *type) for (j = 0; j < i; j++) attrs[j] = &type->event_descs[j].attr.attr; - type->attr_groups[1] = events_group; + type->events_group = events_group; } + type->pmu_group = &uncore_pmu_attr_group; type->pmus = pmus; return 0; fail: diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 5b81c1856aac..e68a4550e952 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -369,10 +369,12 @@ struct intel_uncore_type { struct intel_uncore_pmu *pmus; struct intel_uncore_ops *ops; struct uncore_event_desc *event_descs; - const struct attribute_group *attr_groups[3]; + const struct attribute_group *attr_groups[4]; }; -#define format_group attr_groups[0] +#define pmu_group attr_groups[0] +#define format_group attr_groups[1] +#define events_group attr_groups[2] struct intel_uncore_ops { void (*init_box)(struct intel_uncore_box *); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 8022c6681485..fbd895562292 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -140,10 +140,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) static void *c_start(struct seq_file *m, loff_t *pos) { - if (*pos == 0) /* just in case, cpu 0 is not the first */ - *pos = cpumask_first(cpu_online_mask); - else - *pos = cpumask_next(*pos - 1, cpu_online_mask); + *pos = cpumask_next(*pos - 1, cpu_online_mask); if ((*pos) < nr_cpu_ids) return &cpu_data(*pos); return NULL; diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 39472dd2323f..60c78917190c 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -199,12 +199,14 @@ static int __init cpuid_init(void) goto out_chrdev; } cpuid_class->devnode = cpuid_devnode; + get_online_cpus(); for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) goto out_class; } register_hotcpu_notifier(&cpuid_class_cpu_notifier); + put_online_cpus(); err = 0; goto out; @@ -214,6 +216,7 @@ out_class: for_each_online_cpu(i) { cpuid_device_destroy(i); } + put_online_cpus(); class_destroy(cpuid_class); out_chrdev: __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); @@ -225,11 +228,13 @@ static void __exit cpuid_exit(void) { int cpu = 0; + get_online_cpus(); for_each_online_cpu(cpu) cpuid_device_destroy(cpu); class_destroy(cpuid_class); __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); + put_online_cpus(); } module_init(cpuid_init); diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 3ae2ced4a874..b1581527a236 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -342,6 +342,47 @@ const struct irq_domain_ops ioapic_irq_domain_ops = { .xlate = ioapic_xlate, }; +static void dt_add_ioapic_domain(unsigned int ioapic_num, + struct device_node *np) +{ + struct irq_domain *id; + struct mp_ioapic_gsi *gsi_cfg; + int ret; + int num; + + gsi_cfg = mp_ioapic_gsi_routing(ioapic_num); + num = gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1; + + id = irq_domain_add_linear(np, num, &ioapic_irq_domain_ops, + (void *)ioapic_num); + BUG_ON(!id); + if (gsi_cfg->gsi_base == 0) { + /* + * The first NR_IRQS_LEGACY irq descs are allocated in + * early_irq_init() and need just a mapping. The + * remaining irqs need both. All of them are preallocated + * and assigned so we can keep the 1:1 mapping which the ioapic + * is having. + */ + ret = irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY); + if (ret) + pr_err("Error mapping legacy IRQs: %d\n", ret); + + if (num > NR_IRQS_LEGACY) { + ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY, + NR_IRQS_LEGACY, num - NR_IRQS_LEGACY); + if (ret) + pr_err("Error creating mapping for the " + "remaining IRQs: %d\n", ret); + } + irq_set_default_host(id); + } else { + ret = irq_create_strict_mappings(id, gsi_cfg->gsi_base, 0, num); + if (ret) + pr_err("Error creating IRQ mapping: %d\n", ret); + } +} + static void __init ioapic_add_ofnode(struct device_node *np) { struct resource r; @@ -356,15 +397,7 @@ static void __init ioapic_add_ofnode(struct device_node *np) for (i = 0; i < nr_ioapics; i++) { if (r.start == mpc_ioapic_addr(i)) { - struct irq_domain *id; - struct mp_ioapic_gsi *gsi_cfg; - - gsi_cfg = mp_ioapic_gsi_routing(i); - - id = irq_domain_add_legacy(np, 32, gsi_cfg->gsi_base, 0, - &ioapic_irq_domain_ops, - (void*)i); - BUG_ON(!id); + dt_add_ioapic_domain(i, np); return; } } diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 623f28837476..0750e3ba87c0 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -57,6 +57,7 @@ #include <asm/cpufeature.h> #include <asm/alternative-asm.h> #include <asm/asm.h> +#include <asm/smap.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ #include <linux/elf-em.h> @@ -407,7 +408,9 @@ sysenter_past_esp: */ cmpl $__PAGE_OFFSET-3,%ebp jae syscall_fault + ASM_STAC 1: movl (%ebp),%ebp + ASM_CLAC movl %ebp,PT_EBP(%esp) _ASM_EXTABLE(1b,syscall_fault) @@ -488,6 +491,7 @@ ENDPROC(ia32_sysenter_target) # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway + ASM_CLAC pushl_cfi %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) @@ -670,6 +674,7 @@ END(syscall_exit_work) RING0_INT_FRAME # can't unwind into user space anyway syscall_fault: + ASM_CLAC GET_THREAD_INFO(%ebp) movl $-EFAULT,PT_EAX(%esp) jmp resume_userspace @@ -825,6 +830,7 @@ END(interrupt) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: + ASM_CLAC addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ SAVE_ALL TRACE_IRQS_OFF @@ -841,6 +847,7 @@ ENDPROC(common_interrupt) #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ + ASM_CLAC; \ pushl_cfi $~(nr); \ SAVE_ALL; \ TRACE_IRQS_OFF \ @@ -857,6 +864,7 @@ ENDPROC(name) ENTRY(coprocessor_error) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 pushl_cfi $do_coprocessor_error jmp error_code @@ -865,6 +873,7 @@ END(coprocessor_error) ENTRY(simd_coprocessor_error) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ @@ -886,6 +895,7 @@ END(simd_coprocessor_error) ENTRY(device_not_available) RING0_INT_FRAME + ASM_CLAC pushl_cfi $-1 # mark this as an int pushl_cfi $do_device_not_available jmp error_code @@ -906,6 +916,7 @@ END(native_irq_enable_sysexit) ENTRY(overflow) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 pushl_cfi $do_overflow jmp error_code @@ -914,6 +925,7 @@ END(overflow) ENTRY(bounds) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 pushl_cfi $do_bounds jmp error_code @@ -922,6 +934,7 @@ END(bounds) ENTRY(invalid_op) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 pushl_cfi $do_invalid_op jmp error_code @@ -930,6 +943,7 @@ END(invalid_op) ENTRY(coprocessor_segment_overrun) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 pushl_cfi $do_coprocessor_segment_overrun jmp error_code @@ -938,6 +952,7 @@ END(coprocessor_segment_overrun) ENTRY(invalid_TSS) RING0_EC_FRAME + ASM_CLAC pushl_cfi $do_invalid_TSS jmp error_code CFI_ENDPROC @@ -945,6 +960,7 @@ END(invalid_TSS) ENTRY(segment_not_present) RING0_EC_FRAME + ASM_CLAC pushl_cfi $do_segment_not_present jmp error_code CFI_ENDPROC @@ -952,6 +968,7 @@ END(segment_not_present) ENTRY(stack_segment) RING0_EC_FRAME + ASM_CLAC pushl_cfi $do_stack_segment jmp error_code CFI_ENDPROC @@ -959,6 +976,7 @@ END(stack_segment) ENTRY(alignment_check) RING0_EC_FRAME + ASM_CLAC pushl_cfi $do_alignment_check jmp error_code CFI_ENDPROC @@ -966,6 +984,7 @@ END(alignment_check) ENTRY(divide_error) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 # no error code pushl_cfi $do_divide_error jmp error_code @@ -975,6 +994,7 @@ END(divide_error) #ifdef CONFIG_X86_MCE ENTRY(machine_check) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 pushl_cfi machine_check_vector jmp error_code @@ -984,6 +1004,7 @@ END(machine_check) ENTRY(spurious_interrupt_bug) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 pushl_cfi $do_spurious_interrupt_bug jmp error_code @@ -1109,17 +1130,21 @@ ENTRY(ftrace_caller) pushl %eax pushl %ecx pushl %edx - movl 0xc(%esp), %eax + pushl $0 /* Pass NULL as regs pointer */ + movl 4*4(%esp), %eax movl 0x4(%ebp), %edx + leal function_trace_op, %ecx subl $MCOUNT_INSN_SIZE, %eax .globl ftrace_call ftrace_call: call ftrace_stub + addl $4,%esp /* skip NULL pointer */ popl %edx popl %ecx popl %eax +ftrace_ret: #ifdef CONFIG_FUNCTION_GRAPH_TRACER .globl ftrace_graph_call ftrace_graph_call: @@ -1131,6 +1156,71 @@ ftrace_stub: ret END(ftrace_caller) +ENTRY(ftrace_regs_caller) + pushf /* push flags before compare (in cs location) */ + cmpl $0, function_trace_stop + jne ftrace_restore_flags + + /* + * i386 does not save SS and ESP when coming from kernel. + * Instead, to get sp, ®s->sp is used (see ptrace.h). + * Unfortunately, that means eflags must be at the same location + * as the current return ip is. We move the return ip into the + * ip location, and move flags into the return ip location. + */ + pushl 4(%esp) /* save return ip into ip slot */ + + pushl $0 /* Load 0 into orig_ax */ + pushl %gs + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + + movl 13*4(%esp), %eax /* Get the saved flags */ + movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ + /* clobbering return ip */ + movl $__KERNEL_CS,13*4(%esp) + + movl 12*4(%esp), %eax /* Load ip (1st parameter) */ + subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ + movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ + leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ + pushl %esp /* Save pt_regs as 4th parameter */ + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + addl $4, %esp /* Skip pt_regs */ + movl 14*4(%esp), %eax /* Move flags back into cs */ + movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ + movl 12*4(%esp), %eax /* Get return ip from regs->ip */ + movl %eax, 14*4(%esp) /* Put return ip back for ret */ + + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + popl %ds + popl %es + popl %fs + popl %gs + addl $8, %esp /* Skip orig_ax and ip */ + popf /* Pop flags at end (no addl to corrupt flags) */ + jmp ftrace_ret + +ftrace_restore_flags: + popf + jmp ftrace_stub #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) @@ -1171,9 +1261,6 @@ END(mcount) #ifdef CONFIG_FUNCTION_GRAPH_TRACER ENTRY(ftrace_graph_caller) - cmpl $0, function_trace_stop - jne ftrace_stub - pushl %eax pushl %ecx pushl %edx @@ -1207,6 +1294,7 @@ return_to_handler: ENTRY(page_fault) RING0_EC_FRAME + ASM_CLAC pushl_cfi $do_page_fault ALIGN error_code: @@ -1279,6 +1367,7 @@ END(page_fault) ENTRY(debug) RING0_INT_FRAME + ASM_CLAC cmpl $ia32_sysenter_target,(%esp) jne debug_stack_correct FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn @@ -1303,6 +1392,7 @@ END(debug) */ ENTRY(nmi) RING0_INT_FRAME + ASM_CLAC pushl_cfi %eax movl %ss, %eax cmpw $__ESPFIX_SS, %ax @@ -1373,6 +1463,7 @@ END(nmi) ENTRY(int3) RING0_INT_FRAME + ASM_CLAC pushl_cfi $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF @@ -1393,6 +1484,7 @@ END(general_protection) #ifdef CONFIG_KVM_GUEST ENTRY(async_page_fault) RING0_EC_FRAME + ASM_CLAC pushl_cfi $do_async_page_fault jmp error_code CFI_ENDPROC diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 69babd8c834f..44531acd9a81 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -56,6 +56,8 @@ #include <asm/ftrace.h> #include <asm/percpu.h> #include <asm/asm.h> +#include <asm/rcu.h> +#include <asm/smap.h> #include <linux/err.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ @@ -68,25 +70,51 @@ .section .entry.text, "ax" #ifdef CONFIG_FUNCTION_TRACER + +#ifdef CC_USING_FENTRY +# define function_hook __fentry__ +#else +# define function_hook mcount +#endif + #ifdef CONFIG_DYNAMIC_FTRACE -ENTRY(mcount) + +ENTRY(function_hook) retq -END(mcount) +END(function_hook) + +/* skip is set if stack has been adjusted */ +.macro ftrace_caller_setup skip=0 + MCOUNT_SAVE_FRAME \skip + + /* Load the ftrace_ops into the 3rd parameter */ + leaq function_trace_op, %rdx + + /* Load ip into the first parameter */ + movq RIP(%rsp), %rdi + subq $MCOUNT_INSN_SIZE, %rdi + /* Load the parent_ip into the second parameter */ +#ifdef CC_USING_FENTRY + movq SS+16(%rsp), %rsi +#else + movq 8(%rbp), %rsi +#endif +.endm ENTRY(ftrace_caller) + /* Check if tracing was disabled (quick check) */ cmpl $0, function_trace_stop jne ftrace_stub - MCOUNT_SAVE_FRAME - - movq 0x38(%rsp), %rdi - movq 8(%rbp), %rsi - subq $MCOUNT_INSN_SIZE, %rdi + ftrace_caller_setup + /* regs go into 4th parameter (but make it NULL) */ + movq $0, %rcx GLOBAL(ftrace_call) call ftrace_stub MCOUNT_RESTORE_FRAME +ftrace_return: #ifdef CONFIG_FUNCTION_GRAPH_TRACER GLOBAL(ftrace_graph_call) @@ -97,8 +125,78 @@ GLOBAL(ftrace_stub) retq END(ftrace_caller) +ENTRY(ftrace_regs_caller) + /* Save the current flags before compare (in SS location)*/ + pushfq + + /* Check if tracing was disabled (quick check) */ + cmpl $0, function_trace_stop + jne ftrace_restore_flags + + /* skip=8 to skip flags saved in SS */ + ftrace_caller_setup 8 + + /* Save the rest of pt_regs */ + movq %r15, R15(%rsp) + movq %r14, R14(%rsp) + movq %r13, R13(%rsp) + movq %r12, R12(%rsp) + movq %r11, R11(%rsp) + movq %r10, R10(%rsp) + movq %rbp, RBP(%rsp) + movq %rbx, RBX(%rsp) + /* Copy saved flags */ + movq SS(%rsp), %rcx + movq %rcx, EFLAGS(%rsp) + /* Kernel segments */ + movq $__KERNEL_DS, %rcx + movq %rcx, SS(%rsp) + movq $__KERNEL_CS, %rcx + movq %rcx, CS(%rsp) + /* Stack - skipping return address */ + leaq SS+16(%rsp), %rcx + movq %rcx, RSP(%rsp) + + /* regs go into 4th parameter */ + leaq (%rsp), %rcx + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + /* Copy flags back to SS, to restore them */ + movq EFLAGS(%rsp), %rax + movq %rax, SS(%rsp) + + /* Handlers can change the RIP */ + movq RIP(%rsp), %rax + movq %rax, SS+8(%rsp) + + /* restore the rest of pt_regs */ + movq R15(%rsp), %r15 + movq R14(%rsp), %r14 + movq R13(%rsp), %r13 + movq R12(%rsp), %r12 + movq R10(%rsp), %r10 + movq RBP(%rsp), %rbp + movq RBX(%rsp), %rbx + + /* skip=8 to skip flags saved in SS */ + MCOUNT_RESTORE_FRAME 8 + + /* Restore flags */ + popfq + + jmp ftrace_return +ftrace_restore_flags: + popfq + jmp ftrace_stub + +END(ftrace_regs_caller) + + #else /* ! CONFIG_DYNAMIC_FTRACE */ -ENTRY(mcount) + +ENTRY(function_hook) cmpl $0, function_trace_stop jne ftrace_stub @@ -119,8 +217,12 @@ GLOBAL(ftrace_stub) trace: MCOUNT_SAVE_FRAME - movq 0x38(%rsp), %rdi + movq RIP(%rsp), %rdi +#ifdef CC_USING_FENTRY + movq SS+16(%rsp), %rsi +#else movq 8(%rbp), %rsi +#endif subq $MCOUNT_INSN_SIZE, %rdi call *ftrace_trace_function @@ -128,20 +230,22 @@ trace: MCOUNT_RESTORE_FRAME jmp ftrace_stub -END(mcount) +END(function_hook) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER ENTRY(ftrace_graph_caller) - cmpl $0, function_trace_stop - jne ftrace_stub - MCOUNT_SAVE_FRAME +#ifdef CC_USING_FENTRY + leaq SS+16(%rsp), %rdi + movq $0, %rdx /* No framepointers needed */ +#else leaq 8(%rbp), %rdi - movq 0x38(%rsp), %rsi movq (%rbp), %rdx +#endif + movq RIP(%rsp), %rsi subq $MCOUNT_INSN_SIZE, %rsi call prepare_ftrace_return @@ -342,15 +446,15 @@ ENDPROC(native_usergs_sysret64) .macro SAVE_ARGS_IRQ cld /* start from rbp in pt_regs and jump over */ - movq_cfi rdi, RDI-RBP - movq_cfi rsi, RSI-RBP - movq_cfi rdx, RDX-RBP - movq_cfi rcx, RCX-RBP - movq_cfi rax, RAX-RBP - movq_cfi r8, R8-RBP - movq_cfi r9, R9-RBP - movq_cfi r10, R10-RBP - movq_cfi r11, R11-RBP + movq_cfi rdi, (RDI-RBP) + movq_cfi rsi, (RSI-RBP) + movq_cfi rdx, (RDX-RBP) + movq_cfi rcx, (RCX-RBP) + movq_cfi rax, (RAX-RBP) + movq_cfi r8, (R8-RBP) + movq_cfi r9, (R9-RBP) + movq_cfi r10, (R10-RBP) + movq_cfi r11, (R11-RBP) /* Save rbp so that we can unwind from get_irq_regs() */ movq_cfi rbp, 0 @@ -384,7 +488,7 @@ ENDPROC(native_usergs_sysret64) .endm ENTRY(save_rest) - PARTIAL_FRAME 1 REST_SKIP+8 + PARTIAL_FRAME 1 (REST_SKIP+8) movq 5*8+16(%rsp), %r11 /* save return address */ movq_cfi rbx, RBX+16 movq_cfi rbp, RBP+16 @@ -440,7 +544,7 @@ ENTRY(ret_from_fork) LOCK ; btr $TIF_FORK,TI_flags(%r8) - pushq_cfi kernel_eflags(%rip) + pushq_cfi $0x0002 popfq_cfi # reset kernel eflags call schedule_tail # rdi: 'prev' task parameter @@ -465,7 +569,8 @@ END(ret_from_fork) * System call entry. Up to 6 arguments in registers are supported. * * SYSCALL does not save anything on the stack and does not change the - * stack pointer. + * stack pointer. However, it does mask the flags register for us, so + * CLD and CLAC are not needed. */ /* @@ -565,7 +670,7 @@ sysret_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi - call schedule + SCHEDULE_USER popq_cfi %rdi jmp sysret_check @@ -678,7 +783,7 @@ int_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi - call schedule + SCHEDULE_USER popq_cfi %rdi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -884,6 +989,7 @@ END(interrupt) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: + ASM_CLAC XCPT_FRAME addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ @@ -974,7 +1080,7 @@ retint_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi - call schedule + SCHEDULE_USER popq_cfi %rdi GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) @@ -1023,6 +1129,7 @@ END(common_interrupt) */ .macro apicinterrupt num sym do_sym ENTRY(\sym) + ASM_CLAC INTR_FRAME pushq_cfi $~(\num) .Lcommon_\sym: @@ -1077,6 +1184,7 @@ apicinterrupt IRQ_WORK_VECTOR \ */ .macro zeroentry sym do_sym ENTRY(\sym) + ASM_CLAC INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ @@ -1094,6 +1202,7 @@ END(\sym) .macro paranoidzeroentry sym do_sym ENTRY(\sym) + ASM_CLAC INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ @@ -1112,6 +1221,7 @@ END(\sym) #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) .macro paranoidzeroentry_ist sym do_sym ist ENTRY(\sym) + ASM_CLAC INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ @@ -1131,6 +1241,7 @@ END(\sym) .macro errorentry sym do_sym ENTRY(\sym) + ASM_CLAC XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME subq $ORIG_RAX-R15, %rsp @@ -1149,6 +1260,7 @@ END(\sym) /* error code is on the stack already */ .macro paranoiderrorentry sym do_sym ENTRY(\sym) + ASM_CLAC XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME subq $ORIG_RAX-R15, %rsp @@ -1449,7 +1561,7 @@ paranoid_userspace: paranoid_schedule: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) - call schedule + SCHEDULE_USER DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF jmp paranoid_userspace diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c3a7cb4bf6e6..1d414029f1d8 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -206,6 +206,21 @@ static int ftrace_modify_code(unsigned long ip, unsigned const char *old_code, unsigned const char *new_code); +/* + * Should never be called: + * As it is only called by __ftrace_replace_code() which is called by + * ftrace_replace_code() that x86 overrides, and by ftrace_update_code() + * which is called to turn mcount into nops or nops into function calls + * but not to convert a function from not using regs to one that uses + * regs, which ftrace_modify_call() is for. + */ +int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr) +{ + WARN_ON(1); + return -EINVAL; +} + int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); @@ -220,6 +235,14 @@ int ftrace_update_ftrace_func(ftrace_func_t func) ret = ftrace_modify_code(ip, old, new); + /* Also update the regs callback function */ + if (!ret) { + ip = (unsigned long)(&ftrace_regs_call); + memcpy(old, &ftrace_regs_call, MCOUNT_INSN_SIZE); + new = ftrace_call_replace(ip, (unsigned long)func); + ret = ftrace_modify_code(ip, old, new); + } + atomic_dec(&modifying_ftrace_code); return ret; @@ -299,6 +322,32 @@ static int add_brk_on_nop(struct dyn_ftrace *rec) return add_break(rec->ip, old); } +/* + * If the record has the FTRACE_FL_REGS set, that means that it + * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS + * is not not set, then it wants to convert to the normal callback. + */ +static unsigned long get_ftrace_addr(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + +/* + * The FTRACE_FL_REGS_EN is set when the record already points to + * a function that saves all the regs. Basically the '_EN' version + * represents the current state of the function. + */ +static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS_EN) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + static int add_breakpoints(struct dyn_ftrace *rec, int enable) { unsigned long ftrace_addr; @@ -306,7 +355,7 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable) ret = ftrace_test_record(rec, enable); - ftrace_addr = (unsigned long)FTRACE_ADDR; + ftrace_addr = get_ftrace_addr(rec); switch (ret) { case FTRACE_UPDATE_IGNORE: @@ -316,6 +365,10 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable) /* converting nop to call */ return add_brk_on_nop(rec); + case FTRACE_UPDATE_MODIFY_CALL_REGS: + case FTRACE_UPDATE_MODIFY_CALL: + ftrace_addr = get_ftrace_old_addr(rec); + /* fall through */ case FTRACE_UPDATE_MAKE_NOP: /* converting a call to a nop */ return add_brk_on_call(rec, ftrace_addr); @@ -360,13 +413,21 @@ static int remove_breakpoint(struct dyn_ftrace *rec) * If not, don't touch the breakpoint, we make just create * a disaster. */ - ftrace_addr = (unsigned long)FTRACE_ADDR; + ftrace_addr = get_ftrace_addr(rec); + nop = ftrace_call_replace(ip, ftrace_addr); + + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0) + goto update; + + /* Check both ftrace_addr and ftrace_old_addr */ + ftrace_addr = get_ftrace_old_addr(rec); nop = ftrace_call_replace(ip, ftrace_addr); if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) return -EINVAL; } + update: return probe_kernel_write((void *)ip, &nop[0], 1); } @@ -405,12 +466,14 @@ static int add_update(struct dyn_ftrace *rec, int enable) ret = ftrace_test_record(rec, enable); - ftrace_addr = (unsigned long)FTRACE_ADDR; + ftrace_addr = get_ftrace_addr(rec); switch (ret) { case FTRACE_UPDATE_IGNORE: return 0; + case FTRACE_UPDATE_MODIFY_CALL_REGS: + case FTRACE_UPDATE_MODIFY_CALL: case FTRACE_UPDATE_MAKE_CALL: /* converting nop to call */ return add_update_call(rec, ftrace_addr); @@ -455,12 +518,14 @@ static int finish_update(struct dyn_ftrace *rec, int enable) ret = ftrace_update_record(rec, enable); - ftrace_addr = (unsigned long)FTRACE_ADDR; + ftrace_addr = get_ftrace_addr(rec); switch (ret) { case FTRACE_UPDATE_IGNORE: return 0; + case FTRACE_UPDATE_MODIFY_CALL_REGS: + case FTRACE_UPDATE_MODIFY_CALL: case FTRACE_UPDATE_MAKE_CALL: /* converting nop to call */ return finish_update_call(rec, ftrace_addr); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index d42ab17b7397..957a47aec64e 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -287,27 +287,28 @@ ENTRY(startup_32_smp) leal -__PAGE_OFFSET(%ecx),%esp default_entry: - /* * New page tables may be in 4Mbyte page mode and may * be using the global pages. * * NOTE! If we are on a 486 we may have no cr4 at all! - * So we do not try to touch it unless we really have - * some bits in it to set. This won't work if the BSP - * implements cr4 but this AP does not -- very unlikely - * but be warned! The same applies to the pse feature - * if not equally supported. --macro - * - * NOTE! We have to correct for the fact that we're - * not yet offset PAGE_OFFSET.. + * Specifically, cr4 exists if and only if CPUID exists, + * which in turn exists if and only if EFLAGS.ID exists. */ -#define cr4_bits pa(mmu_cr4_features) - movl cr4_bits,%edx - andl %edx,%edx - jz 6f - movl %cr4,%eax # Turn on paging options (PSE,PAE,..) - orl %edx,%eax + movl $X86_EFLAGS_ID,%ecx + pushl %ecx + popfl + pushfl + popl %eax + pushl $0 + popfl + pushfl + popl %edx + xorl %edx,%eax + testl %ecx,%eax + jz 6f # No ID flag = no CPUID = no CR4 + + movl pa(mmu_cr4_features),%eax movl %eax,%cr4 testb $X86_CR4_PAE, %al # check if PAE is enabled diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index f250431fb505..675a05012449 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -19,24 +19,17 @@ #include <asm/fpu-internal.h> #include <asm/user.h> -#ifdef CONFIG_X86_64 -# include <asm/sigcontext32.h> -# include <asm/user32.h> -#else -# define save_i387_xstate_ia32 save_i387_xstate -# define restore_i387_xstate_ia32 restore_i387_xstate -# define _fpstate_ia32 _fpstate -# define _xstate_ia32 _xstate -# define sig_xstate_ia32_size sig_xstate_size -# define fx_sw_reserved_ia32 fx_sw_reserved -# define user_i387_ia32_struct user_i387_struct -# define user32_fxsr_struct user_fxsr_struct -#endif - /* * Were we in an interrupt that interrupted kernel mode? * - * We can do a kernel_fpu_begin/end() pair *ONLY* if that + * For now, with eagerfpu we will return interrupted kernel FPU + * state as not-idle. TBD: Ideally we can change the return value + * to something like __thread_has_fpu(current). But we need to + * be careful of doing __thread_clear_has_fpu() before saving + * the FPU etc for supporting nested uses etc. For now, take + * the simple route! + * + * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that * pair does nothing at all: the thread must not have fpu (so * that we don't try to save the FPU state), and TS must * be set (so that the clts/stts pair does nothing that is @@ -44,6 +37,9 @@ */ static inline bool interrupted_kernel_fpu_idle(void) { + if (use_eager_fpu()) + return 0; + return !__thread_has_fpu(current) && (read_cr0() & X86_CR0_TS); } @@ -77,29 +73,29 @@ bool irq_fpu_usable(void) } EXPORT_SYMBOL(irq_fpu_usable); -void kernel_fpu_begin(void) +void __kernel_fpu_begin(void) { struct task_struct *me = current; - WARN_ON_ONCE(!irq_fpu_usable()); - preempt_disable(); if (__thread_has_fpu(me)) { __save_init_fpu(me); __thread_clear_has_fpu(me); - /* We do 'stts()' in kernel_fpu_end() */ - } else { + /* We do 'stts()' in __kernel_fpu_end() */ + } else if (!use_eager_fpu()) { this_cpu_write(fpu_owner_task, NULL); clts(); } } -EXPORT_SYMBOL(kernel_fpu_begin); +EXPORT_SYMBOL(__kernel_fpu_begin); -void kernel_fpu_end(void) +void __kernel_fpu_end(void) { - stts(); - preempt_enable(); + if (use_eager_fpu()) + math_state_restore(); + else + stts(); } -EXPORT_SYMBOL(kernel_fpu_end); +EXPORT_SYMBOL(__kernel_fpu_end); void unlazy_fpu(struct task_struct *tsk) { @@ -113,23 +109,15 @@ void unlazy_fpu(struct task_struct *tsk) } EXPORT_SYMBOL(unlazy_fpu); -#ifdef CONFIG_MATH_EMULATION -# define HAVE_HWFP (boot_cpu_data.hard_math) -#else -# define HAVE_HWFP 1 -#endif - -static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; unsigned int xstate_size; EXPORT_SYMBOL_GPL(xstate_size); -unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); static struct i387_fxsave_struct fx_scratch __cpuinitdata; static void __cpuinit mxcsr_feature_mask_init(void) { unsigned long mask = 0; - clts(); if (cpu_has_fxsr) { memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct)); asm volatile("fxsave %0" : : "m" (fx_scratch)); @@ -138,7 +126,6 @@ static void __cpuinit mxcsr_feature_mask_init(void) mask = 0x0000ffbf; } mxcsr_feature_mask &= mask; - stts(); } static void __cpuinit init_thread_xstate(void) @@ -192,9 +179,8 @@ void __cpuinit fpu_init(void) init_thread_xstate(); mxcsr_feature_mask_init(); - /* clean state in init */ - current_thread_info()->status = 0; - clear_used_math(); + xsave_init(); + eager_fpu_init(); } void fpu_finit(struct fpu *fpu) @@ -205,12 +191,7 @@ void fpu_finit(struct fpu *fpu) } if (cpu_has_fxsr) { - struct i387_fxsave_struct *fx = &fpu->state->fxsave; - - memset(fx, 0, xstate_size); - fx->cwd = 0x37f; - if (cpu_has_xmm) - fx->mxcsr = MXCSR_DEFAULT; + fx_finit(&fpu->state->fxsave); } else { struct i387_fsave_struct *fp = &fpu->state->fsave; memset(fp, 0, xstate_size); @@ -454,7 +435,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) * FXSR floating point environment conversions. */ -static void +void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) { struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; @@ -491,8 +472,8 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) memcpy(&to[i], &from[i], sizeof(to[0])); } -static void convert_to_fxsr(struct task_struct *tsk, - const struct user_i387_ia32_struct *env) +void convert_to_fxsr(struct task_struct *tsk, + const struct user_i387_ia32_struct *env) { struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; @@ -589,223 +570,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, } /* - * Signal frame handlers. - */ - -static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) -{ - struct task_struct *tsk = current; - struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave; - - fp->status = fp->swd; - if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) - return -1; - return 1; -} - -static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) -{ - struct task_struct *tsk = current; - struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; - struct user_i387_ia32_struct env; - int err = 0; - - convert_from_fxsr(&env, tsk); - if (__copy_to_user(buf, &env, sizeof(env))) - return -1; - - err |= __put_user(fx->swd, &buf->status); - err |= __put_user(X86_FXSR_MAGIC, &buf->magic); - if (err) - return -1; - - if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size)) - return -1; - return 1; -} - -static int save_i387_xsave(void __user *buf) -{ - struct task_struct *tsk = current; - struct _fpstate_ia32 __user *fx = buf; - int err = 0; - - - sanitize_i387_state(tsk); - - /* - * For legacy compatible, we always set FP/SSE bits in the bit - * vector while saving the state to the user context. - * This will enable us capturing any changes(during sigreturn) to - * the FP/SSE bits by the legacy applications which don't touch - * xstate_bv in the xsave header. - * - * xsave aware applications can change the xstate_bv in the xsave - * header as well as change any contents in the memory layout. - * xrestore as part of sigreturn will capture all the changes. - */ - tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; - - if (save_i387_fxsave(fx) < 0) - return -1; - - err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32, - sizeof(struct _fpx_sw_bytes)); - err |= __put_user(FP_XSTATE_MAGIC2, - (__u32 __user *) (buf + sig_xstate_ia32_size - - FP_XSTATE_MAGIC2_SIZE)); - if (err) - return -1; - - return 1; -} - -int save_i387_xstate_ia32(void __user *buf) -{ - struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf; - struct task_struct *tsk = current; - - if (!used_math()) - return 0; - - if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size)) - return -EACCES; - /* - * This will cause a "finit" to be triggered by the next - * attempted FPU operation by the 'current' process. - */ - clear_used_math(); - - if (!HAVE_HWFP) { - return fpregs_soft_get(current, NULL, - 0, sizeof(struct user_i387_ia32_struct), - NULL, fp) ? -1 : 1; - } - - unlazy_fpu(tsk); - - if (cpu_has_xsave) - return save_i387_xsave(fp); - if (cpu_has_fxsr) - return save_i387_fxsave(fp); - else - return save_i387_fsave(fp); -} - -static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) -{ - struct task_struct *tsk = current; - - return __copy_from_user(&tsk->thread.fpu.state->fsave, buf, - sizeof(struct i387_fsave_struct)); -} - -static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf, - unsigned int size) -{ - struct task_struct *tsk = current; - struct user_i387_ia32_struct env; - int err; - - err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0], - size); - /* mxcsr reserved bits must be masked to zero for security reasons */ - tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; - if (err || __copy_from_user(&env, buf, sizeof(env))) - return 1; - convert_to_fxsr(tsk, &env); - - return 0; -} - -static int restore_i387_xsave(void __user *buf) -{ - struct _fpx_sw_bytes fx_sw_user; - struct _fpstate_ia32 __user *fx_user = - ((struct _fpstate_ia32 __user *) buf); - struct i387_fxsave_struct __user *fx = - (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0]; - struct xsave_hdr_struct *xsave_hdr = - ¤t->thread.fpu.state->xsave.xsave_hdr; - u64 mask; - int err; - - if (check_for_xstate(fx, buf, &fx_sw_user)) - goto fx_only; - - mask = fx_sw_user.xstate_bv; - - err = restore_i387_fxsave(buf, fx_sw_user.xstate_size); - - xsave_hdr->xstate_bv &= pcntxt_mask; - /* - * These bits must be zero. - */ - xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; - - /* - * Init the state that is not present in the memory layout - * and enabled by the OS. - */ - mask = ~(pcntxt_mask & ~mask); - xsave_hdr->xstate_bv &= mask; - - return err; -fx_only: - /* - * Couldn't find the extended state information in the memory - * layout. Restore the FP/SSE and init the other extended state - * enabled by the OS. - */ - xsave_hdr->xstate_bv = XSTATE_FPSSE; - return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct)); -} - -int restore_i387_xstate_ia32(void __user *buf) -{ - int err; - struct task_struct *tsk = current; - struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf; - - if (HAVE_HWFP) - clear_fpu(tsk); - - if (!buf) { - if (used_math()) { - clear_fpu(tsk); - clear_used_math(); - } - - return 0; - } else - if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size)) - return -EACCES; - - if (!used_math()) { - err = init_fpu(tsk); - if (err) - return err; - } - - if (HAVE_HWFP) { - if (cpu_has_xsave) - err = restore_i387_xsave(buf); - else if (cpu_has_fxsr) - err = restore_i387_fxsave(fp, sizeof(struct - i387_fxsave_struct)); - else - err = restore_i387_fsave(fp); - } else { - err = fpregs_soft_set(current, NULL, - 0, sizeof(struct user_i387_ia32_struct), - NULL, fp) != 0; - } - set_used_math(); - - return err; -} - -/* * FPU state for core dumps. * This is only used for a.out dumps now. * It is declared generically using elf_fpregset_t (which is diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 36d1853e91af..9a5c460404dc 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -263,7 +263,7 @@ static void i8259A_shutdown(void) * out of. */ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ } static struct syscore_ops i8259_syscore_ops = { diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d44f7829968e..e4595f105910 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -92,7 +92,8 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, " Rescheduling interrupts\n"); seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - + irq_stats(j)->irq_tlb_count); seq_printf(p, " Function call interrupts\n"); seq_printf(p, "%*s: ", prec, "TLB"); for_each_online_cpu(j) @@ -147,7 +148,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; - sum += irq_stats(cpu)->irq_tlb_count; #endif #ifdef CONFIG_X86_THERMAL_VECTOR sum += irq_stats(cpu)->irq_thermal_count; diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index e2f751efb7b1..57916c0d3cf6 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -541,6 +541,23 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb return 1; } +#ifdef KPROBES_CAN_USE_FTRACE +static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ + regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + __this_cpu_write(current_kprobe, NULL); +} +#endif + /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled throughout this function. @@ -599,6 +616,12 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) } else if (kprobe_running()) { p = __this_cpu_read(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { +#ifdef KPROBES_CAN_USE_FTRACE + if (kprobe_ftrace(p)) { + skip_singlestep(p, regs, kcb); + return 1; + } +#endif setup_singlestep(p, regs, kcb, 0); return 1; } @@ -1052,6 +1075,50 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } +#ifdef KPROBES_CAN_USE_FTRACE +/* Ftrace callback handler for kprobes */ +void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *regs) +{ + struct kprobe *p; + struct kprobe_ctlblk *kcb; + unsigned long flags; + + /* Disable irq for emulating a breakpoint and avoiding preempt */ + local_irq_save(flags); + + p = get_kprobe((kprobe_opcode_t *)ip); + if (unlikely(!p) || kprobe_disabled(p)) + goto end; + + kcb = get_kprobe_ctlblk(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(p); + } else { + /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ + regs->ip = ip + sizeof(kprobe_opcode_t); + + __this_cpu_write(current_kprobe, p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + if (!p->pre_handler || !p->pre_handler(p, regs)) + skip_singlestep(p, regs, kcb); + /* + * If pre_handler returns !0, it sets regs->ip and + * resets current kprobe. + */ + } +end: + local_irq_restore(flags); +} + +int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p) +{ + p->ainsn.insn = NULL; + p->ainsn.boostable = -1; + return 0; +} +#endif + int __init arch_init_kprobes(void) { return arch_init_optprobes(); diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 82746f942cd8..7720ff5a9ee2 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -75,20 +75,113 @@ struct microcode_amd { static struct equiv_cpu_entry *equiv_cpu_table; -/* page-sized ucode patch buffer */ -void *patch; +struct ucode_patch { + struct list_head plist; + void *data; + u32 patch_id; + u16 equiv_cpu; +}; + +static LIST_HEAD(pcache); + +static u16 find_equiv_id(unsigned int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int i = 0; + + if (!equiv_cpu_table) + return 0; + + while (equiv_cpu_table[i].installed_cpu != 0) { + if (uci->cpu_sig.sig == equiv_cpu_table[i].installed_cpu) + return equiv_cpu_table[i].equiv_cpu; + + i++; + } + return 0; +} + +static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu) +{ + int i = 0; + + BUG_ON(!equiv_cpu_table); + + while (equiv_cpu_table[i].equiv_cpu != 0) { + if (equiv_cpu == equiv_cpu_table[i].equiv_cpu) + return equiv_cpu_table[i].installed_cpu; + i++; + } + return 0; +} + +/* + * a small, trivial cache of per-family ucode patches + */ +static struct ucode_patch *cache_find_patch(u16 equiv_cpu) +{ + struct ucode_patch *p; + + list_for_each_entry(p, &pcache, plist) + if (p->equiv_cpu == equiv_cpu) + return p; + return NULL; +} + +static void update_cache(struct ucode_patch *new_patch) +{ + struct ucode_patch *p; + + list_for_each_entry(p, &pcache, plist) { + if (p->equiv_cpu == new_patch->equiv_cpu) { + if (p->patch_id >= new_patch->patch_id) + /* we already have the latest patch */ + return; + + list_replace(&p->plist, &new_patch->plist); + kfree(p->data); + kfree(p); + return; + } + } + /* no patch found, add it */ + list_add_tail(&new_patch->plist, &pcache); +} + +static void free_cache(void) +{ + struct ucode_patch *p, *tmp; + + list_for_each_entry_safe(p, tmp, &pcache, plist) { + __list_del(p->plist.prev, p->plist.next); + kfree(p->data); + kfree(p); + } +} + +static struct ucode_patch *find_patch(unsigned int cpu) +{ + u16 equiv_id; + + equiv_id = find_equiv_id(cpu); + if (!equiv_id) + return NULL; + + return cache_find_patch(equiv_id); +} static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); + csig->sig = cpuid_eax(0x00000001); csig->rev = c->microcode; pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); return 0; } -static unsigned int verify_ucode_size(int cpu, u32 patch_size, +static unsigned int verify_patch_size(int cpu, u32 patch_size, unsigned int size) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -118,95 +211,37 @@ static unsigned int verify_ucode_size(int cpu, u32 patch_size, return patch_size; } -static u16 find_equiv_id(void) +static int apply_microcode_amd(int cpu) { - unsigned int current_cpu_id, i = 0; - - BUG_ON(equiv_cpu_table == NULL); - - current_cpu_id = cpuid_eax(0x00000001); - - while (equiv_cpu_table[i].installed_cpu != 0) { - if (current_cpu_id == equiv_cpu_table[i].installed_cpu) - return equiv_cpu_table[i].equiv_cpu; - - i++; - } - return 0; -} + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct microcode_amd *mc_amd; + struct ucode_cpu_info *uci; + struct ucode_patch *p; + u32 rev, dummy; -/* - * we signal a good patch is found by returning its size > 0 - */ -static int get_matching_microcode(int cpu, const u8 *ucode_ptr, - unsigned int leftover_size, int rev, - unsigned int *current_size) -{ - struct microcode_header_amd *mc_hdr; - unsigned int actual_size, patch_size; - u16 equiv_cpu_id; + BUG_ON(raw_smp_processor_id() != cpu); - /* size of the current patch we're staring at */ - patch_size = *(u32 *)(ucode_ptr + 4); - *current_size = patch_size + SECTION_HDR_SIZE; + uci = ucode_cpu_info + cpu; - equiv_cpu_id = find_equiv_id(); - if (!equiv_cpu_id) + p = find_patch(cpu); + if (!p) return 0; - /* - * let's look at the patch header itself now - */ - mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE); + mc_amd = p->data; + uci->mc = p->data; - if (mc_hdr->processor_rev_id != equiv_cpu_id) - return 0; + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* ucode might be chipset specific -- currently we don't support this */ - if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { - pr_err("CPU%d: chipset specific code not yet supported\n", - cpu); + /* need to apply patch? */ + if (rev >= mc_amd->hdr.patch_id) { + c->microcode = rev; return 0; } - if (mc_hdr->patch_id <= rev) - return 0; - - /* - * now that the header looks sane, verify its size - */ - actual_size = verify_ucode_size(cpu, patch_size, leftover_size); - if (!actual_size) - return 0; - - /* clear the patch buffer */ - memset(patch, 0, PAGE_SIZE); - - /* all looks ok, get the binary patch */ - get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size); - - return actual_size; -} - -static int apply_microcode_amd(int cpu) -{ - u32 rev, dummy; - int cpu_num = raw_smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - struct microcode_amd *mc_amd = uci->mc; - struct cpuinfo_x86 *c = &cpu_data(cpu); - - /* We should bind the task to the CPU */ - BUG_ON(cpu_num != cpu); - - if (mc_amd == NULL) - return 0; - wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); - /* get patch id after patching */ - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* check current patch id and patch's id for match */ + /* verify patch application was successful */ + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); if (rev != mc_amd->hdr.patch_id) { pr_err("CPU%d: update failed for patch_level=0x%08x\n", cpu, mc_amd->hdr.patch_id); @@ -238,7 +273,7 @@ static int install_equiv_cpu_table(const u8 *buf) return -ENOMEM; } - get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size); + memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size); /* add header length */ return size + CONTAINER_HDR_SZ; @@ -250,61 +285,113 @@ static void free_equiv_cpu_table(void) equiv_cpu_table = NULL; } -static enum ucode_state -generic_load_microcode(int cpu, const u8 *data, size_t size) +static void cleanup(void) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct microcode_header_amd *mc_hdr = NULL; - unsigned int mc_size, leftover, current_size = 0; + free_equiv_cpu_table(); + free_cache(); +} + +/* + * We return the current size even if some of the checks failed so that + * we can skip over the next patch. If we return a negative value, we + * signal a grave error like a memory allocation has failed and the + * driver cannot continue functioning normally. In such cases, we tear + * down everything we've used up so far and exit. + */ +static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct microcode_header_amd *mc_hdr; + struct ucode_patch *patch; + unsigned int patch_size, crnt_size, ret; + u32 proc_fam; + u16 proc_id; + + patch_size = *(u32 *)(fw + 4); + crnt_size = patch_size + SECTION_HDR_SIZE; + mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE); + proc_id = mc_hdr->processor_rev_id; + + proc_fam = find_cpu_family_by_equiv_cpu(proc_id); + if (!proc_fam) { + pr_err("No patch family for equiv ID: 0x%04x\n", proc_id); + return crnt_size; + } + + /* check if patch is for the current family */ + proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff); + if (proc_fam != c->x86) + return crnt_size; + + if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { + pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", + mc_hdr->patch_id); + return crnt_size; + } + + ret = verify_patch_size(cpu, patch_size, leftover); + if (!ret) { + pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id); + return crnt_size; + } + + patch = kzalloc(sizeof(*patch), GFP_KERNEL); + if (!patch) { + pr_err("Patch allocation failure.\n"); + return -EINVAL; + } + + patch->data = kzalloc(patch_size, GFP_KERNEL); + if (!patch->data) { + pr_err("Patch data allocation failure.\n"); + kfree(patch); + return -EINVAL; + } + + /* All looks ok, copy patch... */ + memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size); + INIT_LIST_HEAD(&patch->plist); + patch->patch_id = mc_hdr->patch_id; + patch->equiv_cpu = proc_id; + + /* ... and add to cache. */ + update_cache(patch); + + return crnt_size; +} + +static enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size) +{ + enum ucode_state ret = UCODE_ERROR; + unsigned int leftover; + u8 *fw = (u8 *)data; + int crnt_size = 0; int offset; - const u8 *ucode_ptr = data; - void *new_mc = NULL; - unsigned int new_rev = uci->cpu_sig.rev; - enum ucode_state state = UCODE_ERROR; - offset = install_equiv_cpu_table(ucode_ptr); + offset = install_equiv_cpu_table(data); if (offset < 0) { pr_err("failed to create equivalent cpu table\n"); - goto out; + return ret; } - ucode_ptr += offset; + fw += offset; leftover = size - offset; - if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) { + if (*(u32 *)fw != UCODE_UCODE_TYPE) { pr_err("invalid type field in container file section header\n"); - goto free_table; + free_equiv_cpu_table(); + return ret; } while (leftover) { - mc_size = get_matching_microcode(cpu, ucode_ptr, leftover, - new_rev, ¤t_size); - if (mc_size) { - mc_hdr = patch; - new_mc = patch; - new_rev = mc_hdr->patch_id; - goto out_ok; - } - - ucode_ptr += current_size; - leftover -= current_size; - } + crnt_size = verify_and_add_patch(cpu, fw, leftover); + if (crnt_size < 0) + return ret; - if (!new_mc) { - state = UCODE_NFOUND; - goto free_table; + fw += crnt_size; + leftover -= crnt_size; } -out_ok: - uci->mc = new_mc; - state = UCODE_OK; - pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", - cpu, uci->cpu_sig.rev, new_rev); - -free_table: - free_equiv_cpu_table(); - -out: - return state; + return UCODE_OK; } /* @@ -315,7 +402,7 @@ out: * * This legacy file is always smaller than 2K in size. * - * Starting at family 15h they are in family specific firmware files: + * Beginning with family 15h, they are in family-specific firmware files: * * amd-ucode/microcode_amd_fam15h.bin * amd-ucode/microcode_amd_fam16h.bin @@ -323,12 +410,17 @@ out: * * These might be larger than 2K. */ -static enum ucode_state request_microcode_amd(int cpu, struct device *device) +static enum ucode_state request_microcode_amd(int cpu, struct device *device, + bool refresh_fw) { char fw_name[36] = "amd-ucode/microcode_amd.bin"; - const struct firmware *fw; - enum ucode_state ret = UCODE_NFOUND; struct cpuinfo_x86 *c = &cpu_data(cpu); + enum ucode_state ret = UCODE_NFOUND; + const struct firmware *fw; + + /* reload ucode container only on the boot cpu */ + if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index) + return UCODE_OK; if (c->x86 >= 0x15) snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); @@ -344,12 +436,17 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device) goto fw_release; } - ret = generic_load_microcode(cpu, fw->data, fw->size); + /* free old equiv table */ + free_equiv_cpu_table(); + + ret = load_microcode_amd(cpu, fw->data, fw->size); + if (ret != UCODE_OK) + cleanup(); -fw_release: + fw_release: release_firmware(fw); -out: + out: return ret; } @@ -383,14 +480,10 @@ struct microcode_ops * __init init_amd_microcode(void) return NULL; } - patch = (void *)get_zeroed_page(GFP_KERNEL); - if (!patch) - return NULL; - return µcode_amd_ops; } void __exit exit_amd_microcode(void) { - free_page((unsigned long)patch); + cleanup(); } diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 9e5bcf1e2376..3a04b224d0c0 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -279,19 +279,18 @@ static struct platform_device *microcode_pdev; static int reload_for_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; int err = 0; - if (uci->valid) { - enum ucode_state ustate; - - ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); - if (ustate == UCODE_OK) - apply_microcode_on_target(cpu); - else - if (ustate == UCODE_ERROR) - err = -EINVAL; - } + if (!uci->valid) + return err; + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev, true); + if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); + else + if (ustate == UCODE_ERROR) + err = -EINVAL; return err; } @@ -373,18 +372,15 @@ static void microcode_fini_cpu(int cpu) static enum ucode_state microcode_resume_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (!uci->mc) - return UCODE_NFOUND; - pr_debug("CPU%d updated upon resume\n", cpu); - apply_microcode_on_target(cpu); + + if (apply_microcode_on_target(cpu)) + return UCODE_ERROR; return UCODE_OK; } -static enum ucode_state microcode_init_cpu(int cpu) +static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) { enum ucode_state ustate; @@ -395,7 +391,8 @@ static enum ucode_state microcode_init_cpu(int cpu) if (system_state != SYSTEM_RUNNING) return UCODE_NFOUND; - ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev, + refresh_fw); if (ustate == UCODE_OK) { pr_debug("CPU%d updated upon init\n", cpu); @@ -408,14 +405,11 @@ static enum ucode_state microcode_init_cpu(int cpu) static enum ucode_state microcode_update_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - enum ucode_state ustate; if (uci->valid) - ustate = microcode_resume_cpu(cpu); - else - ustate = microcode_init_cpu(cpu); + return microcode_resume_cpu(cpu); - return ustate; + return microcode_init_cpu(cpu, false); } static int mc_device_add(struct device *dev, struct subsys_interface *sif) @@ -431,7 +425,7 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif) if (err) return err; - if (microcode_init_cpu(cpu) == UCODE_ERROR) + if (microcode_init_cpu(cpu, true) == UCODE_ERROR) return -EINVAL; return err; @@ -480,34 +474,41 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) struct device *dev; dev = get_cpu_device(cpu); - switch (action) { + + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: - case CPU_ONLINE_FROZEN: microcode_update_cpu(cpu); - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: pr_debug("CPU%d added\n", cpu); + /* + * "break" is missing on purpose here because we want to fall + * through in order to create the sysfs group. + */ + + case CPU_DOWN_FAILED: if (sysfs_create_group(&dev->kobj, &mc_attr_group)) pr_err("Failed to create group for CPU%d\n", cpu); break; + case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: /* Suspend is in progress, only remove the interface */ sysfs_remove_group(&dev->kobj, &mc_attr_group); pr_debug("CPU%d removed\n", cpu); break; /* + * case CPU_DEAD: + * * When a CPU goes offline, don't free up or invalidate the copy of * the microcode in kernel memory, so that we can reuse it when the * CPU comes back online without unnecessarily requesting the userspace * for it again. */ - case CPU_UP_CANCELED_FROZEN: - /* The CPU refused to come up during a system resume */ - microcode_fini_cpu(cpu); - break; } + + /* The CPU refused to come up during a system resume */ + if (action == CPU_UP_CANCELED_FROZEN) + microcode_fini_cpu(cpu); + return NOTIFY_OK; } diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 0327e2b3c408..3544aed39338 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -405,7 +405,8 @@ static int get_ucode_fw(void *to, const void *from, size_t n) return 0; } -static enum ucode_state request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device, + bool refresh_fw) { char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index eb113693f043..a7c5661f8496 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -257,12 +257,14 @@ static int __init msr_init(void) goto out_chrdev; } msr_class->devnode = msr_devnode; + get_online_cpus(); for_each_online_cpu(i) { err = msr_device_create(i); if (err != 0) goto out_class; } register_hotcpu_notifier(&msr_class_cpu_notifier); + put_online_cpus(); err = 0; goto out; @@ -271,6 +273,7 @@ out_class: i = 0; for_each_online_cpu(i) msr_device_destroy(i); + put_online_cpus(); class_destroy(msr_class); out_chrdev: __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); @@ -281,11 +284,13 @@ out: static void __exit msr_exit(void) { int cpu = 0; + get_online_cpus(); for_each_online_cpu(cpu) msr_device_destroy(cpu); class_destroy(msr_class); __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); unregister_hotcpu_notifier(&msr_class_cpu_notifier); + put_online_cpus(); } module_init(msr_init); diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c new file mode 100644 index 000000000000..e309cc5c276e --- /dev/null +++ b/arch/x86/kernel/perf_regs.c @@ -0,0 +1,105 @@ +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/perf_event.h> +#include <linux/bug.h> +#include <linux/stddef.h> +#include <asm/perf_regs.h> +#include <asm/ptrace.h> + +#ifdef CONFIG_X86_32 +#define PERF_REG_X86_MAX PERF_REG_X86_32_MAX +#else +#define PERF_REG_X86_MAX PERF_REG_X86_64_MAX +#endif + +#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r) + +static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = { + PT_REGS_OFFSET(PERF_REG_X86_AX, ax), + PT_REGS_OFFSET(PERF_REG_X86_BX, bx), + PT_REGS_OFFSET(PERF_REG_X86_CX, cx), + PT_REGS_OFFSET(PERF_REG_X86_DX, dx), + PT_REGS_OFFSET(PERF_REG_X86_SI, si), + PT_REGS_OFFSET(PERF_REG_X86_DI, di), + PT_REGS_OFFSET(PERF_REG_X86_BP, bp), + PT_REGS_OFFSET(PERF_REG_X86_SP, sp), + PT_REGS_OFFSET(PERF_REG_X86_IP, ip), + PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags), + PT_REGS_OFFSET(PERF_REG_X86_CS, cs), + PT_REGS_OFFSET(PERF_REG_X86_SS, ss), +#ifdef CONFIG_X86_32 + PT_REGS_OFFSET(PERF_REG_X86_DS, ds), + PT_REGS_OFFSET(PERF_REG_X86_ES, es), + PT_REGS_OFFSET(PERF_REG_X86_FS, fs), + PT_REGS_OFFSET(PERF_REG_X86_GS, gs), +#else + /* + * The pt_regs struct does not store + * ds, es, fs, gs in 64 bit mode. + */ + (unsigned int) -1, + (unsigned int) -1, + (unsigned int) -1, + (unsigned int) -1, +#endif +#ifdef CONFIG_X86_64 + PT_REGS_OFFSET(PERF_REG_X86_R8, r8), + PT_REGS_OFFSET(PERF_REG_X86_R9, r9), + PT_REGS_OFFSET(PERF_REG_X86_R10, r10), + PT_REGS_OFFSET(PERF_REG_X86_R11, r11), + PT_REGS_OFFSET(PERF_REG_X86_R12, r12), + PT_REGS_OFFSET(PERF_REG_X86_R13, r13), + PT_REGS_OFFSET(PERF_REG_X86_R14, r14), + PT_REGS_OFFSET(PERF_REG_X86_R15, r15), +#endif +}; + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset))) + return 0; + + return regs_get_register(regs, pt_regs_offset[idx]); +} + +#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL)) + +#ifdef CONFIG_X86_32 +int perf_reg_validate(u64 mask) +{ + if (!mask || mask & REG_RESERVED) + return -EINVAL; + + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ + return PERF_SAMPLE_REGS_ABI_32; +} +#else /* CONFIG_X86_64 */ +#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \ + (1ULL << PERF_REG_X86_ES) | \ + (1ULL << PERF_REG_X86_FS) | \ + (1ULL << PERF_REG_X86_GS)) + +int perf_reg_validate(u64 mask) +{ + if (!mask || mask & REG_RESERVED) + return -EINVAL; + + if (mask & REG_NOSUPPORT) + return -EINVAL; + + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ + if (test_tsk_thread_flag(task, TIF_IA32)) + return PERF_SAMPLE_REGS_ABI_32; + else + return PERF_SAMPLE_REGS_ABI_64; +} +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 0bc72e2069e3..d5f15c3f7b25 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -150,7 +150,7 @@ static struct resource *find_oprom(struct pci_dev *pdev) return oprom; } -void *pci_map_biosrom(struct pci_dev *pdev) +void __iomem *pci_map_biosrom(struct pci_dev *pdev) { struct resource *oprom = find_oprom(pdev); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ef6a8456f719..dc3567e083f9 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -66,15 +66,13 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { int ret; - unlazy_fpu(src); - *dst = *src; if (fpu_allocated(&src->thread.fpu)) { memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu)); ret = fpu_alloc(&dst->thread.fpu); if (ret) return ret; - fpu_copy(&dst->thread.fpu, &src->thread.fpu); + fpu_copy(dst, src); } return 0; } @@ -97,16 +95,6 @@ void arch_task_cache_init(void) SLAB_PANIC | SLAB_NOTRACK, NULL); } -static inline void drop_fpu(struct task_struct *tsk) -{ - /* - * Forget coprocessor state.. - */ - tsk->fpu_counter = 0; - clear_fpu(tsk); - clear_used_math(); -} - /* * Free current thread data structures etc.. */ @@ -163,7 +151,13 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - drop_fpu(tsk); + drop_init_fpu(tsk); + /* + * Free the FPU state for non xsave platforms. They get reallocated + * lazily at the first use. + */ + if (!use_eager_fpu()) + free_thread_xstate(tsk); } static void hard_disable_TSC(void) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 516fa186121b..b9ff83c7135b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -190,10 +190,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) regs->cs = __USER_CS; regs->ip = new_ip; regs->sp = new_sp; - /* - * Free the old FP and other extended state - */ - free_thread_xstate(current); } EXPORT_SYMBOL_GPL(start_thread); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 0a980c9d7cb8..8a6d20ce1978 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -232,10 +232,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; - /* - * Free the old FP and other extended state - */ - free_thread_xstate(current); } void diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index c4c6a5c2bf0f..b00b33a18390 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -21,6 +21,7 @@ #include <linux/signal.h> #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> +#include <linux/rcupdate.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -1332,9 +1333,6 @@ static const struct user_regset_view user_x86_64_view = { #define genregs32_get genregs_get #define genregs32_set genregs_set -#define user_i387_ia32_struct user_i387_struct -#define user32_fxsr_struct user_fxsr_struct - #endif /* CONFIG_X86_64 */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION @@ -1463,6 +1461,8 @@ long syscall_trace_enter(struct pt_regs *regs) { long ret = 0; + rcu_user_exit(); + /* * If we stepped into a sysenter/syscall insn, it trapped in * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. @@ -1526,4 +1526,6 @@ void syscall_trace_leave(struct pt_regs *regs) !test_thread_flag(TIF_SYSCALL_EMU); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, step); + + rcu_user_enter(); } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f4b9b80e1b95..4f165479c453 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -961,9 +961,7 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif - x86_init.paging.pagetable_setup_start(swapper_pg_dir); - paging_init(); - x86_init.paging.pagetable_setup_done(swapper_pg_dir); + x86_init.paging.pagetable_init(); if (boot_cpu_data.cpuid_level >= 0) { /* A CPU has %cr4 if and only if it has CPUID */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b280908a376e..b33144c8b309 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -114,11 +114,12 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, regs->orig_ax = -1; /* disable syscall checks */ get_user_ex(buf, &sc->fpstate); - err |= restore_i387_xstate(buf); get_user_ex(*pax, &sc->ax); } get_user_catch(err); + err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); + return err; } @@ -206,35 +207,32 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { /* Default to using normal stack */ + unsigned long math_size = 0; unsigned long sp = regs->sp; + unsigned long buf_fx = 0; int onsigstack = on_sig_stack(sp); -#ifdef CONFIG_X86_64 /* redzone */ - sp -= 128; -#endif /* CONFIG_X86_64 */ + if (config_enabled(CONFIG_X86_64)) + sp -= 128; if (!onsigstack) { /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { if (current->sas_ss_size) sp = current->sas_ss_sp + current->sas_ss_size; - } else { -#ifdef CONFIG_X86_32 - /* This is the legacy signal stack switching. */ - if ((regs->ss & 0xffff) != __USER_DS && - !(ka->sa.sa_flags & SA_RESTORER) && - ka->sa.sa_restorer) + } else if (config_enabled(CONFIG_X86_32) && + (regs->ss & 0xffff) != __USER_DS && + !(ka->sa.sa_flags & SA_RESTORER) && + ka->sa.sa_restorer) { + /* This is the legacy signal stack switching. */ sp = (unsigned long) ka->sa.sa_restorer; -#endif /* CONFIG_X86_32 */ } } if (used_math()) { - sp -= sig_xstate_size; -#ifdef CONFIG_X86_64 - sp = round_down(sp, 64); -#endif /* CONFIG_X86_64 */ + sp = alloc_mathframe(sp, config_enabled(CONFIG_X86_32), + &buf_fx, &math_size); *fpstate = (void __user *)sp; } @@ -247,8 +245,9 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, if (onsigstack && !likely(on_sig_stack(sp))) return (void __user *)-1L; - /* save i387 state */ - if (used_math() && save_i387_xstate(*fpstate) < 0) + /* save i387 and extended state */ + if (used_math() && + save_xstate_sig(*fpstate, (void __user *)buf_fx, math_size) < 0) return (void __user *)-1L; return (void __user *)sp; @@ -357,7 +356,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, put_user_ex(sig, &frame->sig); put_user_ex(&frame->info, &frame->pinfo); put_user_ex(&frame->uc, &frame->puc); - err |= copy_siginfo_to_user(&frame->info, info); /* Create the ucontext. */ if (cpu_has_xsave) @@ -369,9 +367,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, put_user_ex(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); /* Set up to return from userspace. */ restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); @@ -388,6 +383,11 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, */ put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode); } put_user_catch(err); + + err |= copy_siginfo_to_user(&frame->info, info); + err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) return -EFAULT; @@ -436,8 +436,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, put_user_ex(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); /* Set up to return from userspace. If provided, use a stub already in userspace. */ @@ -450,6 +448,9 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, } } put_user_catch(err); + err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + if (err) return -EFAULT; @@ -474,6 +475,75 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, } #endif /* CONFIG_X86_32 */ +static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, + siginfo_t *info, compat_sigset_t *set, + struct pt_regs *regs) +{ +#ifdef CONFIG_X86_X32_ABI + struct rt_sigframe_x32 __user *frame; + void __user *restorer; + int err = 0; + void __user *fpstate = NULL; + + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + if (ka->sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user32(&frame->info, info)) + return -EFAULT; + } + + put_user_try { + /* Create the ucontext. */ + if (cpu_has_xsave) + put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); + else + put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(0, &frame->uc.uc_link); + put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + put_user_ex(sas_ss_flags(regs->sp), + &frame->uc.uc_stack.ss_flags); + put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); + put_user_ex(0, &frame->uc.uc__pad0); + + if (ka->sa.sa_flags & SA_RESTORER) { + restorer = ka->sa.sa_restorer; + } else { + /* could use a vstub here */ + restorer = NULL; + err |= -EFAULT; + } + put_user_ex(restorer, &frame->pretcode); + } put_user_catch(err); + + err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; + + /* We use the x32 calling convention here... */ + regs->di = sig; + regs->si = (unsigned long) &frame->info; + regs->dx = (unsigned long) &frame->uc; + + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); + + regs->cs = __USER_CS; + regs->ss = __USER_DS; +#endif /* CONFIG_X86_X32_ABI */ + + return 0; +} + #ifdef CONFIG_X86_32 /* * Atomically swap in the new signal mask, and wait for a signal. @@ -612,55 +682,22 @@ static int signr_convert(int sig) return sig; } -#ifdef CONFIG_X86_32 - -#define is_ia32 1 -#define ia32_setup_frame __setup_frame -#define ia32_setup_rt_frame __setup_rt_frame - -#else /* !CONFIG_X86_32 */ - -#ifdef CONFIG_IA32_EMULATION -#define is_ia32 test_thread_flag(TIF_IA32) -#else /* !CONFIG_IA32_EMULATION */ -#define is_ia32 0 -#endif /* CONFIG_IA32_EMULATION */ - -#ifdef CONFIG_X86_X32_ABI -#define is_x32 test_thread_flag(TIF_X32) - -static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, - siginfo_t *info, compat_sigset_t *set, - struct pt_regs *regs); -#else /* !CONFIG_X86_X32_ABI */ -#define is_x32 0 -#endif /* CONFIG_X86_X32_ABI */ - -int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct k_sigaction *ka, - sigset_t *set, struct pt_regs *regs); - -#endif /* CONFIG_X86_32 */ - static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, struct pt_regs *regs) { int usig = signr_convert(sig); sigset_t *set = sigmask_to_save(); + compat_sigset_t *cset = (compat_sigset_t *) set; /* Set up the stack frame */ - if (is_ia32) { + if (is_ia32_frame()) { if (ka->sa.sa_flags & SA_SIGINFO) - return ia32_setup_rt_frame(usig, ka, info, set, regs); + return ia32_setup_rt_frame(usig, ka, info, cset, regs); else - return ia32_setup_frame(usig, ka, set, regs); -#ifdef CONFIG_X86_X32_ABI - } else if (is_x32) { - return x32_setup_rt_frame(usig, ka, info, - (compat_sigset_t *)set, regs); -#endif + return ia32_setup_frame(usig, ka, cset, regs); + } else if (is_x32_frame()) { + return x32_setup_rt_frame(usig, ka, info, cset, regs); } else { return __setup_rt_frame(sig, ka, info, set, regs); } @@ -779,6 +816,8 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { + rcu_user_exit(); + #ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) @@ -804,6 +843,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); #endif /* CONFIG_X86_32 */ + + rcu_user_enter(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) @@ -824,72 +865,6 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) } #ifdef CONFIG_X86_X32_ABI -static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, - siginfo_t *info, compat_sigset_t *set, - struct pt_regs *regs) -{ - struct rt_sigframe_x32 __user *frame; - void __user *restorer; - int err = 0; - void __user *fpstate = NULL; - - frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - return -EFAULT; - - if (ka->sa.sa_flags & SA_SIGINFO) { - if (copy_siginfo_to_user32(&frame->info, info)) - return -EFAULT; - } - - put_user_try { - /* Create the ucontext. */ - if (cpu_has_xsave) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - put_user_ex(sas_ss_flags(regs->sp), - &frame->uc.uc_stack.ss_flags); - put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - put_user_ex(0, &frame->uc.uc__pad0); - err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - if (ka->sa.sa_flags & SA_RESTORER) { - restorer = ka->sa.sa_restorer; - } else { - /* could use a vstub here */ - restorer = NULL; - err |= -EFAULT; - } - put_user_ex(restorer, &frame->pretcode); - } put_user_catch(err); - - if (err) - return -EFAULT; - - /* Set up registers for signal handler */ - regs->sp = (unsigned long) frame; - regs->ip = (unsigned long) ka->sa.sa_handler; - - /* We use the x32 calling convention here... */ - regs->di = sig; - regs->si = (unsigned long) &frame->info; - regs->dx = (unsigned long) &frame->uc; - - loadsegment(ds, __USER_DS); - loadsegment(es, __USER_DS); - - regs->cs = __USER_CS; - regs->ss = __USER_DS; - - return 0; -} - asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs) { struct rt_sigframe_x32 __user *frame; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7c5a8c314c02..c80a33bc528b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -665,7 +665,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) unsigned long boot_error = 0; int timeout; - alternatives_smp_switch(1); + /* Just in case we booted with a single CPU. */ + alternatives_enable_smp(); idle->thread.sp = (unsigned long) (((struct pt_regs *) (THREAD_SIZE + task_stack_page(idle))) - 1); @@ -1053,20 +1054,6 @@ out: preempt_enable(); } -void arch_disable_nonboot_cpus_begin(void) -{ - /* - * Avoid the smp alternatives switch during the disable_nonboot_cpus(). - * In the suspend path, we will be back in the SMP mode shortly anyways. - */ - skip_smp_alternatives = true; -} - -void arch_disable_nonboot_cpus_end(void) -{ - skip_smp_alternatives = false; -} - void arch_enable_nonboot_cpus_begin(void) { set_mtrr_aps_delayed_init(); @@ -1256,9 +1243,6 @@ void native_cpu_die(unsigned int cpu) if (per_cpu(cpu_state, cpu) == CPU_DEAD) { if (system_state == SYSTEM_RUNNING) pr_info("CPU %u is now offline\n", cpu); - - if (1 == num_online_cpus()) - alternatives_smp_switch(0); return; } msleep(100); diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index c346d1161488..cd3b2438a980 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -157,6 +157,33 @@ static int enable_single_step(struct task_struct *child) return 1; } +void set_task_blockstep(struct task_struct *task, bool on) +{ + unsigned long debugctl; + + /* + * Ensure irq/preemption can't change debugctl in between. + * Note also that both TIF_BLOCKSTEP and debugctl should + * be changed atomically wrt preemption. + * FIXME: this means that set/clear TIF_BLOCKSTEP is simply + * wrong if task != current, SIGKILL can wakeup the stopped + * tracee and set/clear can play with the running task, this + * can confuse the next __switch_to_xtra(). + */ + local_irq_disable(); + debugctl = get_debugctlmsr(); + if (on) { + debugctl |= DEBUGCTLMSR_BTF; + set_tsk_thread_flag(task, TIF_BLOCKSTEP); + } else { + debugctl &= ~DEBUGCTLMSR_BTF; + clear_tsk_thread_flag(task, TIF_BLOCKSTEP); + } + if (task == current) + update_debugctlmsr(debugctl); + local_irq_enable(); +} + /* * Enable single or block step. */ @@ -169,19 +196,10 @@ static void enable_step(struct task_struct *child, bool block) * So no one should try to use debugger block stepping in a program * that uses user-mode single stepping itself. */ - if (enable_single_step(child) && block) { - unsigned long debugctl = get_debugctlmsr(); - - debugctl |= DEBUGCTLMSR_BTF; - update_debugctlmsr(debugctl); - set_tsk_thread_flag(child, TIF_BLOCKSTEP); - } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); - - debugctl &= ~DEBUGCTLMSR_BTF; - update_debugctlmsr(debugctl); - clear_tsk_thread_flag(child, TIF_BLOCKSTEP); - } + if (enable_single_step(child) && block) + set_task_blockstep(child, true); + else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) + set_task_blockstep(child, false); } void user_enable_single_step(struct task_struct *child) @@ -199,13 +217,8 @@ void user_disable_single_step(struct task_struct *child) /* * Make sure block stepping (BTF) is disabled. */ - if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); - - debugctl &= ~DEBUGCTLMSR_BTF; - update_debugctlmsr(debugctl); - clear_tsk_thread_flag(child, TIF_BLOCKSTEP); - } + if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) + set_task_blockstep(child, false); /* Always clear TIF_SINGLESTEP... */ clear_tsk_thread_flag(child, TIF_SINGLESTEP); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b481341c9369..8276dc6794cc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -55,6 +55,7 @@ #include <asm/i387.h> #include <asm/fpu-internal.h> #include <asm/mce.h> +#include <asm/rcu.h> #include <asm/mach_traps.h> @@ -107,30 +108,45 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) dec_preempt_count(); } -static void __kprobes -do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, - long error_code, siginfo_t *info) +static int __kprobes +do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, + struct pt_regs *regs, long error_code) { - struct task_struct *tsk = current; - #ifdef CONFIG_X86_32 if (regs->flags & X86_VM_MASK) { /* - * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. + * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. * On nmi (interrupt 2), do_trap should not be called. */ - if (trapnr < X86_TRAP_UD) - goto vm86_trap; - goto trap_signal; + if (trapnr < X86_TRAP_UD) { + if (!handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, trapnr)) + return 0; + } + return -1; } #endif + if (!user_mode(regs)) { + if (!fixup_exception(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = trapnr; + die(str, regs, error_code); + } + return 0; + } - if (!user_mode(regs)) - goto kernel_trap; + return -1; +} -#ifdef CONFIG_X86_32 -trap_signal: -#endif +static void __kprobes +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, + long error_code, siginfo_t *info) +{ + struct task_struct *tsk = current; + + + if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) + return; /* * We want error_code and trap_nr set for userspace faults and * kernelspace faults which result in die(), but not @@ -158,33 +174,20 @@ trap_signal: force_sig_info(signr, info, tsk); else force_sig(signr, tsk); - return; - -kernel_trap: - if (!fixup_exception(regs)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = trapnr; - die(str, regs, error_code); - } - return; - -#ifdef CONFIG_X86_32 -vm86_trap: - if (handle_vm86_trap((struct kernel_vm86_regs *) regs, - error_code, trapnr)) - goto trap_signal; - return; -#endif } #define DO_ERROR(trapnr, signr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + exception_enter(regs); \ + if (notify_die(DIE_TRAP, str, regs, error_code, \ + trapnr, signr) == NOTIFY_STOP) { \ + exception_exit(regs); \ return; \ + } \ conditional_sti(regs); \ do_trap(trapnr, signr, str, regs, error_code, NULL); \ + exception_exit(regs); \ } #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ @@ -195,11 +198,15 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ info.si_errno = 0; \ info.si_code = sicode; \ info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + exception_enter(regs); \ + if (notify_die(DIE_TRAP, str, regs, error_code, \ + trapnr, signr) == NOTIFY_STOP) { \ + exception_exit(regs); \ return; \ + } \ conditional_sti(regs); \ do_trap(trapnr, signr, str, regs, error_code, &info); \ + exception_exit(regs); \ } DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, @@ -222,12 +229,14 @@ DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, /* Runs on IST stack */ dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) { + exception_enter(regs); if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - X86_TRAP_SS, SIGBUS) == NOTIFY_STOP) - return; - preempt_conditional_sti(regs); - do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); + X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { + preempt_conditional_sti(regs); + do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); + preempt_conditional_cli(regs); + } + exception_exit(regs); } dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) @@ -235,6 +244,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) static const char str[] = "double fault"; struct task_struct *tsk = current; + exception_enter(regs); /* Return not checked because double check cannot be ignored */ notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -255,16 +265,29 @@ do_general_protection(struct pt_regs *regs, long error_code) { struct task_struct *tsk; + exception_enter(regs); conditional_sti(regs); #ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) - goto gp_in_vm86; + if (regs->flags & X86_VM_MASK) { + local_irq_enable(); + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); + goto exit; + } #endif tsk = current; - if (!user_mode(regs)) - goto gp_in_kernel; + if (!user_mode(regs)) { + if (fixup_exception(regs)) + goto exit; + + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = X86_TRAP_GP; + if (notify_die(DIE_GPF, "general protection fault", regs, error_code, + X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) + die("general protection fault", regs, error_code); + goto exit; + } tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; @@ -279,25 +302,8 @@ do_general_protection(struct pt_regs *regs, long error_code) } force_sig(SIGSEGV, tsk); - return; - -#ifdef CONFIG_X86_32 -gp_in_vm86: - local_irq_enable(); - handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); - return; -#endif - -gp_in_kernel: - if (fixup_exception(regs)) - return; - - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_GP; - if (notify_die(DIE_GPF, "general protection fault", regs, error_code, - X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); +exit: + exception_exit(regs); } /* May run on IST stack. */ @@ -312,15 +318,16 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co ftrace_int3_handler(regs)) return; #endif + exception_enter(regs); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) - return; + goto exit; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) - return; + goto exit; /* * Let others (NMI) know that the debug stack is in use @@ -331,6 +338,8 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); debug_stack_usage_dec(); +exit: + exception_exit(regs); } #ifdef CONFIG_X86_64 @@ -391,6 +400,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) unsigned long dr6; int si_code; + exception_enter(regs); + get_debugreg(dr6, 6); /* Filter out all the reserved bits which are preset to 1 */ @@ -406,7 +417,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) /* Catch kmemcheck conditions first of all! */ if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) - return; + goto exit; /* DR6 may or may not be cleared by the CPU */ set_debugreg(0, 6); @@ -421,7 +432,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, SIGTRAP) == NOTIFY_STOP) - return; + goto exit; /* * Let others (NMI) know that the debug stack is in use @@ -437,7 +448,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) X86_TRAP_DB); preempt_conditional_cli(regs); debug_stack_usage_dec(); - return; + goto exit; } /* @@ -458,7 +469,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) preempt_conditional_cli(regs); debug_stack_usage_dec(); - return; +exit: + exception_exit(regs); } /* @@ -555,14 +567,17 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) #ifdef CONFIG_X86_32 ignore_fpu_irq = 1; #endif - + exception_enter(regs); math_error(regs, error_code, X86_TRAP_MF); + exception_exit(regs); } dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { + exception_enter(regs); math_error(regs, error_code, X86_TRAP_XF); + exception_exit(regs); } dotraplinkage void @@ -613,11 +628,12 @@ void math_state_restore(void) } __thread_fpu_begin(tsk); + /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. */ if (unlikely(restore_fpu_checking(tsk))) { - __thread_fpu_end(tsk); + drop_init_fpu(tsk); force_sig(SIGSEGV, tsk); return; } @@ -629,6 +645,9 @@ EXPORT_SYMBOL_GPL(math_state_restore); dotraplinkage void __kprobes do_device_not_available(struct pt_regs *regs, long error_code) { + exception_enter(regs); + BUG_ON(use_eager_fpu()); + #ifdef CONFIG_MATH_EMULATION if (read_cr0() & X86_CR0_EM) { struct math_emu_info info = { }; @@ -637,6 +656,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) info.regs = regs; math_emulate(&info); + exception_exit(regs); return; } #endif @@ -644,12 +664,15 @@ do_device_not_available(struct pt_regs *regs, long error_code) #ifdef CONFIG_X86_32 conditional_sti(regs); #endif + exception_exit(regs); } #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) { siginfo_t info; + + exception_enter(regs); local_irq_enable(); info.si_signo = SIGILL; @@ -657,10 +680,11 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) info.si_code = ILL_BADSTK; info.si_addr = NULL; if (notify_die(DIE_TRAP, "iret exception", regs, error_code, - X86_TRAP_IRET, SIGILL) == NOTIFY_STOP) - return; - do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, - &info); + X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { + do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, + &info); + } + exception_exit(regs); } #endif diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 36fd42091fa7..9538f00827a9 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -41,6 +41,9 @@ /* Adjust the return address of a call insn */ #define UPROBE_FIX_CALL 0x2 +/* Instruction will modify TF, don't change it */ +#define UPROBE_FIX_SETF 0x4 + #define UPROBE_FIX_RIP_AX 0x8000 #define UPROBE_FIX_RIP_CX 0x4000 @@ -239,6 +242,10 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) insn_get_opcode(insn); /* should be a nop */ switch (OPCODE1(insn)) { + case 0x9d: + /* popf */ + auprobe->fixups |= UPROBE_FIX_SETF; + break; case 0xc3: /* ret/lret */ case 0xcb: case 0xc2: @@ -646,7 +653,7 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) * Skip these instructions as per the currently known x86 ISA. * 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 } */ -bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { int i; @@ -673,3 +680,46 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) } return false; } + +bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + bool ret = __skip_sstep(auprobe, regs); + if (ret && (regs->flags & X86_EFLAGS_TF)) + send_sig(SIGTRAP, current, 0); + return ret; +} + +void arch_uprobe_enable_step(struct arch_uprobe *auprobe) +{ + struct task_struct *task = current; + struct arch_uprobe_task *autask = &task->utask->autask; + struct pt_regs *regs = task_pt_regs(task); + + autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); + + regs->flags |= X86_EFLAGS_TF; + if (test_tsk_thread_flag(task, TIF_BLOCKSTEP)) + set_task_blockstep(task, false); +} + +void arch_uprobe_disable_step(struct arch_uprobe *auprobe) +{ + struct task_struct *task = current; + struct arch_uprobe_task *autask = &task->utask->autask; + bool trapped = (task->utask->state == UTASK_SSTEP_TRAPPED); + struct pt_regs *regs = task_pt_regs(task); + /* + * The state of TIF_BLOCKSTEP was not saved so we can get an extra + * SIGTRAP if we do not clear TF. We need to examine the opcode to + * make it right. + */ + if (unlikely(trapped)) { + if (!autask->saved_tf) + regs->flags &= ~X86_EFLAGS_TF; + } else { + if (autask->saved_tf) + send_sig(SIGTRAP, task, 0); + else if (!(auprobe->fixups & UPROBE_FIX_SETF)) + regs->flags &= ~X86_EFLAGS_TF; + } +} diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 6020f6f5927c..1330dd102950 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -13,9 +13,13 @@ #include <asm/ftrace.h> #ifdef CONFIG_FUNCTION_TRACER -/* mcount is defined in assembly */ +/* mcount and __fentry__ are defined in assembly */ +#ifdef CC_USING_FENTRY +EXPORT_SYMBOL(__fentry__); +#else EXPORT_SYMBOL(mcount); #endif +#endif EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 9f3167e891ef..7a3d075a814a 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -26,7 +26,6 @@ void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } -void __init x86_init_pgd_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } @@ -68,8 +67,7 @@ struct x86_init_ops x86_init __initdata = { }, .paging = { - .pagetable_setup_start = native_pagetable_setup_start, - .pagetable_setup_done = native_pagetable_setup_done, + .pagetable_init = native_pagetable_init, }, .timers = { diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 3d3e20709119..ada87a329edc 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -10,9 +10,7 @@ #include <linux/compat.h> #include <asm/i387.h> #include <asm/fpu-internal.h> -#ifdef CONFIG_IA32_EMULATION -#include <asm/sigcontext32.h> -#endif +#include <asm/sigframe.h> #include <asm/xcr.h> /* @@ -23,13 +21,9 @@ u64 pcntxt_mask; /* * Represents init state for the supported extended state. */ -static struct xsave_struct *init_xstate_buf; - -struct _fpx_sw_bytes fx_sw_reserved; -#ifdef CONFIG_IA32_EMULATION -struct _fpx_sw_bytes fx_sw_reserved_ia32; -#endif +struct xsave_struct *init_xstate_buf; +static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; /* @@ -44,9 +38,9 @@ static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; */ void __sanitize_i387_state(struct task_struct *tsk) { - u64 xstate_bv; - int feature_bit = 0x2; struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; + int feature_bit = 0x2; + u64 xstate_bv; if (!fx) return; @@ -104,213 +98,326 @@ void __sanitize_i387_state(struct task_struct *tsk) * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. */ -int check_for_xstate(struct i387_fxsave_struct __user *buf, - void __user *fpstate, - struct _fpx_sw_bytes *fx_sw_user) +static inline int check_for_xstate(struct i387_fxsave_struct __user *buf, + void __user *fpstate, + struct _fpx_sw_bytes *fx_sw) { int min_xstate_size = sizeof(struct i387_fxsave_struct) + sizeof(struct xsave_hdr_struct); unsigned int magic2; - int err; - err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], - sizeof(struct _fpx_sw_bytes)); - if (err) - return -EFAULT; + if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) + return -1; - /* - * First Magic check failed. - */ - if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) - return -EINVAL; + /* Check for the first magic field and other error scenarios. */ + if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || + fx_sw->xstate_size < min_xstate_size || + fx_sw->xstate_size > xstate_size || + fx_sw->xstate_size > fx_sw->extended_size) + return -1; /* - * Check for error scenarios. - */ - if (fx_sw_user->xstate_size < min_xstate_size || - fx_sw_user->xstate_size > xstate_size || - fx_sw_user->xstate_size > fx_sw_user->extended_size) - return -EINVAL; - - err = __get_user(magic2, (__u32 *) (((void *)fpstate) + - fx_sw_user->extended_size - - FP_XSTATE_MAGIC2_SIZE)); - if (err) - return err; - /* * Check for the presence of second magic word at the end of memory * layout. This detects the case where the user just copied the legacy * fpstate layout with out copying the extended state information * in the memory layout. */ - if (magic2 != FP_XSTATE_MAGIC2) - return -EFAULT; + if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) + || magic2 != FP_XSTATE_MAGIC2) + return -1; return 0; } -#ifdef CONFIG_X86_64 /* * Signal frame handlers. */ - -int save_i387_xstate(void __user *buf) +static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) { - struct task_struct *tsk = current; - int err = 0; - - if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size)) - return -EACCES; + if (use_fxsr()) { + struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; + struct user_i387_ia32_struct env; + struct _fpstate_ia32 __user *fp = buf; - BUG_ON(sig_xstate_size < xstate_size); + convert_from_fxsr(&env, tsk); - if ((unsigned long)buf % 64) - pr_err("%s: bad fpstate %p\n", __func__, buf); - - if (!used_math()) - return 0; - - if (user_has_fpu()) { - if (use_xsave()) - err = xsave_user(buf); - else - err = fxsave_user(buf); - - if (err) - return err; - user_fpu_end(); + if (__copy_to_user(buf, &env, sizeof(env)) || + __put_user(xsave->i387.swd, &fp->status) || + __put_user(X86_FXSR_MAGIC, &fp->magic)) + return -1; } else { - sanitize_i387_state(tsk); - if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, - xstate_size)) + struct i387_fsave_struct __user *fp = buf; + u32 swd; + if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) return -1; } - clear_used_math(); /* trigger finit */ + return 0; +} - if (use_xsave()) { - struct _fpstate __user *fx = buf; - struct _xstate __user *x = buf; - u64 xstate_bv; +static inline int save_xstate_epilog(void __user *buf, int ia32_frame) +{ + struct xsave_struct __user *x = buf; + struct _fpx_sw_bytes *sw_bytes; + u32 xstate_bv; + int err; - err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved, - sizeof(struct _fpx_sw_bytes)); + /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ + sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; + err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); - err |= __put_user(FP_XSTATE_MAGIC2, - (__u32 __user *) (buf + sig_xstate_size - - FP_XSTATE_MAGIC2_SIZE)); + if (!use_xsave()) + return err; - /* - * Read the xstate_bv which we copied (directly from the cpu or - * from the state in task struct) to the user buffers and - * set the FP/SSE bits. - */ - err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv); + err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); - /* - * For legacy compatible, we always set FP/SSE bits in the bit - * vector while saving the state to the user context. This will - * enable us capturing any changes(during sigreturn) to - * the FP/SSE bits by the legacy applications which don't touch - * xstate_bv in the xsave header. - * - * xsave aware apps can change the xstate_bv in the xsave - * header as well as change any contents in the memory layout. - * xrestore as part of sigreturn will capture all the changes. - */ - xstate_bv |= XSTATE_FPSSE; + /* + * Read the xstate_bv which we copied (directly from the cpu or + * from the state in task struct) to the user buffers. + */ + err |= __get_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); - err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv); + /* + * For legacy compatible, we always set FP/SSE bits in the bit + * vector while saving the state to the user context. This will + * enable us capturing any changes(during sigreturn) to + * the FP/SSE bits by the legacy applications which don't touch + * xstate_bv in the xsave header. + * + * xsave aware apps can change the xstate_bv in the xsave + * header as well as change any contents in the memory layout. + * xrestore as part of sigreturn will capture all the changes. + */ + xstate_bv |= XSTATE_FPSSE; - if (err) - return err; - } + err |= __put_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); - return 1; + return err; +} + +static inline int save_user_xstate(struct xsave_struct __user *buf) +{ + int err; + + if (use_xsave()) + err = xsave_user(buf); + else if (use_fxsr()) + err = fxsave_user((struct i387_fxsave_struct __user *) buf); + else + err = fsave_user((struct i387_fsave_struct __user *) buf); + + if (unlikely(err) && __clear_user(buf, xstate_size)) + err = -EFAULT; + return err; } /* - * Restore the extended state if present. Otherwise, restore the FP/SSE - * state. + * Save the fpu, extended register state to the user signal frame. + * + * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save + * state is copied. + * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. + * + * buf == buf_fx for 64-bit frames and 32-bit fsave frame. + * buf != buf_fx for 32-bit frames with fxstate. + * + * If the fpu, extended register state is live, save the state directly + * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, + * copy the thread's fpu state to the user frame starting at 'buf_fx'. + * + * If this is a 32-bit frame with fxstate, put a fsave header before + * the aligned state at 'buf_fx'. + * + * For [f]xsave state, update the SW reserved fields in the [f]xsave frame + * indicating the absence/presence of the extended state to the user. */ -static int restore_user_xstate(void __user *buf) +int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) { - struct _fpx_sw_bytes fx_sw_user; - u64 mask; - int err; + struct xsave_struct *xsave = ¤t->thread.fpu.state->xsave; + struct task_struct *tsk = current; + int ia32_fxstate = (buf != buf_fx); - if (((unsigned long)buf % 64) || - check_for_xstate(buf, buf, &fx_sw_user)) - goto fx_only; + ia32_fxstate &= (config_enabled(CONFIG_X86_32) || + config_enabled(CONFIG_IA32_EMULATION)); - mask = fx_sw_user.xstate_bv; + if (!access_ok(VERIFY_WRITE, buf, size)) + return -EACCES; - /* - * restore the state passed by the user. - */ - err = xrestore_user(buf, mask); - if (err) - return err; + if (!HAVE_HWFP) + return fpregs_soft_get(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), NULL, + (struct _fpstate_ia32 __user *) buf) ? -1 : 1; - /* - * init the state skipped by the user. - */ - mask = pcntxt_mask & ~mask; - if (unlikely(mask)) - xrstor_state(init_xstate_buf, mask); + if (user_has_fpu()) { + /* Save the live register state to the user directly. */ + if (save_user_xstate(buf_fx)) + return -1; + /* Update the thread's fxstate to save the fsave header. */ + if (ia32_fxstate) + fpu_fxsave(&tsk->thread.fpu); + } else { + sanitize_i387_state(tsk); + if (__copy_to_user(buf_fx, xsave, xstate_size)) + return -1; + } + + /* Save the fsave header for the 32-bit frames. */ + if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) + return -1; + + if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) + return -1; + + drop_init_fpu(tsk); /* trigger finit */ return 0; +} -fx_only: - /* - * couldn't find the extended state information in the - * memory layout. Restore just the FP/SSE and init all - * the other extended state. - */ - xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE); - return fxrstor_checking((__force struct i387_fxsave_struct *)buf); +static inline void +sanitize_restored_xstate(struct task_struct *tsk, + struct user_i387_ia32_struct *ia32_env, + u64 xstate_bv, int fx_only) +{ + struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; + struct xsave_hdr_struct *xsave_hdr = &xsave->xsave_hdr; + + if (use_xsave()) { + /* These bits must be zero. */ + xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; + + /* + * Init the state that is not present in the memory + * layout and not enabled by the OS. + */ + if (fx_only) + xsave_hdr->xstate_bv = XSTATE_FPSSE; + else + xsave_hdr->xstate_bv &= (pcntxt_mask & xstate_bv); + } + + if (use_fxsr()) { + /* + * mscsr reserved bits must be masked to zero for security + * reasons. + */ + xsave->i387.mxcsr &= mxcsr_feature_mask; + + convert_to_fxsr(tsk, ia32_env); + } } /* - * This restores directly out of user space. Exceptions are handled. + * Restore the extended state if present. Otherwise, restore the FP/SSE state. */ -int restore_i387_xstate(void __user *buf) +static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) { + if (use_xsave()) { + if ((unsigned long)buf % 64 || fx_only) { + u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE; + xrstor_state(init_xstate_buf, init_bv); + return fxrstor_user(buf); + } else { + u64 init_bv = pcntxt_mask & ~xbv; + if (unlikely(init_bv)) + xrstor_state(init_xstate_buf, init_bv); + return xrestore_user(buf, xbv); + } + } else if (use_fxsr()) { + return fxrstor_user(buf); + } else + return frstor_user(buf); +} + +int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) +{ + int ia32_fxstate = (buf != buf_fx); struct task_struct *tsk = current; - int err = 0; + int state_size = xstate_size; + u64 xstate_bv = 0; + int fx_only = 0; + + ia32_fxstate &= (config_enabled(CONFIG_X86_32) || + config_enabled(CONFIG_IA32_EMULATION)); if (!buf) { - if (used_math()) - goto clear; + drop_init_fpu(tsk); return 0; - } else - if (!access_ok(VERIFY_READ, buf, sig_xstate_size)) - return -EACCES; + } - if (!used_math()) { - err = init_fpu(tsk); - if (err) - return err; + if (!access_ok(VERIFY_READ, buf, size)) + return -EACCES; + + if (!used_math() && init_fpu(tsk)) + return -1; + + if (!HAVE_HWFP) { + return fpregs_soft_set(current, NULL, + 0, sizeof(struct user_i387_ia32_struct), + NULL, buf) != 0; } - user_fpu_begin(); - if (use_xsave()) - err = restore_user_xstate(buf); - else - err = fxrstor_checking((__force struct i387_fxsave_struct *) - buf); - if (unlikely(err)) { + if (use_xsave()) { + struct _fpx_sw_bytes fx_sw_user; + if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { + /* + * Couldn't find the extended state information in the + * memory layout. Restore just the FP/SSE and init all + * the other extended state. + */ + state_size = sizeof(struct i387_fxsave_struct); + fx_only = 1; + } else { + state_size = fx_sw_user.xstate_size; + xstate_bv = fx_sw_user.xstate_bv; + } + } + + if (ia32_fxstate) { + /* + * For 32-bit frames with fxstate, copy the user state to the + * thread's fpu state, reconstruct fxstate from the fsave + * header. Sanitize the copied state etc. + */ + struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; + struct user_i387_ia32_struct env; + int err = 0; + + /* + * Drop the current fpu which clears used_math(). This ensures + * that any context-switch during the copy of the new state, + * avoids the intermediate state from getting restored/saved. + * Thus avoiding the new restored state from getting corrupted. + * We will be ready to restore/save the state only after + * set_used_math() is again set. + */ + drop_fpu(tsk); + + if (__copy_from_user(xsave, buf_fx, state_size) || + __copy_from_user(&env, buf, sizeof(env))) { + err = -1; + } else { + sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); + set_used_math(); + } + + if (use_eager_fpu()) + math_state_restore(); + + return err; + } else { /* - * Encountered an error while doing the restore from the - * user buffer, clear the fpu state. + * For 64-bit frames and 32-bit fsave frames, restore the user + * state to the registers directly (with exceptions handled). */ -clear: - clear_fpu(tsk); - clear_used_math(); + user_fpu_begin(); + if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { + drop_init_fpu(tsk); + return -1; + } } - return err; + + return 0; } -#endif /* * Prepare the SW reserved portion of the fxsave memory layout, indicating @@ -321,31 +428,22 @@ clear: */ static void prepare_fx_sw_frame(void) { - int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) + - FP_XSTATE_MAGIC2_SIZE; + int fsave_header_size = sizeof(struct i387_fsave_struct); + int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; - sig_xstate_size = sizeof(struct _fpstate) + size_extended; - -#ifdef CONFIG_IA32_EMULATION - sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended; -#endif - - memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved)); + if (config_enabled(CONFIG_X86_32)) + size += fsave_header_size; fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; - fx_sw_reserved.extended_size = sig_xstate_size; + fx_sw_reserved.extended_size = size; fx_sw_reserved.xstate_bv = pcntxt_mask; fx_sw_reserved.xstate_size = xstate_size; -#ifdef CONFIG_IA32_EMULATION - memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved, - sizeof(struct _fpx_sw_bytes)); - fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size; -#endif -} -#ifdef CONFIG_X86_64 -unsigned int sig_xstate_size = sizeof(struct _fpstate); -#endif + if (config_enabled(CONFIG_IA32_EMULATION)) { + fx_sw_reserved_ia32 = fx_sw_reserved; + fx_sw_reserved_ia32.extended_size += fsave_header_size; + } +} /* * Enable the extended processor state save/restore feature @@ -384,19 +482,21 @@ static void __init setup_xstate_features(void) /* * setup the xstate image representing the init state */ -static void __init setup_xstate_init(void) +static void __init setup_init_fpu_buf(void) { - setup_xstate_features(); - /* * Setup init_xstate_buf to represent the init state of * all the features managed by the xsave */ init_xstate_buf = alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct)); - init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; + fx_finit(&init_xstate_buf->i387); + + if (!cpu_has_xsave) + return; + + setup_xstate_features(); - clts(); /* * Init all the features state with header_bv being 0x0 */ @@ -406,9 +506,21 @@ static void __init setup_xstate_init(void) * of any feature which is not represented by all zero's. */ xsave_state(init_xstate_buf, -1); - stts(); } +static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; +static int __init eager_fpu_setup(char *s) +{ + if (!strcmp(s, "on")) + eagerfpu = ENABLE; + else if (!strcmp(s, "off")) + eagerfpu = DISABLE; + else if (!strcmp(s, "auto")) + eagerfpu = AUTO; + return 1; +} +__setup("eagerfpu=", eager_fpu_setup); + /* * Enable and initialize the xsave feature. */ @@ -445,8 +557,11 @@ static void __init xstate_enable_boot_cpu(void) update_regset_xstate_info(xstate_size, pcntxt_mask); prepare_fx_sw_frame(); + setup_init_fpu_buf(); - setup_xstate_init(); + /* Auto enable eagerfpu for xsaveopt */ + if (cpu_has_xsaveopt && eagerfpu != DISABLE) + eagerfpu = ENABLE; pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", pcntxt_mask, xstate_size); @@ -471,3 +586,43 @@ void __cpuinit xsave_init(void) next_func = xstate_enable; this_func(); } + +static inline void __init eager_fpu_init_bp(void) +{ + current->thread.fpu.state = + alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct)); + if (!init_xstate_buf) + setup_init_fpu_buf(); +} + +void __cpuinit eager_fpu_init(void) +{ + static __refdata void (*boot_func)(void) = eager_fpu_init_bp; + + clear_used_math(); + current_thread_info()->status = 0; + + if (eagerfpu == ENABLE) + setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); + + if (!cpu_has_eager_fpu) { + stts(); + return; + } + + if (boot_func) { + boot_func(); + boot_func = NULL; + } + + /* + * This is same as math_state_restore(). But use_xsave() is + * not yet patched to use math_state_restore(). + */ + init_fpu(current); + __thread_fpu_begin(current); + if (cpu_has_xsave) + xrstor_state(init_xstate_buf, -1); + else + fxrstor_checking(&init_xstate_buf->i387); +} diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index a71faf727ff3..bca63f04dccb 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -183,95 +183,6 @@ TRACE_EVENT(kvm_apic, #define KVM_ISA_VMX 1 #define KVM_ISA_SVM 2 -#define VMX_EXIT_REASONS \ - { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ - { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ - { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ - { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ - { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ - { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ - { EXIT_REASON_CPUID, "CPUID" }, \ - { EXIT_REASON_HLT, "HLT" }, \ - { EXIT_REASON_INVLPG, "INVLPG" }, \ - { EXIT_REASON_RDPMC, "RDPMC" }, \ - { EXIT_REASON_RDTSC, "RDTSC" }, \ - { EXIT_REASON_VMCALL, "VMCALL" }, \ - { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \ - { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \ - { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \ - { EXIT_REASON_VMPTRST, "VMPTRST" }, \ - { EXIT_REASON_VMREAD, "VMREAD" }, \ - { EXIT_REASON_VMRESUME, "VMRESUME" }, \ - { EXIT_REASON_VMWRITE, "VMWRITE" }, \ - { EXIT_REASON_VMOFF, "VMOFF" }, \ - { EXIT_REASON_VMON, "VMON" }, \ - { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \ - { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \ - { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ - { EXIT_REASON_MSR_READ, "MSR_READ" }, \ - { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ - { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ - { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ - { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \ - { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ - { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ - { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ - { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ - { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ - { EXIT_REASON_WBINVD, "WBINVD" } - -#define SVM_EXIT_REASONS \ - { SVM_EXIT_READ_CR0, "read_cr0" }, \ - { SVM_EXIT_READ_CR3, "read_cr3" }, \ - { SVM_EXIT_READ_CR4, "read_cr4" }, \ - { SVM_EXIT_READ_CR8, "read_cr8" }, \ - { SVM_EXIT_WRITE_CR0, "write_cr0" }, \ - { SVM_EXIT_WRITE_CR3, "write_cr3" }, \ - { SVM_EXIT_WRITE_CR4, "write_cr4" }, \ - { SVM_EXIT_WRITE_CR8, "write_cr8" }, \ - { SVM_EXIT_READ_DR0, "read_dr0" }, \ - { SVM_EXIT_READ_DR1, "read_dr1" }, \ - { SVM_EXIT_READ_DR2, "read_dr2" }, \ - { SVM_EXIT_READ_DR3, "read_dr3" }, \ - { SVM_EXIT_WRITE_DR0, "write_dr0" }, \ - { SVM_EXIT_WRITE_DR1, "write_dr1" }, \ - { SVM_EXIT_WRITE_DR2, "write_dr2" }, \ - { SVM_EXIT_WRITE_DR3, "write_dr3" }, \ - { SVM_EXIT_WRITE_DR5, "write_dr5" }, \ - { SVM_EXIT_WRITE_DR7, "write_dr7" }, \ - { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \ - { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \ - { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ - { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ - { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ - { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ - { SVM_EXIT_INTR, "interrupt" }, \ - { SVM_EXIT_NMI, "nmi" }, \ - { SVM_EXIT_SMI, "smi" }, \ - { SVM_EXIT_INIT, "init" }, \ - { SVM_EXIT_VINTR, "vintr" }, \ - { SVM_EXIT_CPUID, "cpuid" }, \ - { SVM_EXIT_INVD, "invd" }, \ - { SVM_EXIT_HLT, "hlt" }, \ - { SVM_EXIT_INVLPG, "invlpg" }, \ - { SVM_EXIT_INVLPGA, "invlpga" }, \ - { SVM_EXIT_IOIO, "io" }, \ - { SVM_EXIT_MSR, "msr" }, \ - { SVM_EXIT_TASK_SWITCH, "task_switch" }, \ - { SVM_EXIT_SHUTDOWN, "shutdown" }, \ - { SVM_EXIT_VMRUN, "vmrun" }, \ - { SVM_EXIT_VMMCALL, "hypercall" }, \ - { SVM_EXIT_VMLOAD, "vmload" }, \ - { SVM_EXIT_VMSAVE, "vmsave" }, \ - { SVM_EXIT_STGI, "stgi" }, \ - { SVM_EXIT_CLGI, "clgi" }, \ - { SVM_EXIT_SKINIT, "skinit" }, \ - { SVM_EXIT_WBINVD, "wbinvd" }, \ - { SVM_EXIT_MONITOR, "monitor" }, \ - { SVM_EXIT_MWAIT, "mwait" }, \ - { SVM_EXIT_XSETBV, "xsetbv" }, \ - { SVM_EXIT_NPF, "npf" } - /* * Tracepoint for kvm guest exit: */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b1eb202ee76a..851aa7c3b890 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1493,8 +1493,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif - if (user_has_fpu()) - clts(); + /* + * If the FPU is not active (through the host task or + * the guest vcpu), then restore the cr0.TS bit. + */ + if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded) + stts(); load_gdt(&__get_cpu_var(host_gdt)); } @@ -3743,7 +3747,7 @@ static void vmx_set_constant_host_state(void) unsigned long tmpl; struct desc_ptr dt; - vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ + vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ @@ -4543,7 +4547,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_SET_TPR; return 0; } - }; + } break; case 2: /* clts */ handle_clts(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2966c847d489..1f09552572fa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5979,7 +5979,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) */ kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; - unlazy_fpu(current); + __kernel_fpu_begin(); fpu_restore_checking(&vcpu->arch.guest_fpu); trace_kvm_fpu(1); } @@ -5993,6 +5993,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) vcpu->guest_fpu_loaded = 0; fpu_save_init(&vcpu->arch.guest_fpu); + __kernel_fpu_end(); ++vcpu->stat.fpu_reload; kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); trace_kvm_fpu(0); diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 5b2995f4557a..a30ca15be21c 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -17,6 +17,7 @@ #include <asm/cpufeature.h> #include <asm/alternative-asm.h> #include <asm/asm.h> +#include <asm/smap.h> /* * By placing feature2 after feature1 in altinstructions section, we logically @@ -130,6 +131,7 @@ ENDPROC(bad_from_user) */ ENTRY(copy_user_generic_unrolled) CFI_STARTPROC + ASM_STAC cmpl $8,%edx jb 20f /* less then 8 bytes, go to byte copy loop */ ALIGN_DESTINATION @@ -177,6 +179,7 @@ ENTRY(copy_user_generic_unrolled) decl %ecx jnz 21b 23: xor %eax,%eax + ASM_CLAC ret .section .fixup,"ax" @@ -232,6 +235,7 @@ ENDPROC(copy_user_generic_unrolled) */ ENTRY(copy_user_generic_string) CFI_STARTPROC + ASM_STAC andl %edx,%edx jz 4f cmpl $8,%edx @@ -246,6 +250,7 @@ ENTRY(copy_user_generic_string) 3: rep movsb 4: xorl %eax,%eax + ASM_CLAC ret .section .fixup,"ax" @@ -273,12 +278,14 @@ ENDPROC(copy_user_generic_string) */ ENTRY(copy_user_enhanced_fast_string) CFI_STARTPROC + ASM_STAC andl %edx,%edx jz 2f movl %edx,%ecx 1: rep movsb 2: xorl %eax,%eax + ASM_CLAC ret .section .fixup,"ax" diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S index cacddc7163eb..6a4f43c2d9e6 100644 --- a/arch/x86/lib/copy_user_nocache_64.S +++ b/arch/x86/lib/copy_user_nocache_64.S @@ -15,6 +15,7 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/asm.h> +#include <asm/smap.h> .macro ALIGN_DESTINATION #ifdef FIX_ALIGNMENT @@ -48,6 +49,7 @@ */ ENTRY(__copy_user_nocache) CFI_STARTPROC + ASM_STAC cmpl $8,%edx jb 20f /* less then 8 bytes, go to byte copy loop */ ALIGN_DESTINATION @@ -95,6 +97,7 @@ ENTRY(__copy_user_nocache) decl %ecx jnz 21b 23: xorl %eax,%eax + ASM_CLAC sfence ret diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index b33b1fb1e6d4..156b9c804670 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -33,6 +33,7 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/asm.h> +#include <asm/smap.h> .text ENTRY(__get_user_1) @@ -40,8 +41,10 @@ ENTRY(__get_user_1) GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user + ASM_STAC 1: movzb (%_ASM_AX),%edx xor %eax,%eax + ASM_CLAC ret CFI_ENDPROC ENDPROC(__get_user_1) @@ -53,8 +56,10 @@ ENTRY(__get_user_2) GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user + ASM_STAC 2: movzwl -1(%_ASM_AX),%edx xor %eax,%eax + ASM_CLAC ret CFI_ENDPROC ENDPROC(__get_user_2) @@ -66,8 +71,10 @@ ENTRY(__get_user_4) GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user + ASM_STAC 3: mov -3(%_ASM_AX),%edx xor %eax,%eax + ASM_CLAC ret CFI_ENDPROC ENDPROC(__get_user_4) @@ -80,8 +87,10 @@ ENTRY(__get_user_8) GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user + ASM_STAC 4: movq -7(%_ASM_AX),%_ASM_DX xor %eax,%eax + ASM_CLAC ret CFI_ENDPROC ENDPROC(__get_user_8) @@ -91,6 +100,7 @@ bad_get_user: CFI_STARTPROC xor %edx,%edx mov $(-EFAULT),%_ASM_AX + ASM_CLAC ret CFI_ENDPROC END(bad_get_user) diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index b1e6c4b2e8eb..54fcffed28ed 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -18,7 +18,11 @@ * Copyright (C) IBM Corporation, 2002, 2004, 2009 */ +#ifdef __KERNEL__ #include <linux/string.h> +#else +#include <string.h> +#endif #include <asm/inat.h> #include <asm/insn.h> diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 7f951c8f76c4..fc6ba17a7eec 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -15,6 +15,7 @@ #include <asm/thread_info.h> #include <asm/errno.h> #include <asm/asm.h> +#include <asm/smap.h> /* @@ -31,7 +32,8 @@ #define ENTER CFI_STARTPROC ; \ GET_THREAD_INFO(%_ASM_BX) -#define EXIT ret ; \ +#define EXIT ASM_CLAC ; \ + ret ; \ CFI_ENDPROC .text @@ -39,6 +41,7 @@ ENTRY(__put_user_1) ENTER cmp TI_addr_limit(%_ASM_BX),%_ASM_CX jae bad_put_user + ASM_STAC 1: movb %al,(%_ASM_CX) xor %eax,%eax EXIT @@ -50,6 +53,7 @@ ENTRY(__put_user_2) sub $1,%_ASM_BX cmp %_ASM_BX,%_ASM_CX jae bad_put_user + ASM_STAC 2: movw %ax,(%_ASM_CX) xor %eax,%eax EXIT @@ -61,6 +65,7 @@ ENTRY(__put_user_4) sub $3,%_ASM_BX cmp %_ASM_BX,%_ASM_CX jae bad_put_user + ASM_STAC 3: movl %eax,(%_ASM_CX) xor %eax,%eax EXIT @@ -72,6 +77,7 @@ ENTRY(__put_user_8) sub $7,%_ASM_BX cmp %_ASM_BX,%_ASM_CX jae bad_put_user + ASM_STAC 4: mov %_ASM_AX,(%_ASM_CX) #ifdef CONFIG_X86_32 5: movl %edx,4(%_ASM_CX) diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 1781b2f950e2..98f6d6b68f5a 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -42,10 +42,11 @@ do { \ int __d0; \ might_fault(); \ __asm__ __volatile__( \ + ASM_STAC "\n" \ "0: rep; stosl\n" \ " movl %2,%0\n" \ "1: rep; stosb\n" \ - "2:\n" \ + "2: " ASM_CLAC "\n" \ ".section .fixup,\"ax\"\n" \ "3: lea 0(%2,%0,4),%0\n" \ " jmp 2b\n" \ @@ -626,10 +627,12 @@ survive: return n; } #endif + stac(); if (movsl_is_ok(to, from, n)) __copy_user(to, from, n); else n = __copy_user_intel(to, from, n); + clac(); return n; } EXPORT_SYMBOL(__copy_to_user_ll); @@ -637,10 +640,12 @@ EXPORT_SYMBOL(__copy_to_user_ll); unsigned long __copy_from_user_ll(void *to, const void __user *from, unsigned long n) { + stac(); if (movsl_is_ok(to, from, n)) __copy_user_zeroing(to, from, n); else n = __copy_user_zeroing_intel(to, from, n); + clac(); return n; } EXPORT_SYMBOL(__copy_from_user_ll); @@ -648,11 +653,13 @@ EXPORT_SYMBOL(__copy_from_user_ll); unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from, unsigned long n) { + stac(); if (movsl_is_ok(to, from, n)) __copy_user(to, from, n); else n = __copy_user_intel((void __user *)to, (const void *)from, n); + clac(); return n; } EXPORT_SYMBOL(__copy_from_user_ll_nozero); @@ -660,6 +667,7 @@ EXPORT_SYMBOL(__copy_from_user_ll_nozero); unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, unsigned long n) { + stac(); #ifdef CONFIG_X86_INTEL_USERCOPY if (n > 64 && cpu_has_xmm2) n = __copy_user_zeroing_intel_nocache(to, from, n); @@ -668,6 +676,7 @@ unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, #else __copy_user_zeroing(to, from, n); #endif + clac(); return n; } EXPORT_SYMBOL(__copy_from_user_ll_nocache); @@ -675,6 +684,7 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache); unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, unsigned long n) { + stac(); #ifdef CONFIG_X86_INTEL_USERCOPY if (n > 64 && cpu_has_xmm2) n = __copy_user_intel_nocache(to, from, n); @@ -683,6 +693,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr #else __copy_user(to, from, n); #endif + clac(); return n; } EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index e5b130bc2d0e..05928aae911e 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -18,6 +18,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size) might_fault(); /* no memory constraint because it doesn't change any memory gcc knows about */ + stac(); asm volatile( " testq %[size8],%[size8]\n" " jz 4f\n" @@ -40,6 +41,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size) : [size8] "=&c"(size), [dst] "=&D" (__d0) : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr), [zero] "r" (0UL), [eight] "r" (8UL)); + clac(); return size; } EXPORT_SYMBOL(__clear_user); @@ -82,5 +84,6 @@ copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest) for (c = 0, zero_len = len; zerorest && zero_len; --zero_len) if (__put_user_nocheck(c, to++, sizeof(char))) break; + clac(); return len; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 76dcd9d8e0bc..a530b230e7d7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -18,6 +18,7 @@ #include <asm/pgalloc.h> /* pgd_*(), ... */ #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ #include <asm/fixmap.h> /* VSYSCALL_START */ +#include <asm/rcu.h> /* exception_enter(), ... */ /* * Page fault error code bits: @@ -995,13 +996,24 @@ static int fault_in_kernel_space(unsigned long address) return address >= TASK_SIZE_MAX; } +static inline bool smap_violation(int error_code, struct pt_regs *regs) +{ + if (error_code & PF_USER) + return false; + + if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC)) + return false; + + return true; +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. */ -dotraplinkage void __kprobes -do_page_fault(struct pt_regs *regs, unsigned long error_code) +static void __kprobes +__do_page_fault(struct pt_regs *regs, unsigned long error_code) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -1088,6 +1100,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); + if (static_cpu_has(X86_FEATURE_SMAP)) { + if (unlikely(smap_violation(error_code, regs))) { + bad_area_nosemaphore(regs, error_code, address); + return; + } + } + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* @@ -1209,3 +1228,11 @@ good_area: up_read(&mm->mmap_sem); } + +dotraplinkage void __kprobes +do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + exception_enter(regs); + __do_page_fault(regs, error_code); + exception_exit(regs); +} diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 575d86f85ce4..11a58001b4ce 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -445,10 +445,10 @@ static inline void permanent_kmaps_init(pgd_t *pgd_base) } #endif /* CONFIG_HIGHMEM */ -void __init native_pagetable_setup_start(pgd_t *base) +void __init native_pagetable_init(void) { unsigned long pfn, va; - pgd_t *pgd; + pgd_t *pgd, *base = swapper_pg_dir; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -475,10 +475,7 @@ void __init native_pagetable_setup_start(pgd_t *base) pte_clear(NULL, va, pte); } paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); -} - -void __init native_pagetable_setup_done(pgd_t *base) -{ + paging_init(); } /* @@ -493,7 +490,7 @@ void __init native_pagetable_setup_done(pgd_t *base) * If we're booting paravirtualized under a hypervisor, then there are * more options: we may already be running PAE, and the pagetable may * or may not be based in swapper_pg_dir. In any case, - * paravirt_pagetable_setup_start() will set up swapper_pg_dir + * paravirt_pagetable_init() will set up swapper_pg_dir * appropriately for the rest of the initialization to work. * * In general, pagetable_init() assumes that the pagetable may already @@ -712,7 +709,7 @@ static void __init test_wp_bit(void) "Checking if this processor honours the WP bit even in supervisor mode..."); /* Any page-aligned address will do, the test is non-destructive */ - __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); + __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO); boot_cpu_data.wp_works_ok = do_test_wp_bit(); clear_fixmap(FIX_WP_TEST); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 613cd83e8c0c..0777f042e400 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -98,6 +98,8 @@ static void flush_tlb_func(void *info) { struct flush_tlb_info *f = info; + inc_irq_stat(irq_tlb_count); + if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; @@ -320,7 +322,7 @@ static ssize_t tlbflush_write_file(struct file *file, if (kstrtos8(buf, 0, &shift)) return -EINVAL; - if (shift > 64) + if (shift < -1 || shift >= BITS_PER_LONG) return -EINVAL; tlb_flushall_shift = shift; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 33643a8bcbbb..520d2bd0b9c5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -280,6 +280,31 @@ void bpf_jit_compile(struct sk_filter *fp) } EMIT4(0x31, 0xd2, 0xf7, 0xf3); /* xor %edx,%edx; div %ebx */ break; + case BPF_S_ALU_MOD_X: /* A %= X; */ + seen |= SEEN_XREG; + EMIT2(0x85, 0xdb); /* test %ebx,%ebx */ + if (pc_ret0 > 0) { + /* addrs[pc_ret0 - 1] is start address of target + * (addrs[i] - 6) is the address following this jmp + * ("xor %edx,%edx; div %ebx;mov %edx,%eax" being 6 bytes long) + */ + EMIT_COND_JMP(X86_JE, addrs[pc_ret0 - 1] - + (addrs[i] - 6)); + } else { + EMIT_COND_JMP(X86_JNE, 2 + 5); + CLEAR_A(); + EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 6)); /* jmp .+off32 */ + } + EMIT2(0x31, 0xd2); /* xor %edx,%edx */ + EMIT2(0xf7, 0xf3); /* div %ebx */ + EMIT2(0x89, 0xd0); /* mov %edx,%eax */ + break; + case BPF_S_ALU_MOD_K: /* A %= K; */ + EMIT2(0x31, 0xd2); /* xor %edx,%edx */ + EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */ + EMIT2(0xf7, 0xf1); /* div %ecx */ + EMIT2(0x89, 0xd0); /* mov %edx,%eax */ + break; case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */ EMIT3(0x48, 0x69, 0xc0); /* imul imm32,%rax,%rax */ EMIT(K, 4); @@ -310,9 +335,18 @@ void bpf_jit_compile(struct sk_filter *fp) EMIT1_off32(0x0d, K); /* or imm32,%eax */ break; case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */ + case BPF_S_ALU_XOR_X: seen |= SEEN_XREG; EMIT2(0x31, 0xd8); /* xor %ebx,%eax */ break; + case BPF_S_ALU_XOR_K: /* A ^= K; */ + if (K == 0) + break; + if (is_imm8(K)) + EMIT3(0x83, 0xf0, K); /* xor imm8,%eax */ + else + EMIT1_off32(0x35, K); /* xor imm32,%eax */ + break; case BPF_S_ALU_LSH_X: /* A <<= X; */ seen |= SEEN_XREG; EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */ diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 505acdd6d600..192397c98606 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -305,7 +305,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data) res->flags = flags; res->start = start; res->end = end; - res->child = NULL; if (!pci_use_crs) { dev_printk(KERN_DEBUG, &info->bridge->dev, @@ -434,7 +433,7 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, size = sizeof(*info->res) * info->res_num; info->res_num = 0; - info->res = kmalloc(size, GFP_KERNEL); + info->res = kzalloc(size, GFP_KERNEL); if (!info->res) return; diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 937bcece7006..704b9ec043d7 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -585,7 +585,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header) while (i >= sizeof(struct acpi_mcfg_allocation)) { entries++; i -= sizeof(struct acpi_mcfg_allocation); - }; + } if (entries == 0) { pr_err(PREFIX "MMCONFIG has no entries\n"); return -ENODEV; diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c index 6f2f8eeed171..3e6d2a6db866 100644 --- a/arch/x86/pci/visws.c +++ b/arch/x86/pci/visws.c @@ -62,11 +62,6 @@ out: return irq; } -void __init pcibios_update_irq(struct pci_dev *dev, int irq) -{ - pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); -} - int __init pci_visws_init(void) { pcibios_enable_irq = &pci_visws_enable_irq; diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile index 73b8be0f3675..6db1cc4c7534 100644 --- a/arch/x86/platform/efi/Makefile +++ b/arch/x86/platform/efi/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o +obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c new file mode 100644 index 000000000000..f6a0c1b8e518 --- /dev/null +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -0,0 +1,76 @@ +/* + * Copyright 2012 Intel Corporation + * Author: Josh Triplett <josh@joshtriplett.org> + * + * Based on the bgrt driver: + * Copyright 2012 Red Hat, Inc <mjg@redhat.com> + * Author: Matthew Garrett + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/kernel.h> +#include <linux/acpi.h> +#include <linux/efi.h> +#include <linux/efi-bgrt.h> + +struct acpi_table_bgrt *bgrt_tab; +void *bgrt_image; +size_t bgrt_image_size; + +struct bmp_header { + u16 id; + u32 size; +} __packed; + +void efi_bgrt_init(void) +{ + acpi_status status; + void __iomem *image; + bool ioremapped = false; + struct bmp_header bmp_header; + + if (acpi_disabled) + return; + + status = acpi_get_table("BGRT", 0, + (struct acpi_table_header **)&bgrt_tab); + if (ACPI_FAILURE(status)) + return; + + if (bgrt_tab->version != 1) + return; + if (bgrt_tab->image_type != 0 || !bgrt_tab->image_address) + return; + + image = efi_lookup_mapped_addr(bgrt_tab->image_address); + if (!image) { + image = ioremap(bgrt_tab->image_address, sizeof(bmp_header)); + ioremapped = true; + if (!image) + return; + } + + memcpy_fromio(&bmp_header, image, sizeof(bmp_header)); + if (ioremapped) + iounmap(image); + bgrt_image_size = bmp_header.size; + + bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL); + if (!bgrt_image) + return; + + if (ioremapped) { + image = ioremap(bgrt_tab->image_address, bmp_header.size); + if (!image) { + kfree(bgrt_image); + bgrt_image = NULL; + return; + } + } + + memcpy_fromio(bgrt_image, image, bgrt_image_size); + if (ioremapped) + iounmap(image); +} diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 92660edaa1e7..aded2a91162a 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -31,6 +31,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/efi.h> +#include <linux/efi-bgrt.h> #include <linux/export.h> #include <linux/bootmem.h> #include <linux/memblock.h> @@ -419,10 +420,21 @@ void __init efi_reserve_boot_services(void) } } -static void __init efi_free_boot_services(void) +static void __init efi_unmap_memmap(void) +{ + if (memmap.map) { + early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); + memmap.map = NULL; + } +} + +void __init efi_free_boot_services(void) { void *p; + if (!efi_native) + return; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { efi_memory_desc_t *md = p; unsigned long long start = md->phys_addr; @@ -438,6 +450,8 @@ static void __init efi_free_boot_services(void) free_bootmem_late(start, size); } + + efi_unmap_memmap(); } static int __init efi_systab_init(void *phys) @@ -732,6 +746,11 @@ void __init efi_init(void) #endif } +void __init efi_late_init(void) +{ + efi_bgrt_init(); +} + void __init efi_set_executable(efi_memory_desc_t *md, bool executable) { u64 addr, npages; @@ -764,6 +783,34 @@ static void __init runtime_code_page_mkexec(void) } /* + * We can't ioremap data in EFI boot services RAM, because we've already mapped + * it as RAM. So, look it up in the existing EFI memory map instead. Only + * callable after efi_enter_virtual_mode and before efi_free_boot_services. + */ +void __iomem *efi_lookup_mapped_addr(u64 phys_addr) +{ + void *p; + if (WARN_ON(!memmap.map)) + return NULL; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + efi_memory_desc_t *md = p; + u64 size = md->num_pages << EFI_PAGE_SHIFT; + u64 end = md->phys_addr + size; + if (!(md->attribute & EFI_MEMORY_RUNTIME) && + md->type != EFI_BOOT_SERVICES_CODE && + md->type != EFI_BOOT_SERVICES_DATA) + continue; + if (!md->virt_addr) + continue; + if (phys_addr >= md->phys_addr && phys_addr < end) { + phys_addr += md->virt_addr - md->phys_addr; + return (__force void __iomem *)(unsigned long)phys_addr; + } + } + return NULL; +} + +/* * This function will switch the EFI runtime services to virtual mode. * Essentially, look through the EFI memmap and map every region that * has the runtime attribute bit set in its memory descriptor and update @@ -787,8 +834,10 @@ void __init efi_enter_virtual_mode(void) * non-native EFI */ - if (!efi_native) - goto out; + if (!efi_native) { + efi_unmap_memmap(); + return; + } /* Merge contiguous regions of the same type and attribute */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { @@ -878,18 +927,12 @@ void __init efi_enter_virtual_mode(void) } /* - * Thankfully, it does seem that no runtime services other than - * SetVirtualAddressMap() will touch boot services code, so we can - * get rid of it all at this point - */ - efi_free_boot_services(); - - /* * Now that EFI is in virtual mode, update the function * pointers in the runtime service table to the new virtual addresses. * * Call EFI services through wrapper functions. */ + efi.runtime_version = efi_systab.fw_revision; efi.get_time = virt_efi_get_time; efi.set_time = virt_efi_set_time; efi.get_wakeup_time = virt_efi_get_wakeup_time; @@ -906,9 +949,6 @@ void __init efi_enter_virtual_mode(void) if (__supported_pte_mask & _PAGE_NX) runtime_code_page_mkexec(); -out: - early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); - memmap.map = NULL; kfree(new_memmap); } diff --git a/arch/x86/realmode/rm/wakeup.h b/arch/x86/realmode/rm/wakeup.h index 9317e0042f24..7dd86a419f5d 100644 --- a/arch/x86/realmode/rm/wakeup.h +++ b/arch/x86/realmode/rm/wakeup.h @@ -36,5 +36,7 @@ extern struct wakeup_header wakeup_header; /* Wakeup behavior bits */ #define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0 +#define WAKEUP_BEHAVIOR_RESTORE_CR4 1 +#define WAKEUP_BEHAVIOR_RESTORE_EFER 2 #endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ diff --git a/arch/x86/realmode/rm/wakeup_asm.S b/arch/x86/realmode/rm/wakeup_asm.S index 8905166b0bbb..e56479e58053 100644 --- a/arch/x86/realmode/rm/wakeup_asm.S +++ b/arch/x86/realmode/rm/wakeup_asm.S @@ -74,9 +74,18 @@ ENTRY(wakeup_start) lidtl wakeup_idt - /* Clear the EFLAGS */ - pushl $0 + /* Clear the EFLAGS but remember if we have EFLAGS.ID */ + movl $X86_EFLAGS_ID, %ecx + pushl %ecx popfl + pushfl + popl %edi + pushl $0 + popfl + pushfl + popl %edx + xorl %edx, %edi + andl %ecx, %edi /* %edi is zero iff CPUID & %cr4 are missing */ /* Check header signature... */ movl signature, %eax @@ -93,8 +102,8 @@ ENTRY(wakeup_start) /* Restore MISC_ENABLE before entering protected mode, in case BIOS decided to clear XD_DISABLE during S3. */ - movl pmode_behavior, %eax - btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax + movl pmode_behavior, %edi + btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %edi jnc 1f movl pmode_misc_en, %eax @@ -110,15 +119,15 @@ ENTRY(wakeup_start) movl pmode_cr3, %eax movl %eax, %cr3 - movl pmode_cr4, %ecx - jecxz 1f - movl %ecx, %cr4 + btl $WAKEUP_BEHAVIOR_RESTORE_CR4, %edi + jz 1f + movl pmode_cr4, %eax + movl %eax, %cr4 1: + btl $WAKEUP_BEHAVIOR_RESTORE_EFER, %edi + jz 1f movl pmode_efer, %eax movl pmode_efer + 4, %edx - movl %eax, %ecx - orl %edx, %ecx - jz 1f movl $MSR_EFER, %ecx wrmsr 1: diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile index 3236aebc828d..f325af26107c 100644 --- a/arch/x86/syscalls/Makefile +++ b/arch/x86/syscalls/Makefile @@ -1,7 +1,9 @@ out := $(obj)/../include/generated/asm +uapi := $(obj)/../include/generated/uapi/asm # Create output directory if not already present -_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') +_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ + $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') syscall32 := $(srctree)/$(src)/syscall_32.tbl syscall64 := $(srctree)/$(src)/syscall_64.tbl @@ -18,7 +20,7 @@ quiet_cmd_systbl = SYSTBL $@ cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ syshdr_abi_unistd_32 := i386 -$(out)/unistd_32.h: $(syscall32) $(syshdr) +$(uapi)/unistd_32.h: $(syscall32) $(syshdr) $(call if_changed,syshdr) syshdr_abi_unistd_32_ia32 := i386 @@ -28,11 +30,11 @@ $(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) syshdr_abi_unistd_x32 := common,x32 syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT -$(out)/unistd_x32.h: $(syscall64) $(syshdr) +$(uapi)/unistd_x32.h: $(syscall64) $(syshdr) $(call if_changed,syshdr) syshdr_abi_unistd_64 := common,64 -$(out)/unistd_64.h: $(syscall64) $(syshdr) +$(uapi)/unistd_64.h: $(syscall64) $(syshdr) $(call if_changed,syshdr) syshdr_abi_unistd_64_x32 := x32 @@ -45,11 +47,12 @@ $(out)/syscalls_32.h: $(syscall32) $(systbl) $(out)/syscalls_64.h: $(syscall64) $(systbl) $(call if_changed,systbl) -syshdr-y += unistd_32.h unistd_64.h unistd_x32.h +uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h syshdr-y += syscalls_32.h syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h syshdr-$(CONFIG_X86_64) += syscalls_64.h -targets += $(syshdr-y) +targets += $(uapisyshdr-y) $(syshdr-y) -all: $(addprefix $(out)/,$(targets)) +all: $(addprefix $(uapi)/,$(uapisyshdr-y)) +all: $(addprefix $(out)/,$(syshdr-y)) diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index 733057b435b0..bae601f900ef 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -28,7 +28,7 @@ posttest: $(obj)/test_get_len vmlinux $(obj)/insn_sanity hostprogs-y += test_get_len insn_sanity # -I needed for generated C source and C source which in the kernel tree. -HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/ +HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/uapi/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/uapi/ HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/ diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index ec57bd3818a4..7005ced5d1ad 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -6,8 +6,9 @@ #include <xen/xen.h> #include <xen/interface/physdev.h> +#include "xen-ops.h" -unsigned int xen_io_apic_read(unsigned apic, unsigned reg) +static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) { struct physdev_apic apic_op; int ret; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1fbe75a95f15..2d932c351f91 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -80,6 +80,8 @@ #include "smp.h" #include "multicalls.h" +#include <xen/events.h> + EXPORT_SYMBOL_GPL(hypercall_page); DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); @@ -1288,7 +1290,6 @@ asmlinkage void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; int rc; - pgd_t *pgd; if (!xen_start_info) return; @@ -1380,8 +1381,6 @@ asmlinkage void __init xen_start_kernel(void) acpi_numa = -1; #endif - pgd = (pgd_t *)xen_start_info->pt_base; - /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; @@ -1390,7 +1389,7 @@ asmlinkage void __init xen_start_kernel(void) early_boot_irqs_disabled = true; xen_raw_console_write("mapping kernel into physical memory\n"); - pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); + xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); /* Allocate and initialize top and mid mfn levels for p2m structure */ xen_build_mfn_list_list(); @@ -1441,11 +1440,19 @@ asmlinkage void __init xen_start_kernel(void) const struct dom0_vga_console_info *info = (void *)((char *)xen_start_info + xen_start_info->console.dom0.info_off); + struct xen_platform_op op = { + .cmd = XENPF_firmware_info, + .interface_version = XENPF_INTERFACE_VERSION, + .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS, + }; xen_init_vga(info, xen_start_info->console.dom0.info_size); xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; + if (HYPERVISOR_dom0_op(&op) == 0) + boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; + xen_init_apic(); /* Make sure ACS will be enabled */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5141d808e751..5a16824cc2b3 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -84,6 +84,7 @@ */ DEFINE_SPINLOCK(xen_reservation_lock); +#ifdef CONFIG_X86_32 /* * Identity map, in addition to plain kernel map. This needs to be * large enough to allocate page table pages to allocate the rest. @@ -91,7 +92,7 @@ DEFINE_SPINLOCK(xen_reservation_lock); */ #define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); - +#endif #ifdef CONFIG_X86_64 /* l3 pud for userspace vsyscall mapping */ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; @@ -1174,9 +1175,7 @@ static void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } -static void __init xen_pagetable_setup_start(pgd_t *base) -{ -} +static void xen_post_allocator_init(void); static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) { @@ -1192,14 +1191,87 @@ static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) } } -static void xen_post_allocator_init(void); +#ifdef CONFIG_X86_64 +static void __init xen_cleanhighmap(unsigned long vaddr, + unsigned long vaddr_end) +{ + unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; + pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr); -static void __init xen_pagetable_setup_done(pgd_t *base) + /* NOTE: The loop is more greedy than the cleanup_highmap variant. + * We include the PMD passed in on _both_ boundaries. */ + for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE)); + pmd++, vaddr += PMD_SIZE) { + if (pmd_none(*pmd)) + continue; + if (vaddr < (unsigned long) _text || vaddr > kernel_end) + set_pmd(pmd, __pmd(0)); + } + /* In case we did something silly, we should crash in this function + * instead of somewhere later and be confusing. */ + xen_mc_flush(); +} +#endif +static void __init xen_pagetable_init(void) { +#ifdef CONFIG_X86_64 + unsigned long size; + unsigned long addr; +#endif + paging_init(); xen_setup_shared_info(); +#ifdef CONFIG_X86_64 + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long new_mfn_list; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + + /* On 32-bit, we get zero so this never gets executed. */ + new_mfn_list = xen_revector_p2m_tree(); + if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) { + /* using __ka address and sticking INVALID_P2M_ENTRY! */ + memset((void *)xen_start_info->mfn_list, 0xff, size); + + /* We should be in __ka space. */ + BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); + addr = xen_start_info->mfn_list; + /* We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or xen_start_info->shared_info + * they are in going to crash. Fortunatly we have already revectored + * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ + size = roundup(size, PMD_SIZE); + xen_cleanhighmap(addr, addr + size); + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + memblock_free(__pa(xen_start_info->mfn_list), size); + /* And revector! Bye bye old array */ + xen_start_info->mfn_list = new_mfn_list; + } else + goto skip; + } + /* At this stage, cleanup_highmap has already cleaned __ka space + * from _brk_limit way up to the max_pfn_mapped (which is the end of + * the ramdisk). We continue on, erasing PMD entries that point to page + * tables - do note that they are accessible at this stage via __va. + * For good measure we also round up to the PMD - which means that if + * anybody is using __ka address to the initial boot-stack - and try + * to use it - they are going to crash. The xen_start_info has been + * taken care of already in xen_setup_kernel_pagetable. */ + addr = xen_start_info->pt_base; + size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); + + xen_cleanhighmap(addr, addr + size); + xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); +#ifdef DEBUG + /* This is superflous and is not neccessary, but you know what + * lets do it. The MODULES_VADDR -> MODULES_END should be clear of + * anything at this stage. */ + xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); +#endif +skip: +#endif xen_post_allocator_init(); } - static void xen_write_cr2(unsigned long cr2) { this_cpu_read(xen_vcpu)->arch.cr2 = cr2; @@ -1655,7 +1727,7 @@ static void set_page_prot(void *addr, pgprot_t prot) if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) BUG(); } - +#ifdef CONFIG_X86_32 static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) { unsigned pmdidx, pteidx; @@ -1706,7 +1778,7 @@ static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) set_page_prot(pmd, PAGE_KERNEL_RO); } - +#endif void __init xen_setup_machphys_mapping(void) { struct xen_machphys_mapping mapping; @@ -1734,7 +1806,20 @@ static void convert_pfn_mfn(void *v) for (i = 0; i < PTRS_PER_PTE; i++) pte[i] = xen_make_pte(pte[i].pte); } - +static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, + unsigned long addr) +{ + if (*pt_base == PFN_DOWN(__pa(addr))) { + set_page_prot((void *)addr, PAGE_KERNEL); + clear_page((void *)addr); + (*pt_base)++; + } + if (*pt_end == PFN_DOWN(__pa(addr))) { + set_page_prot((void *)addr, PAGE_KERNEL); + clear_page((void *)addr); + (*pt_end)--; + } +} /* * Set up the initial kernel pagetable. * @@ -1746,11 +1831,13 @@ static void convert_pfn_mfn(void *v) * of the physical mapping once some sort of allocator has been set * up. */ -pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, - unsigned long max_pfn) +void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pud_t *l3; pmd_t *l2; + unsigned long addr[3]; + unsigned long pt_base, pt_end; + unsigned i; /* max_pfn_mapped is the last pfn mapped in the initial memory * mappings. Considering that on Xen after the kernel mappings we @@ -1758,32 +1845,53 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, * set max_pfn_mapped to the last real pfn mapped. */ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); + pt_end = pt_base + xen_start_info->nr_pt_frames; + /* Zap identity mapping */ init_level4_pgt[0] = __pgd(0); /* Pre-constructed entries are in pfn, so convert to mfn */ + /* L4[272] -> level3_ident_pgt + * L4[511] -> level3_kernel_pgt */ convert_pfn_mfn(init_level4_pgt); + + /* L3_i[0] -> level2_ident_pgt */ convert_pfn_mfn(level3_ident_pgt); + /* L3_k[510] -> level2_kernel_pgt + * L3_i[511] -> level2_fixmap_pgt */ convert_pfn_mfn(level3_kernel_pgt); + /* We get [511][511] and have Xen's version of level2_kernel_pgt */ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); - memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); - memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); - + addr[0] = (unsigned long)pgd; + addr[1] = (unsigned long)l3; + addr[2] = (unsigned long)l2; + /* Graft it onto L4[272][0]. Note that we creating an aliasing problem: + * Both L4[272][0] and L4[511][511] have entries that point to the same + * L2 (PMD) tables. Meaning that if you modify it in __va space + * it will be also modified in the __ka space! (But if you just + * modify the PMD table to point to other PTE's or none, then you + * are OK - which is what cleanup_highmap does) */ + copy_page(level2_ident_pgt, l2); + /* Graft it onto L4[511][511] */ + copy_page(level2_kernel_pgt, l2); + + /* Get [511][510] and graft that in level2_fixmap_pgt */ l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); - memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); - - /* Set up identity map */ - xen_map_identity_early(level2_ident_pgt, max_pfn); + copy_page(level2_fixmap_pgt, l2); + /* Note that we don't do anything with level1_fixmap_pgt which + * we don't need. */ /* Make pagetable pieces RO */ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); @@ -1794,22 +1902,28 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, /* Unpin Xen-provided one */ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - /* Switch over */ - pgd = init_level4_pgt; - /* * At this stage there can be no user pgd, and no page * structure to attach it to, so make sure we just set kernel * pgd. */ xen_mc_batch(); - __xen_write_cr3(true, __pa(pgd)); + __xen_write_cr3(true, __pa(init_level4_pgt)); xen_mc_issue(PARAVIRT_LAZY_CPU); - memblock_reserve(__pa(xen_start_info->pt_base), - xen_start_info->nr_pt_frames * PAGE_SIZE); + /* We can't that easily rip out L3 and L2, as the Xen pagetables are + * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for + * the initial domain. For guests using the toolstack, they are in: + * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only + * rip out the [L4] (pgd), but for guests we shave off three pages. + */ + for (i = 0; i < ARRAY_SIZE(addr); i++) + check_pt_base(&pt_base, &pt_end, addr[i]); - return pgd; + /* Our (by three pages) smaller Xen pagetable that we are using */ + memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); + /* Revector the xen_start_info */ + xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); } #else /* !CONFIG_X86_64 */ static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); @@ -1834,8 +1948,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) */ swapper_kernel_pmd = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); - memcpy(swapper_kernel_pmd, initial_kernel_pmd, - sizeof(pmd_t) * PTRS_PER_PMD); + copy_page(swapper_kernel_pmd, initial_kernel_pmd); swapper_pg_dir[KERNEL_PGD_BOUNDARY] = __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); @@ -1852,8 +1965,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) pv_mmu_ops.write_cr3 = &xen_write_cr3; } -pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, - unsigned long max_pfn) +void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; @@ -1865,11 +1977,11 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, 512*1024); kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); - memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); + copy_page(initial_kernel_pmd, kernel_pmd); xen_map_identity_early(initial_kernel_pmd, max_pfn); - memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD); + copy_page(initial_page_table, pgd); initial_page_table[KERNEL_PGD_BOUNDARY] = __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); @@ -1885,8 +1997,6 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, memblock_reserve(__pa(xen_start_info->pt_base), xen_start_info->nr_pt_frames * PAGE_SIZE); - - return initial_page_table; } #endif /* CONFIG_X86_64 */ @@ -2068,8 +2178,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; - x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; - x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; + x86_init.paging.pagetable_init = xen_pagetable_init; pv_mmu_ops = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); @@ -2337,6 +2446,9 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, unsigned long range; int err = 0; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -EINVAL; + prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) == @@ -2355,8 +2467,8 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, if (err) goto out; - err = -EFAULT; - if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) + err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid); + if (err < 0) goto out; nr -= batch; diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 72213da605f5..95fb2aa5927e 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -22,7 +22,7 @@ * * P2M_PER_PAGE depends on the architecture, as a mfn is always * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to - * 512 and 1024 entries respectively. + * 512 and 1024 entries respectively. * * In short, these structures contain the Machine Frame Number (MFN) of the PFN. * @@ -139,11 +139,11 @@ * / | ~0, ~0, .... | * | \---------------/ * | - * p2m_missing p2m_missing - * /------------------\ /------------\ - * | [p2m_mid_missing]+---->| ~0, ~0, ~0 | - * | [p2m_mid_missing]+---->| ..., ~0 | - * \------------------/ \------------/ + * p2m_mid_missing p2m_missing + * /-----------------\ /------------\ + * | [p2m_missing] +---->| ~0, ~0, ~0 | + * | [p2m_missing] +---->| ..., ~0 | + * \-----------------/ \------------/ * * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) */ @@ -396,7 +396,85 @@ void __init xen_build_dynamic_phys_to_machine(void) m2p_override_init(); } +#ifdef CONFIG_X86_64 +#include <linux/bootmem.h> +unsigned long __init xen_revector_p2m_tree(void) +{ + unsigned long va_start; + unsigned long va_end; + unsigned long pfn; + unsigned long pfn_free = 0; + unsigned long *mfn_list = NULL; + unsigned long size; + + va_start = xen_start_info->mfn_list; + /*We copy in increments of P2M_PER_PAGE * sizeof(unsigned long), + * so make sure it is rounded up to that */ + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + va_end = va_start + size; + + /* If we were revectored already, don't do it again. */ + if (va_start <= __START_KERNEL_map && va_start >= __PAGE_OFFSET) + return 0; + + mfn_list = alloc_bootmem_align(size, PAGE_SIZE); + if (!mfn_list) { + pr_warn("Could not allocate space for a new P2M tree!\n"); + return xen_start_info->mfn_list; + } + /* Fill it out with INVALID_P2M_ENTRY value */ + memset(mfn_list, 0xFF, size); + + for (pfn = 0; pfn < ALIGN(MAX_DOMAIN_PAGES, P2M_PER_PAGE); pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx; + unsigned long *mid_p; + + if (!p2m_top[topidx]) + continue; + + if (p2m_top[topidx] == p2m_mid_missing) + continue; + + mididx = p2m_mid_index(pfn); + mid_p = p2m_top[topidx][mididx]; + if (!mid_p) + continue; + if ((mid_p == p2m_missing) || (mid_p == p2m_identity)) + continue; + + if ((unsigned long)mid_p == INVALID_P2M_ENTRY) + continue; + + /* The old va. Rebase it on mfn_list */ + if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) { + unsigned long *new; + + if (pfn_free > (size / sizeof(unsigned long))) { + WARN(1, "Only allocated for %ld pages, but we want %ld!\n", + size / sizeof(unsigned long), pfn_free); + return 0; + } + new = &mfn_list[pfn_free]; + + copy_page(new, mid_p); + p2m_top[topidx][mididx] = &mfn_list[pfn_free]; + p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn_free]); + + pfn_free += P2M_PER_PAGE; + } + /* This should be the leafs allocated for identity from _brk. */ + } + return (unsigned long)mfn_list; + +} +#else +unsigned long __init xen_revector_p2m_tree(void) +{ + return 0; +} +#endif unsigned long get_phys_to_machine(unsigned long pfn) { unsigned topidx, mididx, idx; @@ -430,7 +508,7 @@ static void free_p2m_page(void *p) free_page((unsigned long)p); } -/* +/* * Fully allocate the p2m structure for a given pfn. We need to check * that both the top and mid levels are allocated, and make sure the * parallel mfn tree is kept in sync. We may race with other cpus, so diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 967633ad98c4..969570491c39 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -8,6 +8,14 @@ #include <xen/xen.h> #include <asm/iommu_table.h> + +#include <asm/xen/swiotlb-xen.h> +#ifdef CONFIG_X86_64 +#include <asm/iommu.h> +#include <asm/dma.h> +#endif +#include <linux/export.h> + int xen_swiotlb __read_mostly; static struct dma_map_ops xen_swiotlb_dma_ops = { @@ -34,34 +42,64 @@ static struct dma_map_ops xen_swiotlb_dma_ops = { int __init pci_xen_swiotlb_detect(void) { + if (!xen_pv_domain()) + return 0; + /* If running as PV guest, either iommu=soft, or swiotlb=force will * activate this IOMMU. If running as PV privileged, activate it * irregardless. */ - if ((xen_initial_domain() || swiotlb || swiotlb_force) && - (xen_pv_domain())) + if ((xen_initial_domain() || swiotlb || swiotlb_force)) xen_swiotlb = 1; /* If we are running under Xen, we MUST disable the native SWIOTLB. * Don't worry about swiotlb_force flag activating the native, as * the 'swiotlb' flag is the only one turning it on. */ - if (xen_pv_domain()) - swiotlb = 0; + swiotlb = 0; +#ifdef CONFIG_X86_64 + /* pci_swiotlb_detect_4gb turns on native SWIOTLB if no_iommu == 0 + * (so no iommu=X command line over-writes). + * Considering that PV guests do not want the *native SWIOTLB* but + * only Xen SWIOTLB it is not useful to us so set no_iommu=1 here. + */ + if (max_pfn > MAX_DMA32_PFN) + no_iommu = 1; +#endif return xen_swiotlb; } void __init pci_xen_swiotlb_init(void) { if (xen_swiotlb) { - xen_swiotlb_init(1); + xen_swiotlb_init(1, true /* early */); dma_ops = &xen_swiotlb_dma_ops; /* Make sure ACS will be enabled */ pci_request_acs(); } } + +int pci_xen_swiotlb_init_late(void) +{ + int rc; + + if (xen_swiotlb) + return 0; + + rc = xen_swiotlb_init(1, false /* late */); + if (rc) + return rc; + + dma_ops = &xen_swiotlb_dma_ops; + /* Make sure ACS will be enabled */ + pci_request_acs(); + + return 0; +} +EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late); + IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, - 0, + NULL, pci_xen_swiotlb_init, - 0); + NULL); diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index ffcf2615640b..0a7852483ffe 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -24,6 +24,7 @@ #include <linux/module.h> #include <xen/platform_pci.h> +#include "xen-ops.h" #define XEN_PLATFORM_ERR_MAGIC -1 #define XEN_PLATFORM_ERR_PROTOCOL -2 diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index e2d62d697b5d..8971a26d21ab 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -432,6 +432,24 @@ char * __init xen_memory_setup(void) * - mfn_list * - xen_start_info * See comment above "struct start_info" in <xen/interface/xen.h> + * We tried to make the the memblock_reserve more selective so + * that it would be clear what region is reserved. Sadly we ran + * in the problem wherein on a 64-bit hypervisor with a 32-bit + * initial domain, the pt_base has the cr3 value which is not + * neccessarily where the pagetable starts! As Jan put it: " + * Actually, the adjustment turns out to be correct: The page + * tables for a 32-on-64 dom0 get allocated in the order "first L1", + * "first L2", "first L3", so the offset to the page table base is + * indeed 2. When reading xen/include/public/xen.h's comment + * very strictly, this is not a violation (since there nothing is said + * that the first thing in the page table space is pointed to by + * pt_base; I admit that this seems to be implied though, namely + * do I think that it is implied that the page table space is the + * range [pt_base, pt_base + nt_pt_frames), whereas that + * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), + * which - without a priori knowledge - the kernel would have + * difficulty to figure out)." - so lets just fall back to the + * easy way and reserve the whole region. */ memblock_reserve(__pa(xen_start_info->mfn_list), xen_start_info->pt_base - xen_start_info->mfn_list); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index f58dca7a6e52..353c50f18702 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -377,7 +377,8 @@ static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *idle) return rc; if (num_online_cpus() == 1) - alternatives_smp_switch(1); + /* Just in case we booted with a single CPU. */ + alternatives_enable_smp(); rc = xen_smp_intr_init(cpu); if (rc) @@ -424,9 +425,6 @@ static void xen_cpu_die(unsigned int cpu) unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); - - if (num_online_cpus() == 1) - alternatives_smp_switch(0); } static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */ diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c index 1cd7f4d11e29..6722e3733f02 100644 --- a/arch/x86/xen/vga.c +++ b/arch/x86/xen/vga.c @@ -35,6 +35,7 @@ void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) info->u.text_mode_3.font_height; break; + case XEN_VGATYPE_EFI_LFB: case XEN_VGATYPE_VESA_LFB: if (size < offsetof(struct dom0_vga_console_info, u.vesa_lfb.gbl_caps)) @@ -54,6 +55,12 @@ void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) screen_info->blue_pos = info->u.vesa_lfb.blue_pos; screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size; screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos; + + if (info->video_type == XEN_VGATYPE_EFI_LFB) { + screen_info->orig_video_isVGA = VIDEO_TYPE_EFI; + break; + } + if (size >= offsetof(struct dom0_vga_console_info, u.vesa_lfb.gbl_caps) + sizeof(info->u.vesa_lfb.gbl_caps)) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index aaa7291c9259..7faed5869e5b 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -28,9 +28,61 @@ ENTRY(startup_xen) __FINIT .pushsection .text - .align PAGE_SIZE + .balign PAGE_SIZE ENTRY(hypercall_page) - .skip PAGE_SIZE +#define NEXT_HYPERCALL(x) \ + ENTRY(xen_hypercall_##x) \ + .skip 32 + +NEXT_HYPERCALL(set_trap_table) +NEXT_HYPERCALL(mmu_update) +NEXT_HYPERCALL(set_gdt) +NEXT_HYPERCALL(stack_switch) +NEXT_HYPERCALL(set_callbacks) +NEXT_HYPERCALL(fpu_taskswitch) +NEXT_HYPERCALL(sched_op_compat) +NEXT_HYPERCALL(platform_op) +NEXT_HYPERCALL(set_debugreg) +NEXT_HYPERCALL(get_debugreg) +NEXT_HYPERCALL(update_descriptor) +NEXT_HYPERCALL(ni) +NEXT_HYPERCALL(memory_op) +NEXT_HYPERCALL(multicall) +NEXT_HYPERCALL(update_va_mapping) +NEXT_HYPERCALL(set_timer_op) +NEXT_HYPERCALL(event_channel_op_compat) +NEXT_HYPERCALL(xen_version) +NEXT_HYPERCALL(console_io) +NEXT_HYPERCALL(physdev_op_compat) +NEXT_HYPERCALL(grant_table_op) +NEXT_HYPERCALL(vm_assist) +NEXT_HYPERCALL(update_va_mapping_otherdomain) +NEXT_HYPERCALL(iret) +NEXT_HYPERCALL(vcpu_op) +NEXT_HYPERCALL(set_segment_base) +NEXT_HYPERCALL(mmuext_op) +NEXT_HYPERCALL(xsm_op) +NEXT_HYPERCALL(nmi_op) +NEXT_HYPERCALL(sched_op) +NEXT_HYPERCALL(callback_op) +NEXT_HYPERCALL(xenoprof_op) +NEXT_HYPERCALL(event_channel_op) +NEXT_HYPERCALL(physdev_op) +NEXT_HYPERCALL(hvm_op) +NEXT_HYPERCALL(sysctl) +NEXT_HYPERCALL(domctl) +NEXT_HYPERCALL(kexec_op) +NEXT_HYPERCALL(tmem_op) /* 38 */ +ENTRY(xen_hypercall_rsvr) + .skip 320 +NEXT_HYPERCALL(mca) /* 48 */ +NEXT_HYPERCALL(arch_1) +NEXT_HYPERCALL(arch_2) +NEXT_HYPERCALL(arch_3) +NEXT_HYPERCALL(arch_4) +NEXT_HYPERCALL(arch_5) +NEXT_HYPERCALL(arch_6) + .balign PAGE_SIZE .popsection ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 202d4c150154..bb5a8105ea86 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -27,7 +27,7 @@ void xen_setup_mfn_list_list(void); void xen_setup_shared_info(void); void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); -pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); +void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_reserve_top(void); extern unsigned long xen_max_p2m_pfn; @@ -45,6 +45,7 @@ void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); +unsigned long __init xen_revector_p2m_tree(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); |