diff options
Diffstat (limited to 'arch/x86')
129 files changed, 1971 insertions, 1934 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dc12dddb40b2..a05571937ad3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -98,7 +98,6 @@ config X86 select HAVE_ACPI_APEI_NMI if ACPI select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL - select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP @@ -1043,6 +1042,14 @@ config X86_MCE The action the kernel takes depends on the severity of the problem, ranging from warning messages to halting the machine. +config X86_MCELOG_LEGACY + bool "Support for deprecated /dev/mcelog character device" + depends on X86_MCE + ---help--- + Enable support for /dev/mcelog which is needed by the old mcelog + userspace logging daemon. Consider switching to the new generation + rasdaemon solution. + config X86_MCE_INTEL def_bool y prompt "Intel MCE features" @@ -1072,7 +1079,7 @@ config X86_MCE_THRESHOLD def_bool y config X86_MCE_INJECT - depends on X86_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC && X86_MCELOG_LEGACY tristate "Machine check injector support" ---help--- Provide support for injecting machine checks for testing purposes. diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 2d449337a360..49d160b781f0 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -120,10 +120,6 @@ else # -funit-at-a-time shrinks the kernel .text considerably # unfortunately it makes reading oopses harder. KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time) - - # this works around some issues with generating unwind tables in older gccs - # newer gccs do it by default - KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args) endif ifdef CONFIG_X86_X32 @@ -147,6 +143,45 @@ ifeq ($(CONFIG_KMEMCHECK),y) KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy) endif +# +# If the function graph tracer is used with mcount instead of fentry, +# '-maccumulate-outgoing-args' is needed to prevent a GCC bug +# (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=42109) +# +ifdef CONFIG_FUNCTION_GRAPH_TRACER + ifndef CONFIG_HAVE_FENTRY + ACCUMULATE_OUTGOING_ARGS := 1 + else + ifeq ($(call cc-option-yn, -mfentry), n) + ACCUMULATE_OUTGOING_ARGS := 1 + + # GCC ignores '-maccumulate-outgoing-args' when used with '-Os'. + # If '-Os' is enabled, disable it and print a warning. + ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE + undefine CONFIG_CC_OPTIMIZE_FOR_SIZE + $(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE. Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.) + endif + + endif + endif +endif + +# +# Jump labels need '-maccumulate-outgoing-args' for gcc < 4.5.2 to prevent a +# GCC bug (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=46226). There's no way +# to test for this bug at compile-time because the test case needs to execute, +# which is a no-go for cross compilers. So check the GCC version instead. +# +ifdef CONFIG_JUMP_LABEL + ifneq ($(ACCUMULATE_OUTGOING_ARGS), 1) + ACCUMULATE_OUTGOING_ARGS = $(call cc-if-fullversion, -lt, 040502, 1) + endif +endif + +ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1) + KBUILD_CFLAGS += -maccumulate-outgoing-args +endif + # Stackpointer is addressed different for 32 bit and 64 bit x86 sp-$(CONFIG_X86_32) := esp sp-$(CONFIG_X86_64) := rsp diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index 6647ed49c66c..a45eb15b7cf2 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -45,24 +45,6 @@ cflags-$(CONFIG_MGEODE_LX) += $(call cc-option,-march=geode,-march=pentium-mmx) # cpu entries cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) -# Work around the pentium-mmx code generator madness of gcc4.4.x which -# does stack alignment by generating horrible code _before_ the mcount -# prologue (push %ebp, mov %esp, %ebp) which breaks the function graph -# tracer assumptions. For i686, generic, core2 this is set by the -# compiler anyway -ifeq ($(CONFIG_FUNCTION_GRAPH_TRACER), y) -ADD_ACCUMULATE_OUTGOING_ARGS := y -endif - -# Work around to a bug with asm goto with first implementations of it -# in gcc causing gcc to mess up the push and pop of the stack in some -# uses of asm goto. -ifeq ($(CONFIG_JUMP_LABEL), y) -ADD_ACCUMULATE_OUTGOING_ARGS := y -endif - -cflags-$(ADD_ACCUMULATE_OUTGOING_ARGS) += $(call cc-option,-maccumulate-outgoing-args) - # Bug fix for binutils: this option is required in order to keep # binutils from generating NOPL instructions against our will. ifneq ($(CONFIG_X86_P6_NOP),y) diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 8a9521b6a537..de45f57b410d 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -178,6 +178,7 @@ CONFIG_E1000E=y CONFIG_SKY2=y CONFIG_FORCEDETH=y CONFIG_8139TOO=y +CONFIG_R8169=y CONFIG_FDDI=y CONFIG_INPUT_POLLDEV=y # CONFIG_INPUT_MOUSEDEV_PSAUX is not set diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 9ba050fe47f3..0af59fa789ea 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -390,3 +390,4 @@ 381 i386 pkey_alloc sys_pkey_alloc 382 i386 pkey_free sys_pkey_free 383 i386 statx sys_statx +384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 7853b53959cd..3f9d1a83891a 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -30,8 +30,10 @@ static int __init vdso32_setup(char *s) { vdso32_enabled = simple_strtoul(s, NULL, 0); - if (vdso32_enabled > 1) + if (vdso32_enabled > 1) { pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n"); + vdso32_enabled = 0; + } return 1; } @@ -62,13 +64,18 @@ subsys_initcall(sysenter_setup); /* Register vsyscall32 into the ABI table */ #include <linux/sysctl.h> +static const int zero; +static const int one = 1; + static struct ctl_table abi_table2[] = { { .procname = "vsyscall32", .data = &vdso32_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = (int *)&zero, + .extra2 = (int *)&one, }, {} }; diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index afb222b63cae..c84584bb9402 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -604,7 +604,7 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx, return &amd_f15_PMC20; } case AMD_EVENT_NB: - /* moved to perf_event_amd_uncore.c */ + /* moved to uncore.c */ return &emptyconstraint; default: return &emptyconstraint; diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index b28200dea715..3641e24fdac5 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -11,6 +11,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) "perf/amd_iommu: " fmt + #include <linux/perf_event.h> #include <linux/init.h> #include <linux/cpumask.h> @@ -21,44 +23,42 @@ #define COUNTER_SHIFT 16 -#define _GET_BANK(ev) ((u8)(ev->hw.extra_reg.reg >> 8)) -#define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg)) +/* iommu pmu conf masks */ +#define GET_CSOURCE(x) ((x)->conf & 0xFFULL) +#define GET_DEVID(x) (((x)->conf >> 8) & 0xFFFFULL) +#define GET_DOMID(x) (((x)->conf >> 24) & 0xFFFFULL) +#define GET_PASID(x) (((x)->conf >> 40) & 0xFFFFFULL) -/* iommu pmu config masks */ -#define _GET_CSOURCE(ev) ((ev->hw.config & 0xFFULL)) -#define _GET_DEVID(ev) ((ev->hw.config >> 8) & 0xFFFFULL) -#define _GET_PASID(ev) ((ev->hw.config >> 24) & 0xFFFFULL) -#define _GET_DOMID(ev) ((ev->hw.config >> 40) & 0xFFFFULL) -#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config) & 0xFFFFULL) -#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL) -#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL) +/* iommu pmu conf1 masks */ +#define GET_DEVID_MASK(x) ((x)->conf1 & 0xFFFFULL) +#define GET_DOMID_MASK(x) (((x)->conf1 >> 16) & 0xFFFFULL) +#define GET_PASID_MASK(x) (((x)->conf1 >> 32) & 0xFFFFFULL) -static struct perf_amd_iommu __perf_iommu; +#define IOMMU_NAME_SIZE 16 struct perf_amd_iommu { + struct list_head list; struct pmu pmu; + struct amd_iommu *iommu; + char name[IOMMU_NAME_SIZE]; u8 max_banks; u8 max_counters; u64 cntr_assign_mask; raw_spinlock_t lock; - const struct attribute_group *attr_groups[4]; }; -#define format_group attr_groups[0] -#define cpumask_group attr_groups[1] -#define events_group attr_groups[2] -#define null_group attr_groups[3] +static LIST_HEAD(perf_amd_iommu_list); /*--------------------------------------------- * sysfs format attributes *---------------------------------------------*/ PMU_FORMAT_ATTR(csource, "config:0-7"); PMU_FORMAT_ATTR(devid, "config:8-23"); -PMU_FORMAT_ATTR(pasid, "config:24-39"); -PMU_FORMAT_ATTR(domid, "config:40-55"); +PMU_FORMAT_ATTR(domid, "config:24-39"); +PMU_FORMAT_ATTR(pasid, "config:40-59"); PMU_FORMAT_ATTR(devid_mask, "config1:0-15"); -PMU_FORMAT_ATTR(pasid_mask, "config1:16-31"); -PMU_FORMAT_ATTR(domid_mask, "config1:32-47"); +PMU_FORMAT_ATTR(domid_mask, "config1:16-31"); +PMU_FORMAT_ATTR(pasid_mask, "config1:32-51"); static struct attribute *iommu_format_attrs[] = { &format_attr_csource.attr, @@ -79,6 +79,10 @@ static struct attribute_group amd_iommu_format_group = { /*--------------------------------------------- * sysfs events attributes *---------------------------------------------*/ +static struct attribute_group amd_iommu_events_group = { + .name = "events", +}; + struct amd_iommu_event_desc { struct kobj_attribute attr; const char *event; @@ -150,30 +154,34 @@ static struct attribute_group amd_iommu_cpumask_group = { /*---------------------------------------------*/ -static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu) +static int get_next_avail_iommu_bnk_cntr(struct perf_event *event) { + struct perf_amd_iommu *piommu = container_of(event->pmu, struct perf_amd_iommu, pmu); + int max_cntrs = piommu->max_counters; + int max_banks = piommu->max_banks; + u32 shift, bank, cntr; unsigned long flags; - int shift, bank, cntr, retval; - int max_banks = perf_iommu->max_banks; - int max_cntrs = perf_iommu->max_counters; + int retval; - raw_spin_lock_irqsave(&perf_iommu->lock, flags); + raw_spin_lock_irqsave(&piommu->lock, flags); for (bank = 0, shift = 0; bank < max_banks; bank++) { for (cntr = 0; cntr < max_cntrs; cntr++) { shift = bank + (bank*3) + cntr; - if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) { + if (piommu->cntr_assign_mask & BIT_ULL(shift)) { continue; } else { - perf_iommu->cntr_assign_mask |= (1ULL<<shift); - retval = ((u16)((u16)bank<<8) | (u8)(cntr)); + piommu->cntr_assign_mask |= BIT_ULL(shift); + event->hw.iommu_bank = bank; + event->hw.iommu_cntr = cntr; + retval = 0; goto out; } } } retval = -ENOSPC; out: - raw_spin_unlock_irqrestore(&perf_iommu->lock, flags); + raw_spin_unlock_irqrestore(&piommu->lock, flags); return retval; } @@ -202,8 +210,6 @@ static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu, static int perf_iommu_event_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - struct perf_amd_iommu *perf_iommu; - u64 config, config1; /* test the event attr type check for PMU enumeration */ if (event->attr.type != event->pmu->type) @@ -225,80 +231,62 @@ static int perf_iommu_event_init(struct perf_event *event) if (event->cpu < 0) return -EINVAL; - perf_iommu = &__perf_iommu; - - if (event->pmu != &perf_iommu->pmu) - return -ENOENT; - - if (perf_iommu) { - config = event->attr.config; - config1 = event->attr.config1; - } else { - return -EINVAL; - } - - /* integrate with iommu base devid (0000), assume one iommu */ - perf_iommu->max_banks = - amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID); - perf_iommu->max_counters = - amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID); - if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0)) - return -EINVAL; - /* update the hw_perf_event struct with the iommu config data */ - hwc->config = config; - hwc->extra_reg.config = config1; + hwc->conf = event->attr.config; + hwc->conf1 = event->attr.config1; return 0; } +static inline struct amd_iommu *perf_event_2_iommu(struct perf_event *ev) +{ + return (container_of(ev->pmu, struct perf_amd_iommu, pmu))->iommu; +} + static void perf_iommu_enable_event(struct perf_event *ev) { - u8 csource = _GET_CSOURCE(ev); - u16 devid = _GET_DEVID(ev); + struct amd_iommu *iommu = perf_event_2_iommu(ev); + struct hw_perf_event *hwc = &ev->hw; + u8 bank = hwc->iommu_bank; + u8 cntr = hwc->iommu_cntr; u64 reg = 0ULL; - reg = csource; - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_COUNTER_SRC_REG, ®, true); + reg = GET_CSOURCE(hwc); + amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_COUNTER_SRC_REG, ®); - reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32); + reg = GET_DEVID_MASK(hwc); + reg = GET_DEVID(hwc) | (reg << 32); if (reg) - reg |= (1UL << 31); - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_DEVID_MATCH_REG, ®, true); + reg |= BIT(31); + amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DEVID_MATCH_REG, ®); - reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32); + reg = GET_PASID_MASK(hwc); + reg = GET_PASID(hwc) | (reg << 32); if (reg) - reg |= (1UL << 31); - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_PASID_MATCH_REG, ®, true); + reg |= BIT(31); + amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_PASID_MATCH_REG, ®); - reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32); + reg = GET_DOMID_MASK(hwc); + reg = GET_DOMID(hwc) | (reg << 32); if (reg) - reg |= (1UL << 31); - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_DOMID_MATCH_REG, ®, true); + reg |= BIT(31); + amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DOMID_MATCH_REG, ®); } static void perf_iommu_disable_event(struct perf_event *event) { + struct amd_iommu *iommu = perf_event_2_iommu(event); + struct hw_perf_event *hwc = &event->hw; u64 reg = 0ULL; - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), - _GET_BANK(event), _GET_CNTR(event), - IOMMU_PC_COUNTER_SRC_REG, ®, true); + amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr, + IOMMU_PC_COUNTER_SRC_REG, ®); } static void perf_iommu_start(struct perf_event *event, int flags) { struct hw_perf_event *hwc = &event->hw; - pr_debug("perf: amd_iommu:perf_iommu_start\n"); if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) return; @@ -306,10 +294,11 @@ static void perf_iommu_start(struct perf_event *event, int flags) hwc->state = 0; if (flags & PERF_EF_RELOAD) { - u64 prev_raw_count = local64_read(&hwc->prev_count); - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), - _GET_BANK(event), _GET_CNTR(event), - IOMMU_PC_COUNTER_REG, &prev_raw_count, true); + u64 prev_raw_count = local64_read(&hwc->prev_count); + struct amd_iommu *iommu = perf_event_2_iommu(event); + + amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr, + IOMMU_PC_COUNTER_REG, &prev_raw_count); } perf_iommu_enable_event(event); @@ -319,37 +308,30 @@ static void perf_iommu_start(struct perf_event *event, int flags) static void perf_iommu_read(struct perf_event *event) { - u64 count = 0ULL; - u64 prev_raw_count = 0ULL; - u64 delta = 0ULL; + u64 count, prev, delta; struct hw_perf_event *hwc = &event->hw; - pr_debug("perf: amd_iommu:perf_iommu_read\n"); + struct amd_iommu *iommu = perf_event_2_iommu(event); - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), - _GET_BANK(event), _GET_CNTR(event), - IOMMU_PC_COUNTER_REG, &count, false); + if (amd_iommu_pc_get_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr, + IOMMU_PC_COUNTER_REG, &count)) + return; /* IOMMU pc counter register is only 48 bits */ - count &= 0xFFFFFFFFFFFFULL; + count &= GENMASK_ULL(47, 0); - prev_raw_count = local64_read(&hwc->prev_count); - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - count) != prev_raw_count) + prev = local64_read(&hwc->prev_count); + if (local64_cmpxchg(&hwc->prev_count, prev, count) != prev) return; - /* Handling 48-bit counter overflowing */ - delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT); + /* Handle 48-bit counter overflow */ + delta = (count << COUNTER_SHIFT) - (prev << COUNTER_SHIFT); delta >>= COUNTER_SHIFT; local64_add(delta, &event->count); - } static void perf_iommu_stop(struct perf_event *event, int flags) { struct hw_perf_event *hwc = &event->hw; - u64 config; - - pr_debug("perf: amd_iommu:perf_iommu_stop\n"); if (hwc->state & PERF_HES_UPTODATE) return; @@ -361,7 +343,6 @@ static void perf_iommu_stop(struct perf_event *event, int flags) if (hwc->state & PERF_HES_UPTODATE) return; - config = hwc->config; perf_iommu_read(event); hwc->state |= PERF_HES_UPTODATE; } @@ -369,17 +350,12 @@ static void perf_iommu_stop(struct perf_event *event, int flags) static int perf_iommu_add(struct perf_event *event, int flags) { int retval; - struct perf_amd_iommu *perf_iommu = - container_of(event->pmu, struct perf_amd_iommu, pmu); - pr_debug("perf: amd_iommu:perf_iommu_add\n"); event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; /* request an iommu bank/counter */ - retval = get_next_avail_iommu_bnk_cntr(perf_iommu); - if (retval != -ENOSPC) - event->hw.extra_reg.reg = (u16)retval; - else + retval = get_next_avail_iommu_bnk_cntr(event); + if (retval) return retval; if (flags & PERF_EF_START) @@ -390,115 +366,124 @@ static int perf_iommu_add(struct perf_event *event, int flags) static void perf_iommu_del(struct perf_event *event, int flags) { + struct hw_perf_event *hwc = &event->hw; struct perf_amd_iommu *perf_iommu = container_of(event->pmu, struct perf_amd_iommu, pmu); - pr_debug("perf: amd_iommu:perf_iommu_del\n"); perf_iommu_stop(event, PERF_EF_UPDATE); /* clear the assigned iommu bank/counter */ clear_avail_iommu_bnk_cntr(perf_iommu, - _GET_BANK(event), - _GET_CNTR(event)); + hwc->iommu_bank, hwc->iommu_cntr); perf_event_update_userpage(event); } -static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu) +static __init int _init_events_attrs(void) { - struct attribute **attrs; - struct attribute_group *attr_group; int i = 0, j; + struct attribute **attrs; while (amd_iommu_v2_event_descs[i].attr.attr.name) i++; - attr_group = kzalloc(sizeof(struct attribute *) - * (i + 1) + sizeof(*attr_group), GFP_KERNEL); - if (!attr_group) + attrs = kzalloc(sizeof(struct attribute **) * (i + 1), GFP_KERNEL); + if (!attrs) return -ENOMEM; - attrs = (struct attribute **)(attr_group + 1); for (j = 0; j < i; j++) attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr; - attr_group->name = "events"; - attr_group->attrs = attrs; - perf_iommu->events_group = attr_group; - + amd_iommu_events_group.attrs = attrs; return 0; } -static __init void amd_iommu_pc_exit(void) -{ - if (__perf_iommu.events_group != NULL) { - kfree(__perf_iommu.events_group); - __perf_iommu.events_group = NULL; - } -} +const struct attribute_group *amd_iommu_attr_groups[] = { + &amd_iommu_format_group, + &amd_iommu_cpumask_group, + &amd_iommu_events_group, + NULL, +}; + +static struct pmu iommu_pmu = { + .event_init = perf_iommu_event_init, + .add = perf_iommu_add, + .del = perf_iommu_del, + .start = perf_iommu_start, + .stop = perf_iommu_stop, + .read = perf_iommu_read, + .task_ctx_nr = perf_invalid_context, + .attr_groups = amd_iommu_attr_groups, +}; -static __init int _init_perf_amd_iommu( - struct perf_amd_iommu *perf_iommu, char *name) +static __init int init_one_iommu(unsigned int idx) { + struct perf_amd_iommu *perf_iommu; int ret; + perf_iommu = kzalloc(sizeof(struct perf_amd_iommu), GFP_KERNEL); + if (!perf_iommu) + return -ENOMEM; + raw_spin_lock_init(&perf_iommu->lock); - /* Init format attributes */ - perf_iommu->format_group = &amd_iommu_format_group; + perf_iommu->pmu = iommu_pmu; + perf_iommu->iommu = get_amd_iommu(idx); + perf_iommu->max_banks = amd_iommu_pc_get_max_banks(idx); + perf_iommu->max_counters = amd_iommu_pc_get_max_counters(idx); - /* Init cpumask attributes to only core 0 */ - cpumask_set_cpu(0, &iommu_cpumask); - perf_iommu->cpumask_group = &amd_iommu_cpumask_group; - - /* Init events attributes */ - if (_init_events_attrs(perf_iommu) != 0) - pr_err("perf: amd_iommu: Only support raw events.\n"); + if (!perf_iommu->iommu || + !perf_iommu->max_banks || + !perf_iommu->max_counters) { + kfree(perf_iommu); + return -EINVAL; + } - /* Init null attributes */ - perf_iommu->null_group = NULL; - perf_iommu->pmu.attr_groups = perf_iommu->attr_groups; + snprintf(perf_iommu->name, IOMMU_NAME_SIZE, "amd_iommu_%u", idx); - ret = perf_pmu_register(&perf_iommu->pmu, name, -1); - if (ret) { - pr_err("perf: amd_iommu: Failed to initialized.\n"); - amd_iommu_pc_exit(); + ret = perf_pmu_register(&perf_iommu->pmu, perf_iommu->name, -1); + if (!ret) { + pr_info("Detected AMD IOMMU #%d (%d banks, %d counters/bank).\n", + idx, perf_iommu->max_banks, perf_iommu->max_counters); + list_add_tail(&perf_iommu->list, &perf_amd_iommu_list); } else { - pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n", - amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID), - amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID)); + pr_warn("Error initializing IOMMU %d.\n", idx); + kfree(perf_iommu); } - return ret; } -static struct perf_amd_iommu __perf_iommu = { - .pmu = { - .task_ctx_nr = perf_invalid_context, - .event_init = perf_iommu_event_init, - .add = perf_iommu_add, - .del = perf_iommu_del, - .start = perf_iommu_start, - .stop = perf_iommu_stop, - .read = perf_iommu_read, - }, - .max_banks = 0x00, - .max_counters = 0x00, - .cntr_assign_mask = 0ULL, - .format_group = NULL, - .cpumask_group = NULL, - .events_group = NULL, - .null_group = NULL, -}; - static __init int amd_iommu_pc_init(void) { + unsigned int i, cnt = 0; + int ret; + /* Make sure the IOMMU PC resource is available */ if (!amd_iommu_pc_supported()) return -ENODEV; - _init_perf_amd_iommu(&__perf_iommu, "amd_iommu"); + ret = _init_events_attrs(); + if (ret) + return ret; + + /* + * An IOMMU PMU is specific to an IOMMU, and can function independently. + * So we go through all IOMMUs and ignore the one that fails init + * unless all IOMMU are failing. + */ + for (i = 0; i < amd_iommu_get_num_iommus(); i++) { + ret = init_one_iommu(i); + if (!ret) + cnt++; + } + + if (!cnt) { + kfree(amd_iommu_events_group.attrs); + return -ENODEV; + } + /* Init cpumask attributes to only core 0 */ + cpumask_set_cpu(0, &iommu_cpumask); return 0; } diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h index 845d173278e3..62e0702c4374 100644 --- a/arch/x86/events/amd/iommu.h +++ b/arch/x86/events/amd/iommu.h @@ -24,17 +24,23 @@ #define PC_MAX_SPEC_BNKS 64 #define PC_MAX_SPEC_CNTRS 16 -/* iommu pc reg masks*/ -#define IOMMU_BASE_DEVID 0x0000 +struct amd_iommu; /* amd_iommu_init.c external support functions */ +extern int amd_iommu_get_num_iommus(void); + extern bool amd_iommu_pc_supported(void); -extern u8 amd_iommu_pc_get_max_banks(u16 devid); +extern u8 amd_iommu_pc_get_max_banks(unsigned int idx); + +extern u8 amd_iommu_pc_get_max_counters(unsigned int idx); + +extern int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, + u8 fxn, u64 *value); -extern u8 amd_iommu_pc_get_max_counters(u16 devid); +extern int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, + u8 fxn, u64 *value); -extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, - u8 fxn, u64 *value, bool is_write); +extern struct amd_iommu *get_amd_iommu(int idx); #endif /*_PERF_EVENT_AMD_IOMMU_H_*/ diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 4d1f7f2d9aff..ad44af0dd667 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -30,6 +30,9 @@ #define COUNTER_SHIFT 16 +#undef pr_fmt +#define pr_fmt(fmt) "amd_uncore: " fmt + static int num_counters_llc; static int num_counters_nb; @@ -509,51 +512,34 @@ static int __init amd_uncore_init(void) int ret = -ENODEV; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) - goto fail_nodev; - - switch(boot_cpu_data.x86) { - case 23: - /* Family 17h: */ - num_counters_nb = NUM_COUNTERS_NB; - num_counters_llc = NUM_COUNTERS_L3; - /* - * For Family17h, the NorthBridge counters are - * re-purposed as Data Fabric counters. Also, support is - * added for L3 counters. The pmus are exported based on - * family as either L2 or L3 and NB or DF. - */ - amd_nb_pmu.name = "amd_df"; - amd_llc_pmu.name = "amd_l3"; - format_attr_event_df.show = &event_show_df; - format_attr_event_l3.show = &event_show_l3; - break; - case 22: - /* Family 16h - may change: */ - num_counters_nb = NUM_COUNTERS_NB; - num_counters_llc = NUM_COUNTERS_L2; - amd_nb_pmu.name = "amd_nb"; - amd_llc_pmu.name = "amd_l2"; - format_attr_event_df = format_attr_event; - format_attr_event_l3 = format_attr_event; - break; - default: - /* - * All prior families have the same number of - * NorthBridge and Last Level Cache counters - */ - num_counters_nb = NUM_COUNTERS_NB; - num_counters_llc = NUM_COUNTERS_L2; - amd_nb_pmu.name = "amd_nb"; - amd_llc_pmu.name = "amd_l2"; - format_attr_event_df = format_attr_event; - format_attr_event_l3 = format_attr_event; - break; - } - amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df; - amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3; + return -ENODEV; if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) - goto fail_nodev; + return -ENODEV; + + if (boot_cpu_data.x86 == 0x17) { + /* + * For F17h, the Northbridge counters are repurposed as Data + * Fabric counters. Also, L3 counters are supported too. The PMUs + * are exported based on family as either L2 or L3 and NB or DF. + */ + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L3; + amd_nb_pmu.name = "amd_df"; + amd_llc_pmu.name = "amd_l3"; + format_attr_event_df.show = &event_show_df; + format_attr_event_l3.show = &event_show_l3; + } else { + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L2; + amd_nb_pmu.name = "amd_nb"; + amd_llc_pmu.name = "amd_l2"; + format_attr_event_df = format_attr_event; + format_attr_event_l3 = format_attr_event; + } + + amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df; + amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3; if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { amd_uncore_nb = alloc_percpu(struct amd_uncore *); @@ -565,7 +551,7 @@ static int __init amd_uncore_init(void) if (ret) goto fail_nb; - pr_info("perf: AMD NB counters detected\n"); + pr_info("AMD NB counters detected\n"); ret = 0; } @@ -579,7 +565,7 @@ static int __init amd_uncore_init(void) if (ret) goto fail_llc; - pr_info("perf: AMD LLC counters detected\n"); + pr_info("AMD LLC counters detected\n"); ret = 0; } @@ -615,7 +601,6 @@ fail_nb: if (amd_uncore_nb) free_percpu(amd_uncore_nb); -fail_nodev: return ret; } device_initcall(amd_uncore_init); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 349d4d17aa7f..580b60f5ac83 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2101,8 +2101,8 @@ static int x86_pmu_event_init(struct perf_event *event) static void refresh_pce(void *ignored) { - if (current->mm) - load_mm_cr4(current->mm); + if (current->active_mm) + load_mm_cr4(current->active_mm); } static void x86_pmu_event_mapped(struct perf_event *event) @@ -2110,6 +2110,18 @@ static void x86_pmu_event_mapped(struct perf_event *event) if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return; + /* + * This function relies on not being called concurrently in two + * tasks in the same mm. Otherwise one task could observe + * perf_rdpmc_allowed > 1 and return all the way back to + * userspace with CR4.PCE clear while another task is still + * doing on_each_cpu_mask() to propagate CR4.PCE. + * + * For now, this can't happen because all callers hold mmap_sem + * for write. If this changes, we'll need a different solution. + */ + lockdep_assert_held_exclusive(¤t->mm->mmap_sem); + if (atomic_inc_return(¤t->mm->context.perf_rdpmc_allowed) == 1) on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); } @@ -2244,6 +2256,7 @@ void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { struct cyc2ns_data *data; + u64 offset; userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; @@ -2251,11 +2264,13 @@ void arch_perf_update_userpage(struct perf_event *event, !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); userpg->pmc_width = x86_pmu.cntval_bits; - if (!sched_clock_stable()) + if (!using_native_sched_clock() || !sched_clock_stable()) return; data = cyc2ns_read_begin(); + offset = data->cyc2ns_offset + __sched_clock_offset; + /* * Internal timekeeping for enabled/running/stopped times * is always in the local_clock domain. @@ -2263,7 +2278,7 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->cap_user_time = 1; userpg->time_mult = data->cyc2ns_mul; userpg->time_shift = data->cyc2ns_shift; - userpg->time_offset = data->cyc2ns_offset - now; + userpg->time_offset = offset - now; /* * cap_user_time_zero doesn't make sense when we're using a different @@ -2271,7 +2286,7 @@ void arch_perf_update_userpage(struct perf_event *event, */ if (!event->attr.use_clockid) { userpg->cap_user_time_zero = 1; - userpg->time_zero = data->cyc2ns_offset; + userpg->time_zero = offset; } cyc2ns_read_end(data); diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 982c9e31daca..8ae8c5ce3a1f 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -63,7 +63,6 @@ struct bts_buffer { unsigned int cur_buf; bool snapshot; local_t data_size; - local_t lost; local_t head; unsigned long end; void **data_pages; @@ -199,7 +198,8 @@ static void bts_update(struct bts_ctx *bts) return; if (ds->bts_index >= ds->bts_absolute_maximum) - local_inc(&buf->lost); + perf_aux_output_flag(&bts->handle, + PERF_AUX_FLAG_TRUNCATED); /* * old and head are always in the same physical buffer, so we @@ -276,7 +276,7 @@ static void bts_event_start(struct perf_event *event, int flags) return; fail_end_stop: - perf_aux_output_end(&bts->handle, 0, false); + perf_aux_output_end(&bts->handle, 0); fail_stop: event->hw.state = PERF_HES_STOPPED; @@ -319,9 +319,8 @@ static void bts_event_stop(struct perf_event *event, int flags) bts->handle.head = local_xchg(&buf->data_size, buf->nr_pages << PAGE_SHIFT); - - perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), - !!local_xchg(&buf->lost, 0)); + perf_aux_output_end(&bts->handle, + local_xchg(&buf->data_size, 0)); } cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; @@ -484,8 +483,7 @@ int intel_bts_interrupt(void) if (old_head == local_read(&buf->head)) return handled; - perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), - !!local_xchg(&buf->lost, 0)); + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0)); buf = perf_aux_output_begin(&bts->handle, event); if (buf) @@ -500,7 +498,7 @@ int intel_bts_interrupt(void) * cleared handle::event */ barrier(); - perf_aux_output_end(&bts->handle, 0, false); + perf_aux_output_end(&bts->handle, 0); } } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index eb1484c86bb4..a6d91d4e37a1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1553,6 +1553,27 @@ static __initconst const u64 slm_hw_cache_event_ids }, }; +EVENT_ATTR_STR(topdown-total-slots, td_total_slots_glm, "event=0x3c"); +EVENT_ATTR_STR(topdown-total-slots.scale, td_total_slots_scale_glm, "3"); +/* UOPS_NOT_DELIVERED.ANY */ +EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles_glm, "event=0x9c"); +/* ISSUE_SLOTS_NOT_CONSUMED.RECOVERY */ +EVENT_ATTR_STR(topdown-recovery-bubbles, td_recovery_bubbles_glm, "event=0xca,umask=0x02"); +/* UOPS_RETIRED.ANY */ +EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired_glm, "event=0xc2"); +/* UOPS_ISSUED.ANY */ +EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued_glm, "event=0x0e"); + +static struct attribute *glm_events_attrs[] = { + EVENT_PTR(td_total_slots_glm), + EVENT_PTR(td_total_slots_scale_glm), + EVENT_PTR(td_fetch_bubbles_glm), + EVENT_PTR(td_recovery_bubbles_glm), + EVENT_PTR(td_slots_issued_glm), + EVENT_PTR(td_slots_retired_glm), + NULL +}; + static struct extra_reg intel_glm_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x760005ffbfull, RSP_0), @@ -2130,7 +2151,7 @@ again: * counters from the GLOBAL_STATUS mask and we always process PEBS * events via drain_pebs(). */ - status &= ~cpuc->pebs_enabled; + status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); /* * PEBS overflow sets bit 62 in the global status register @@ -3750,6 +3771,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_prec_dist = true; x86_pmu.lbr_pt_coexist = true; x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.cpu_events = glm_events_attrs; pr_cont("Goldmont events, "); break; diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index aff4b5b69d40..238ae3248ba5 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -1,5 +1,5 @@ /* - * perf_event_intel_cstate.c: support cstate residency counters + * Support cstate residency counters * * Copyright (C) 2015, Intel Corp. * Author: Kan Liang (kan.liang@intel.com) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 9dfeeeca0ea8..c6d23ffe422d 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1222,7 +1222,7 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit) /* clear non-PEBS bit and re-check */ pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; + pebs_status &= PEBS_COUNTER_MASK; if (pebs_status == (1 << bit)) return at; } diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 81b321ace8e0..f924629836a8 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -507,6 +507,9 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].to = msr_lastbranch.to; cpuc->lbr_entries[i].mispred = 0; cpuc->lbr_entries[i].predicted = 0; + cpuc->lbr_entries[i].in_tx = 0; + cpuc->lbr_entries[i].abort = 0; + cpuc->lbr_entries[i].cycles = 0; cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 5900471ee508..ae8324d65e61 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -28,6 +28,7 @@ #include <asm/insn.h> #include <asm/io.h> #include <asm/intel_pt.h> +#include <asm/intel-family.h> #include "../perf_event.h" #include "pt.h" @@ -98,6 +99,7 @@ static struct attribute_group pt_cap_group = { .name = "caps", }; +PMU_FORMAT_ATTR(pt, "config:0" ); PMU_FORMAT_ATTR(cyc, "config:1" ); PMU_FORMAT_ATTR(pwr_evt, "config:4" ); PMU_FORMAT_ATTR(fup_on_ptw, "config:5" ); @@ -105,11 +107,13 @@ PMU_FORMAT_ATTR(mtc, "config:9" ); PMU_FORMAT_ATTR(tsc, "config:10" ); PMU_FORMAT_ATTR(noretcomp, "config:11" ); PMU_FORMAT_ATTR(ptw, "config:12" ); +PMU_FORMAT_ATTR(branch, "config:13" ); PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); PMU_FORMAT_ATTR(psb_period, "config:24-27" ); static struct attribute *pt_formats_attr[] = { + &format_attr_pt.attr, &format_attr_cyc.attr, &format_attr_pwr_evt.attr, &format_attr_fup_on_ptw.attr, @@ -117,6 +121,7 @@ static struct attribute *pt_formats_attr[] = { &format_attr_tsc.attr, &format_attr_noretcomp.attr, &format_attr_ptw.attr, + &format_attr_branch.attr, &format_attr_mtc_period.attr, &format_attr_cyc_thresh.attr, &format_attr_psb_period.attr, @@ -197,6 +202,19 @@ static int __init pt_pmu_hw_init(void) pt_pmu.tsc_art_den = eax; } + /* model-specific quirks */ + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_BROADWELL_CORE: + case INTEL_FAM6_BROADWELL_XEON_D: + case INTEL_FAM6_BROADWELL_GT3E: + case INTEL_FAM6_BROADWELL_X: + /* not setting BRANCH_EN will #GP, erratum BDM106 */ + pt_pmu.branch_en_always_on = true; + break; + default: + break; + } + if (boot_cpu_has(X86_FEATURE_VMX)) { /* * Intel SDM, 36.5 "Tracing post-VMXON" says that @@ -263,8 +281,20 @@ fail: #define RTIT_CTL_PTW (RTIT_CTL_PTW_EN | \ RTIT_CTL_FUP_ON_PTW) -#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \ +/* + * Bit 0 (TraceEn) in the attr.config is meaningless as the + * corresponding bit in the RTIT_CTL can only be controlled + * by the driver; therefore, repurpose it to mean: pass + * through the bit that was previously assumed to be always + * on for PT, thereby allowing the user to *not* set it if + * they so wish. See also pt_event_valid() and pt_config(). + */ +#define RTIT_CTL_PASSTHROUGH RTIT_CTL_TRACEEN + +#define PT_CONFIG_MASK (RTIT_CTL_TRACEEN | \ + RTIT_CTL_TSC_EN | \ RTIT_CTL_DISRETC | \ + RTIT_CTL_BRANCH_EN | \ RTIT_CTL_CYC_PSB | \ RTIT_CTL_MTC | \ RTIT_CTL_PWR_EVT_EN | \ @@ -332,6 +362,33 @@ static bool pt_event_valid(struct perf_event *event) return false; } + /* + * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config + * clears the assomption that BranchEn must always be enabled, + * as was the case with the first implementation of PT. + * If this bit is not set, the legacy behavior is preserved + * for compatibility with the older userspace. + * + * Re-using bit 0 for this purpose is fine because it is never + * directly set by the user; previous attempts at setting it in + * the attr.config resulted in -EINVAL. + */ + if (config & RTIT_CTL_PASSTHROUGH) { + /* + * Disallow not setting BRANCH_EN where BRANCH_EN is + * always required. + */ + if (pt_pmu.branch_en_always_on && + !(config & RTIT_CTL_BRANCH_EN)) + return false; + } else { + /* + * Disallow BRANCH_EN without the PASSTHROUGH. + */ + if (config & RTIT_CTL_BRANCH_EN) + return false; + } + return true; } @@ -411,6 +468,7 @@ static u64 pt_config_filters(struct perf_event *event) static void pt_config(struct perf_event *event) { + struct pt *pt = this_cpu_ptr(&pt_ctx); u64 reg; if (!event->hw.itrace_started) { @@ -419,7 +477,20 @@ static void pt_config(struct perf_event *event) } reg = pt_config_filters(event); - reg |= RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; + reg |= RTIT_CTL_TOPA | RTIT_CTL_TRACEEN; + + /* + * Previously, we had BRANCH_EN on by default, but now that PT has + * grown features outside of branch tracing, it is useful to allow + * the user to disable it. Setting bit 0 in the event's attr.config + * allows BRANCH_EN to pass through instead of being always on. See + * also the comment in pt_event_valid(). + */ + if (event->attr.config & BIT(0)) { + reg |= event->attr.config & RTIT_CTL_BRANCH_EN; + } else { + reg |= RTIT_CTL_BRANCH_EN; + } if (!event->attr.exclude_kernel) reg |= RTIT_CTL_OS; @@ -429,11 +500,15 @@ static void pt_config(struct perf_event *event) reg |= (event->attr.config & PT_CONFIG_MASK); event->hw.config = reg; - wrmsrl(MSR_IA32_RTIT_CTL, reg); + if (READ_ONCE(pt->vmx_on)) + perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL); + else + wrmsrl(MSR_IA32_RTIT_CTL, reg); } static void pt_config_stop(struct perf_event *event) { + struct pt *pt = this_cpu_ptr(&pt_ctx); u64 ctl = READ_ONCE(event->hw.config); /* may be already stopped by a PMI */ @@ -441,7 +516,8 @@ static void pt_config_stop(struct perf_event *event) return; ctl &= ~RTIT_CTL_TRACEEN; - wrmsrl(MSR_IA32_RTIT_CTL, ctl); + if (!READ_ONCE(pt->vmx_on)) + wrmsrl(MSR_IA32_RTIT_CTL, ctl); WRITE_ONCE(event->hw.config, ctl); @@ -753,7 +829,8 @@ static void pt_handle_status(struct pt *pt) */ if (!pt_cap_get(PT_CAP_topa_multiple_entries) || buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { - local_inc(&buf->lost); + perf_aux_output_flag(&pt->handle, + PERF_AUX_FLAG_TRUNCATED); advance++; } } @@ -846,8 +923,10 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, /* can't stop in the middle of an output region */ if (buf->output_off + handle->size + 1 < - sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) + sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { + perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); return -EINVAL; + } /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */ @@ -1171,12 +1250,6 @@ void intel_pt_interrupt(void) if (!READ_ONCE(pt->handle_nmi)) return; - /* - * If VMX is on and PT does not support it, don't touch anything. - */ - if (READ_ONCE(pt->vmx_on)) - return; - if (!event) return; @@ -1192,8 +1265,7 @@ void intel_pt_interrupt(void) pt_update_head(pt); - perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), - local_xchg(&buf->lost, 0)); + perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0)); if (!event->hw.state) { int ret; @@ -1208,7 +1280,7 @@ void intel_pt_interrupt(void) /* snapshot counters don't use PMI, so it's safe */ ret = pt_buffer_reset_markers(buf, &pt->handle); if (ret) { - perf_aux_output_end(&pt->handle, 0, true); + perf_aux_output_end(&pt->handle, 0); return; } @@ -1237,12 +1309,19 @@ void intel_pt_handle_vmx(int on) local_irq_save(flags); WRITE_ONCE(pt->vmx_on, on); - if (on) { - /* prevent pt_config_stop() from writing RTIT_CTL */ - event = pt->handle.event; - if (event) - event->hw.config = 0; - } + /* + * If an AUX transaction is in progress, it will contain + * gap(s), so flag it PARTIAL to inform the user. + */ + event = pt->handle.event; + if (event) + perf_aux_output_flag(&pt->handle, + PERF_AUX_FLAG_PARTIAL); + + /* Turn PTs back on */ + if (!on && event) + wrmsrl(MSR_IA32_RTIT_CTL, event->hw.config); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(intel_pt_handle_vmx); @@ -1257,9 +1336,6 @@ static void pt_event_start(struct perf_event *event, int mode) struct pt *pt = this_cpu_ptr(&pt_ctx); struct pt_buffer *buf; - if (READ_ONCE(pt->vmx_on)) - return; - buf = perf_aux_output_begin(&pt->handle, event); if (!buf) goto fail_stop; @@ -1280,7 +1356,7 @@ static void pt_event_start(struct perf_event *event, int mode) return; fail_end_stop: - perf_aux_output_end(&pt->handle, 0, true); + perf_aux_output_end(&pt->handle, 0); fail_stop: hwc->state = PERF_HES_STOPPED; } @@ -1321,8 +1397,7 @@ static void pt_event_stop(struct perf_event *event, int mode) pt->handle.head = local_xchg(&buf->data_size, buf->nr_pages << PAGE_SHIFT); - perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), - local_xchg(&buf->lost, 0)); + perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0)); } } diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 53473c21b554..0eb41d07b79a 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -110,6 +110,7 @@ struct pt_pmu { struct pmu pmu; u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; bool vmx; + bool branch_en_always_on; unsigned long max_nonturbo_ratio; unsigned int tsc_art_num; unsigned int tsc_art_den; @@ -143,7 +144,6 @@ struct pt_buffer { size_t output_off; unsigned long nr_pages; local_t data_size; - local_t lost; local64_t head; bool snapshot; unsigned long stop_pos, intr_pos; diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 22054ca49026..9d05c7e67f60 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -1,5 +1,5 @@ /* - * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters + * Support Intel RAPL energy consumption counters * Copyright (C) 2013 Google, Inc., Stephane Eranian * * Intel RAPL interface is specified in the IA-32 Manual Vol3b diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index ad986c1e29bc..df5989f27b1b 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -360,7 +360,7 @@ extern struct list_head pci2phy_map_head; extern struct pci_extra_dev *uncore_extra_pci_dev; extern struct event_constraint uncore_constraint_empty; -/* perf_event_intel_uncore_snb.c */ +/* uncore_snb.c */ int snb_uncore_pci_init(void); int ivb_uncore_pci_init(void); int hsw_uncore_pci_init(void); @@ -371,7 +371,7 @@ void nhm_uncore_cpu_init(void); void skl_uncore_cpu_init(void); int snb_pci2phy_map_init(int devid); -/* perf_event_intel_uncore_snbep.c */ +/* uncore_snbep.c */ int snbep_uncore_pci_init(void); void snbep_uncore_cpu_init(void); int ivbep_uncore_pci_init(void); @@ -385,5 +385,5 @@ void knl_uncore_cpu_init(void); int skx_uncore_pci_init(void); void skx_uncore_cpu_init(void); -/* perf_event_intel_uncore_nhmex.c */ +/* uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index bcbb1d2ae10b..be3d36254040 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -79,6 +79,7 @@ struct amd_nb { /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS 8 +#define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1) /* * Flags PEBS can handle without an PMI. diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index db64baf0e500..8bef70e7f3cc 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -158,13 +158,13 @@ void hyperv_init(void) clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); return; } +register_msr_cs: #endif /* * For 32 bit guests just use the MSR based mechanism for reading * the partition counter. */ -register_msr_cs: hyperv_cs = &hyperv_cs_msr; if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 14635c5ea025..caa5798c92f4 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -186,6 +186,12 @@ static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) return cmpxchg(&v->counter, old, new); } +#define atomic_try_cmpxchg atomic_try_cmpxchg +static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) +{ + return try_cmpxchg(&v->counter, old, new); +} + static inline int atomic_xchg(atomic_t *v, int new) { return xchg(&v->counter, new); @@ -201,16 +207,12 @@ static inline void atomic_##op(int i, atomic_t *v) \ } #define ATOMIC_FETCH_OP(op, c_op) \ -static inline int atomic_fetch_##op(int i, atomic_t *v) \ +static inline int atomic_fetch_##op(int i, atomic_t *v) \ { \ - int old, val = atomic_read(v); \ - for (;;) { \ - old = atomic_cmpxchg(v, val, val c_op i); \ - if (old == val) \ - break; \ - val = old; \ - } \ - return old; \ + int val = atomic_read(v); \ + do { \ + } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \ + return val; \ } #define ATOMIC_OPS(op, c_op) \ @@ -236,16 +238,11 @@ ATOMIC_OPS(xor, ^) */ static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) { - int c, old; - c = atomic_read(v); - for (;;) { - if (unlikely(c == (u))) - break; - old = atomic_cmpxchg((v), c, c + (a)); - if (likely(old == c)) + int c = atomic_read(v); + do { + if (unlikely(c == u)) break; - c = old; - } + } while (!atomic_try_cmpxchg(v, &c, c + a)); return c; } diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 89ed2f6ae2f7..6189a433c9a9 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -176,6 +176,12 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) return cmpxchg(&v->counter, old, new); } +#define atomic64_try_cmpxchg atomic64_try_cmpxchg +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new) +{ + return try_cmpxchg(&v->counter, old, new); +} + static inline long atomic64_xchg(atomic64_t *v, long new) { return xchg(&v->counter, new); @@ -192,17 +198,12 @@ static inline long atomic64_xchg(atomic64_t *v, long new) */ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) { - long c, old; - c = atomic64_read(v); - for (;;) { - if (unlikely(c == (u))) - break; - old = atomic64_cmpxchg((v), c, c + (a)); - if (likely(old == c)) - break; - c = old; - } - return c != (u); + long c = atomic64_read(v); + do { + if (unlikely(c == u)) + return false; + } while (!atomic64_try_cmpxchg(v, &c, c + a)); + return true; } #define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) @@ -216,17 +217,12 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) */ static inline long atomic64_dec_if_positive(atomic64_t *v) { - long c, old, dec; - c = atomic64_read(v); - for (;;) { + long dec, c = atomic64_read(v); + do { dec = c - 1; if (unlikely(dec < 0)) break; - old = atomic64_cmpxchg((v), c, dec); - if (likely(old == c)) - break; - c = old; - } + } while (!atomic64_try_cmpxchg(v, &c, dec)); return dec; } @@ -242,14 +238,10 @@ static inline void atomic64_##op(long i, atomic64_t *v) \ #define ATOMIC64_FETCH_OP(op, c_op) \ static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ { \ - long old, val = atomic64_read(v); \ - for (;;) { \ - old = atomic64_cmpxchg(v, val, val c_op i); \ - if (old == val) \ - break; \ - val = old; \ - } \ - return old; \ + long val = atomic64_read(v); \ + do { \ + } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \ + return val; \ } #define ATOMIC64_OPS(op, c_op) \ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 97848cdfcb1a..d90296d061e8 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -153,6 +153,76 @@ extern void __add_wrong_size(void) #define cmpxchg_local(ptr, old, new) \ __cmpxchg_local(ptr, old, new, sizeof(*(ptr))) + +#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \ +({ \ + bool success; \ + __typeof__(_ptr) _old = (_pold); \ + __typeof__(*(_ptr)) __old = *_old; \ + __typeof__(*(_ptr)) __new = (_new); \ + switch (size) { \ + case __X86_CASE_B: \ + { \ + volatile u8 *__ptr = (volatile u8 *)(_ptr); \ + asm volatile(lock "cmpxchgb %[new], %[ptr]" \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*__ptr), \ + [old] "+a" (__old) \ + : [new] "q" (__new) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_W: \ + { \ + volatile u16 *__ptr = (volatile u16 *)(_ptr); \ + asm volatile(lock "cmpxchgw %[new], %[ptr]" \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*__ptr), \ + [old] "+a" (__old) \ + : [new] "r" (__new) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_L: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(_ptr); \ + asm volatile(lock "cmpxchgl %[new], %[ptr]" \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*__ptr), \ + [old] "+a" (__old) \ + : [new] "r" (__new) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_Q: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(_ptr); \ + asm volatile(lock "cmpxchgq %[new], %[ptr]" \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*__ptr), \ + [old] "+a" (__old) \ + : [new] "r" (__new) \ + : "memory"); \ + break; \ + } \ + default: \ + __cmpxchg_wrong_size(); \ + } \ + if (unlikely(!success)) \ + *_old = __old; \ + likely(success); \ +}) + +#define __try_cmpxchg(ptr, pold, new, size) \ + __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) + +#define try_cmpxchg(ptr, pold, new) \ + __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) + /* * xadd() adds "inc" to "*ptr" and atomically returns the previous * value of "*ptr". diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4e7772387c6e..0fe00446f9ca 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -187,6 +187,7 @@ * Reuse free bits when adding new feature flags! */ #define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ +#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ @@ -289,7 +290,8 @@ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ -#define X86_FEATURE_RDPID (16*32+ 22) /* RDPID instruction */ +#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ +#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 9d49c18b5ea9..3762536619f8 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -287,7 +287,7 @@ struct task_struct; #define ARCH_DLINFO_IA32 \ do { \ - if (vdso32_enabled) { \ + if (VDSO_CURRENT_BASE) { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } \ diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 200581691c6e..34b984c60790 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -72,14 +72,13 @@ struct arch_specific_insn { /* copy of the original instruction */ kprobe_opcode_t *insn; /* - * boostable = -1: This instruction type is not boostable. - * boostable = 0: This instruction type is boostable. - * boostable = 1: This instruction has been boosted: we have + * boostable = false: This instruction type is not boostable. + * boostable = true: This instruction has been boosted: we have * added a relative jump after the instruction copy in insn, * so no single-step and fixup are needed (unless there's * a post_handler or break_handler). */ - int boostable; + bool boostable; bool if_modifier; }; diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index d74747b031ec..c4eda791f877 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -46,6 +46,7 @@ struct kvm_page_track_notifier_node { }; void kvm_page_track_init(struct kvm *kvm); +void kvm_page_track_cleanup(struct kvm *kvm); void kvm_page_track_free_memslot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont); diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index e63873683d4a..4fd5195deed0 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -128,7 +128,7 @@ * debugging tools. Each entry is only valid when its finished flag * is set. */ -struct mce_log { +struct mce_log_buffer { char signature[12]; /* "MACHINECHECK" */ unsigned len; /* = MCE_LOG_LEN */ unsigned next; @@ -191,10 +191,12 @@ extern struct mca_config mca_cfg; extern struct mca_msr_regs msr_ops; enum mce_notifier_prios { - MCE_PRIO_SRAO = INT_MAX, - MCE_PRIO_EXTLOG = INT_MAX - 1, - MCE_PRIO_NFIT = INT_MAX - 2, - MCE_PRIO_EDAC = INT_MAX - 3, + MCE_PRIO_FIRST = INT_MAX, + MCE_PRIO_SRAO = INT_MAX - 1, + MCE_PRIO_EXTLOG = INT_MAX - 2, + MCE_PRIO_NFIT = INT_MAX - 3, + MCE_PRIO_EDAC = INT_MAX - 4, + MCE_PRIO_MCELOG = 1, MCE_PRIO_LOWEST = 0, }; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d8b5f8ab8ef9..673f9ac50f6d 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -45,6 +45,8 @@ #define MSR_IA32_PERFCTR1 0x000000c2 #define MSR_FSB_FREQ 0x000000cd #define MSR_PLATFORM_INFO 0x000000ce +#define MSR_PLATFORM_INFO_CPUID_FAULT_BIT 31 +#define MSR_PLATFORM_INFO_CPUID_FAULT BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT) #define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) @@ -127,6 +129,7 @@ /* DEBUGCTLMSR bits (others vary by model): */ #define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ +#define DEBUGCTLMSR_BTF_SHIFT 1 #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ #define DEBUGCTLMSR_TR (1UL << 6) #define DEBUGCTLMSR_BTS (1UL << 7) @@ -552,10 +555,12 @@ #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT 39 #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT) -/* MISC_FEATURE_ENABLES non-architectural features */ -#define MSR_MISC_FEATURE_ENABLES 0x00000140 +/* MISC_FEATURES_ENABLES non-architectural features */ +#define MSR_MISC_FEATURES_ENABLES 0x00000140 -#define MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT 1 +#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT 0 +#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT BIT_ULL(MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT) +#define MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT 1 #define MSR_IA32_TSC_DEADLINE 0x000006E0 diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 72277b1028a5..50d35e3185f5 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -121,12 +121,9 @@ static inline void native_pmd_clear(pmd_t *pmd) *(tmp + 1) = 0; } -#if !defined(CONFIG_SMP) || (defined(CONFIG_HIGHMEM64G) && \ - defined(CONFIG_PARAVIRT)) static inline void native_pud_clear(pud_t *pudp) { } -#endif static inline void pud_clear(pud_t *pudp) { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index eef5fbc7c674..2197e5322df9 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -60,7 +60,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); # define set_pud(pudp, pud) native_set_pud(pudp, pud) #endif -#ifndef __PAGETABLE_PMD_FOLDED +#ifndef __PAGETABLE_PUD_FOLDED #define pud_clear(pud) native_pud_clear(pud) #endif diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 8b4de22d6429..62484333673d 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -273,6 +273,8 @@ static inline pgdval_t pgd_flags(pgd_t pgd) } #if CONFIG_PGTABLE_LEVELS > 3 +#include <asm-generic/5level-fixup.h> + typedef struct { pudval_t pud; } pud_t; static inline pud_t native_make_pud(pmdval_t val) @@ -285,6 +287,7 @@ static inline pudval_t native_pud_val(pud_t pud) return pud.pud; } #else +#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopud.h> static inline pudval_t native_pud_val(pud_t pud) @@ -306,6 +309,7 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) return pmd.pmd; } #else +#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopmd.h> static inline pmdval_t native_pmd_val(pmd_t pmd) diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 34684adb6899..b3b09b98896d 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -46,6 +46,15 @@ extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) { + /* + * "Allocated" pkeys are those that have been returned + * from pkey_alloc(). pkey 0 is special, and never + * returned from pkey_alloc(). + */ + if (pkey <= 0) + return false; + if (pkey >= arch_max_pkey()) + return false; return mm_pkey_allocation_map(mm) & (1U << pkey); } @@ -82,12 +91,6 @@ int mm_pkey_alloc(struct mm_struct *mm) static inline int mm_pkey_free(struct mm_struct *mm, int pkey) { - /* - * pkey 0 is special, always allocated and can never - * be freed. - */ - if (!pkey) - return -EINVAL; if (!mm_pkey_is_allocated(mm, pkey)) return -EINVAL; diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index 2c1ebeb4d737..529bb4a6487a 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -55,7 +55,8 @@ static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n) * @size: number of bytes to write back * * Write back a cache range using the CLWB (cache line write back) - * instruction. + * instruction. Note that @size is internally rounded up to be cache + * line size aligned. */ static inline void arch_wb_cache_pmem(void *addr, size_t size) { @@ -69,15 +70,6 @@ static inline void arch_wb_cache_pmem(void *addr, size_t size) clwb(p); } -/* - * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec - * iterators, so for other types (bvec & kvec) we must do a cache write-back. - */ -static inline bool __iter_needs_pmem_wb(struct iov_iter *i) -{ - return iter_is_iovec(i) == false; -} - /** * arch_copy_from_iter_pmem - copy data from an iterator to PMEM * @addr: PMEM destination address @@ -94,7 +86,35 @@ static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes, /* TODO: skip the write-back by always using non-temporal stores */ len = copy_from_iter_nocache(addr, bytes, i); - if (__iter_needs_pmem_wb(i)) + /* + * In the iovec case on x86_64 copy_from_iter_nocache() uses + * non-temporal stores for the bulk of the transfer, but we need + * to manually flush if the transfer is unaligned. A cached + * memory copy is used when destination or size is not naturally + * aligned. That is: + * - Require 8-byte alignment when size is 8 bytes or larger. + * - Require 4-byte alignment when size is 4 bytes. + * + * In the non-iovec case the entire destination needs to be + * flushed. + */ + if (iter_is_iovec(i)) { + unsigned long flushed, dest = (unsigned long) addr; + + if (bytes < 8) { + if (!IS_ALIGNED(dest, 4) || (bytes != 4)) + arch_wb_cache_pmem(addr, 1); + } else { + if (!IS_ALIGNED(dest, 8)) { + dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); + arch_wb_cache_pmem(addr, 1); + } + + flushed = dest - (unsigned long) addr; + if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8)) + arch_wb_cache_pmem(addr + bytes - 1, 1); + } + } else arch_wb_cache_pmem(addr, bytes); return len; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f385eca5407a..a80c1b3997ed 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -884,6 +884,8 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); +DECLARE_PER_CPU(u64, msr_misc_features_shadow); + /* Register/unregister a process' MPX related resource */ #define MPX_ENABLE_MANAGEMENT() mpx_enable_management() #define MPX_DISABLE_MANAGEMENT() mpx_disable_management() diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 9b9b30b19441..8d3964fc5f91 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -9,6 +9,7 @@ void syscall_init(void); #ifdef CONFIG_X86_64 void entry_SYSCALL_64(void); +long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2); #endif #ifdef CONFIG_X86_32 @@ -30,6 +31,7 @@ void x86_report_nx(void); extern int reboot_force; -long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); +long do_arch_prctl_common(struct task_struct *task, int option, + unsigned long cpuid_enabled); #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/include/asm/purgatory.h b/arch/x86/include/asm/purgatory.h new file mode 100644 index 000000000000..d7da2729903d --- /dev/null +++ b/arch/x86/include/asm/purgatory.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_PURGATORY_H +#define _ASM_X86_PURGATORY_H + +#ifndef __ASSEMBLY__ +#include <linux/purgatory.h> + +extern void purgatory(void); +/* + * These forward declarations serve two purposes: + * + * 1) Make sparse happy when checking arch/purgatory + * 2) Document that these are required to be global so the symbol + * lookup in kexec works + */ +extern unsigned long purgatory_backup_dest; +extern unsigned long purgatory_backup_src; +extern unsigned long purgatory_backup_sz; +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_PURGATORY_H */ diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 2cb1cc253d51..fc62ba8dce93 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -15,6 +15,7 @@ struct machine_ops { }; extern struct machine_ops machine_ops; +extern int crashing_cpu; void native_machine_crash_shutdown(struct pt_regs *regs); void native_machine_shutdown(void); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ad6f5eb07a95..9fc44b95f7cb 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -87,6 +87,7 @@ struct thread_info { #define TIF_SECCOMP 8 /* secure computing */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ +#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ #define TIF_NOHZ 19 /* in adaptive nohz mode */ @@ -110,6 +111,7 @@ struct thread_info { #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) +#define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_NOHZ (1 << TIF_NOHZ) @@ -138,7 +140,7 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) + (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP) #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) @@ -239,6 +241,8 @@ static inline int arch_within_stack_frames(const void * const stack, extern void arch_task_cache_init(void); extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); extern void arch_release_task_struct(struct task_struct *tsk); +extern void arch_setup_new_exec(void); +#define arch_setup_new_exec arch_setup_new_exec #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_THREAD_INFO_H */ diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index a04eabd43d06..27e9f9d769b8 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -12,6 +12,8 @@ extern int recalibrate_cpu_khz(void); extern int no_timer_check; +extern bool using_native_sched_clock(void); + /* * We use the full linear equation: f(x) = a + b*x, in order to allow * a continuous function in the face of dynamic freq changes. diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6fa85944af83..75d002bdb3f3 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -110,6 +110,16 @@ static inline void cr4_clear_bits(unsigned long mask) } } +static inline void cr4_toggle_bits(unsigned long mask) +{ + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); + cr4 ^= mask; + this_cpu_write(cpu_tlbstate.cr4, cr4); + __write_cr4(cr4); +} + /* Read the CR4 shadow. */ static inline unsigned long cr4_read_shadow(void) { @@ -188,7 +198,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) static inline void __flush_tlb_all(void) { - if (static_cpu_has(X86_FEATURE_PGE)) + if (boot_cpu_has(X86_FEATURE_PGE)) __flush_tlb_global(); else __flush_tlb(); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index ea148313570f..68766b276d9e 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -3,19 +3,14 @@ /* * User space memory access functions */ -#include <linux/errno.h> #include <linux/compiler.h> #include <linux/kasan-checks.h> -#include <linux/thread_info.h> #include <linux/string.h> #include <asm/asm.h> #include <asm/page.h> #include <asm/smap.h> #include <asm/extable.h> -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /* * The fs value determines whether argument validity checking should be * performed or not. If get_fs() == USER_DS, checking is performed, with @@ -384,6 +379,18 @@ do { \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) +#define __get_user_asm_nozero(x, addr, err, itype, rtype, ltype, errret) \ + asm volatile("\n" \ + "1: mov"itype" %2,%"rtype"1\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: mov %3,%0\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : "=r" (err), ltype(x) \ + : "m" (__m(addr)), "i" (errret), "0" (err)) + /* * This doesn't do __uaccess_begin/end - the exception handling * around it must do that. @@ -675,59 +682,6 @@ extern struct movsl_mask { # include <asm/uaccess_64.h> #endif -unsigned long __must_check _copy_from_user(void *to, const void __user *from, - unsigned n); -unsigned long __must_check _copy_to_user(void __user *to, const void *from, - unsigned n); - -extern void __compiletime_error("usercopy buffer size is too small") -__bad_copy_user(void); - -static inline void copy_user_overflow(int size, unsigned long count) -{ - WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); -} - -static __always_inline unsigned long __must_check -copy_from_user(void *to, const void __user *from, unsigned long n) -{ - int sz = __compiletime_object_size(to); - - might_fault(); - - kasan_check_write(to, n); - - if (likely(sz < 0 || sz >= n)) { - check_object_size(to, n, false); - n = _copy_from_user(to, from, n); - } else if (!__builtin_constant_p(n)) - copy_user_overflow(sz, n); - else - __bad_copy_user(); - - return n; -} - -static __always_inline unsigned long __must_check -copy_to_user(void __user *to, const void *from, unsigned long n) -{ - int sz = __compiletime_object_size(from); - - kasan_check_read(from, n); - - might_fault(); - - if (likely(sz < 0 || sz >= n)) { - check_object_size(from, n, true); - n = _copy_to_user(to, from, n); - } else if (!__builtin_constant_p(n)) - copy_user_overflow(sz, n); - else - __bad_copy_user(); - - return n; -} - /* * We rely on the nested NMI work to allow atomic faults from the NMI path; the * nested NMI paths are careful to preserve CR2. diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 7d3bdd1ed697..aeda9bb8af50 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -4,149 +4,52 @@ /* * User space memory access functions */ -#include <linux/errno.h> -#include <linux/thread_info.h> #include <linux/string.h> #include <asm/asm.h> #include <asm/page.h> -unsigned long __must_check __copy_to_user_ll - (void __user *to, const void *from, unsigned long n); -unsigned long __must_check __copy_from_user_ll - (void *to, const void __user *from, unsigned long n); -unsigned long __must_check __copy_from_user_ll_nozero - (void *to, const void __user *from, unsigned long n); -unsigned long __must_check __copy_from_user_ll_nocache - (void *to, const void __user *from, unsigned long n); +unsigned long __must_check __copy_user_ll + (void *to, const void *from, unsigned long n); unsigned long __must_check __copy_from_user_ll_nocache_nozero (void *to, const void __user *from, unsigned long n); -/** - * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking. - * @to: Destination address, in user space. - * @from: Source address, in kernel space. - * @n: Number of bytes to copy. - * - * Context: User context only. - * - * Copy data from kernel space to user space. Caller must check - * the specified block with access_ok() before calling this function. - * The caller should also make sure he pins the user space address - * so that we don't result in page fault and sleep. - */ -static __always_inline unsigned long __must_check -__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) -{ - check_object_size(from, n, true); - return __copy_to_user_ll(to, from, n); -} - -/** - * __copy_to_user: - Copy a block of data into user space, with less checking. - * @to: Destination address, in user space. - * @from: Source address, in kernel space. - * @n: Number of bytes to copy. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Copy data from kernel space to user space. Caller must check - * the specified block with access_ok() before calling this function. - * - * Returns number of bytes that could not be copied. - * On success, this will be zero. - */ static __always_inline unsigned long __must_check -__copy_to_user(void __user *to, const void *from, unsigned long n) +raw_copy_to_user(void __user *to, const void *from, unsigned long n) { - might_fault(); - return __copy_to_user_inatomic(to, from, n); + return __copy_user_ll((__force void *)to, from, n); } static __always_inline unsigned long -__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) -{ - return __copy_from_user_ll_nozero(to, from, n); -} - -/** - * __copy_from_user: - Copy a block of data from user space, with less checking. - * @to: Destination address, in kernel space. - * @from: Source address, in user space. - * @n: Number of bytes to copy. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Copy data from user space to kernel space. Caller must check - * the specified block with access_ok() before calling this function. - * - * Returns number of bytes that could not be copied. - * On success, this will be zero. - * - * If some data could not be copied, this function will pad the copied - * data to the requested size using zero bytes. - * - * An alternate version - __copy_from_user_inatomic() - may be called from - * atomic context and will fail rather than sleep. In this case the - * uncopied bytes will *NOT* be padded with zeros. See fs/filemap.h - * for explanation of why this is needed. - */ -static __always_inline unsigned long -__copy_from_user(void *to, const void __user *from, unsigned long n) -{ - might_fault(); - check_object_size(to, n, false); - if (__builtin_constant_p(n)) { - unsigned long ret; - - switch (n) { - case 1: - __uaccess_begin(); - __get_user_size(*(u8 *)to, from, 1, ret, 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin(); - __get_user_size(*(u16 *)to, from, 2, ret, 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin(); - __get_user_size(*(u32 *)to, from, 4, ret, 4); - __uaccess_end(); - return ret; - } - } - return __copy_from_user_ll(to, from, n); -} - -static __always_inline unsigned long __copy_from_user_nocache(void *to, - const void __user *from, unsigned long n) +raw_copy_from_user(void *to, const void __user *from, unsigned long n) { - might_fault(); if (__builtin_constant_p(n)) { unsigned long ret; switch (n) { case 1: + ret = 0; __uaccess_begin(); - __get_user_size(*(u8 *)to, from, 1, ret, 1); + __get_user_asm_nozero(*(u8 *)to, from, ret, + "b", "b", "=q", 1); __uaccess_end(); return ret; case 2: + ret = 0; __uaccess_begin(); - __get_user_size(*(u16 *)to, from, 2, ret, 2); + __get_user_asm_nozero(*(u16 *)to, from, ret, + "w", "w", "=r", 2); __uaccess_end(); return ret; case 4: + ret = 0; __uaccess_begin(); - __get_user_size(*(u32 *)to, from, 4, ret, 4); + __get_user_asm_nozero(*(u32 *)to, from, ret, + "l", "k", "=r", 4); __uaccess_end(); return ret; } } - return __copy_from_user_ll_nocache(to, from, n); + return __copy_user_ll(to, (__force const void *)from, n); } static __always_inline unsigned long diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 673059a109fe..c5504b9a472e 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -5,7 +5,6 @@ * User space memory access functions */ #include <linux/compiler.h> -#include <linux/errno.h> #include <linux/lockdep.h> #include <linux/kasan-checks.h> #include <asm/alternative.h> @@ -46,58 +45,54 @@ copy_user_generic(void *to, const void *from, unsigned len) return ret; } -__must_check unsigned long -copy_in_user(void __user *to, const void __user *from, unsigned len); - -static __always_inline __must_check -int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) +static __always_inline __must_check unsigned long +raw_copy_from_user(void *dst, const void __user *src, unsigned long size) { int ret = 0; - check_object_size(dst, size, false); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { case 1: __uaccess_begin(); - __get_user_asm(*(u8 *)dst, (u8 __user *)src, + __get_user_asm_nozero(*(u8 *)dst, (u8 __user *)src, ret, "b", "b", "=q", 1); __uaccess_end(); return ret; case 2: __uaccess_begin(); - __get_user_asm(*(u16 *)dst, (u16 __user *)src, + __get_user_asm_nozero(*(u16 *)dst, (u16 __user *)src, ret, "w", "w", "=r", 2); __uaccess_end(); return ret; case 4: __uaccess_begin(); - __get_user_asm(*(u32 *)dst, (u32 __user *)src, + __get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src, ret, "l", "k", "=r", 4); __uaccess_end(); return ret; case 8: __uaccess_begin(); - __get_user_asm(*(u64 *)dst, (u64 __user *)src, + __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 8); __uaccess_end(); return ret; case 10: __uaccess_begin(); - __get_user_asm(*(u64 *)dst, (u64 __user *)src, + __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 10); if (likely(!ret)) - __get_user_asm(*(u16 *)(8 + (char *)dst), + __get_user_asm_nozero(*(u16 *)(8 + (char *)dst), (u16 __user *)(8 + (char __user *)src), ret, "w", "w", "=r", 2); __uaccess_end(); return ret; case 16: __uaccess_begin(); - __get_user_asm(*(u64 *)dst, (u64 __user *)src, + __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 16); if (likely(!ret)) - __get_user_asm(*(u64 *)(8 + (char *)dst), + __get_user_asm_nozero(*(u64 *)(8 + (char *)dst), (u64 __user *)(8 + (char __user *)src), ret, "q", "", "=r", 8); __uaccess_end(); @@ -107,20 +102,11 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) } } -static __always_inline __must_check -int __copy_from_user(void *dst, const void __user *src, unsigned size) -{ - might_fault(); - kasan_check_write(dst, size); - return __copy_from_user_nocheck(dst, src, size); -} - -static __always_inline __must_check -int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) +static __always_inline __must_check unsigned long +raw_copy_to_user(void __user *dst, const void *src, unsigned long size) { int ret = 0; - check_object_size(src, size, true); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { @@ -176,100 +162,16 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) } static __always_inline __must_check -int __copy_to_user(void __user *dst, const void *src, unsigned size) -{ - might_fault(); - kasan_check_read(src, size); - return __copy_to_user_nocheck(dst, src, size); -} - -static __always_inline __must_check -int __copy_in_user(void __user *dst, const void __user *src, unsigned size) -{ - int ret = 0; - - might_fault(); - if (!__builtin_constant_p(size)) - return copy_user_generic((__force void *)dst, - (__force void *)src, size); - switch (size) { - case 1: { - u8 tmp; - __uaccess_begin(); - __get_user_asm(tmp, (u8 __user *)src, - ret, "b", "b", "=q", 1); - if (likely(!ret)) - __put_user_asm(tmp, (u8 __user *)dst, - ret, "b", "b", "iq", 1); - __uaccess_end(); - return ret; - } - case 2: { - u16 tmp; - __uaccess_begin(); - __get_user_asm(tmp, (u16 __user *)src, - ret, "w", "w", "=r", 2); - if (likely(!ret)) - __put_user_asm(tmp, (u16 __user *)dst, - ret, "w", "w", "ir", 2); - __uaccess_end(); - return ret; - } - - case 4: { - u32 tmp; - __uaccess_begin(); - __get_user_asm(tmp, (u32 __user *)src, - ret, "l", "k", "=r", 4); - if (likely(!ret)) - __put_user_asm(tmp, (u32 __user *)dst, - ret, "l", "k", "ir", 4); - __uaccess_end(); - return ret; - } - case 8: { - u64 tmp; - __uaccess_begin(); - __get_user_asm(tmp, (u64 __user *)src, - ret, "q", "", "=r", 8); - if (likely(!ret)) - __put_user_asm(tmp, (u64 __user *)dst, - ret, "q", "", "er", 8); - __uaccess_end(); - return ret; - } - default: - return copy_user_generic((__force void *)dst, - (__force void *)src, size); - } -} - -static __must_check __always_inline int -__copy_from_user_inatomic(void *dst, const void __user *src, unsigned size) -{ - kasan_check_write(dst, size); - return __copy_from_user_nocheck(dst, src, size); -} - -static __must_check __always_inline int -__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) +unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size) { - kasan_check_read(src, size); - return __copy_to_user_nocheck(dst, src, size); + return copy_user_generic((__force void *)dst, + (__force void *)src, size); } extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest); static inline int -__copy_from_user_nocache(void *dst, const void __user *src, unsigned size) -{ - might_fault(); - kasan_check_write(dst, size); - return __copy_user_nocache(dst, src, size, 1); -} - -static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) { diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 72e8300b1e8a..9cffb44a3cf5 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -485,15 +485,17 @@ static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) if (paddr < uv_hub_info->lowmem_remap_top) paddr |= uv_hub_info->lowmem_remap_base; - paddr |= uv_hub_info->gnode_upper; - if (m_val) + + if (m_val) { + paddr |= uv_hub_info->gnode_upper; paddr = ((paddr << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | ((paddr >> uv_hub_info->m_val) << uv_hub_info->n_lshift); - else + } else { paddr |= uv_soc_phys_ram_to_nasid(paddr) << uv_hub_info->gpa_shift; + } return paddr; } diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 41af92e473cf..ddef37b16af2 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -57,7 +57,7 @@ struct setup_header { __u32 header; __u16 version; __u32 realmode_swtch; - __u16 start_sys; + __u16 start_sys_seg; __u16 kernel_version; __u8 type_of_loader; __u8 loadflags; diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 835aa51c7f6e..c45765517092 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -1,10 +1,13 @@ #ifndef _ASM_X86_PRCTL_H #define _ASM_X86_PRCTL_H -#define ARCH_SET_GS 0x1001 -#define ARCH_SET_FS 0x1002 -#define ARCH_GET_FS 0x1003 -#define ARCH_GET_GS 0x1004 +#define ARCH_SET_GS 0x1001 +#define ARCH_SET_FS 0x1002 +#define ARCH_GET_FS 0x1003 +#define ARCH_GET_GS 0x1004 + +#define ARCH_GET_CPUID 0x1011 +#define ARCH_SET_CPUID 0x1012 #define ARCH_MAP_VDSO_X32 0x2001 #define ARCH_MAP_VDSO_32 0x2002 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a468f0fdc907..6bb680671088 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -180,10 +180,15 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) return -EINVAL; } + if (!enabled) { + ++disabled_cpus; + return -EINVAL; + } + if (boot_cpu_physical_apicid != -1U) ver = boot_cpu_apic_version; - cpu = __generic_processor_info(id, ver, enabled); + cpu = generic_processor_info(id, ver); if (cpu >= 0) early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; @@ -711,7 +716,7 @@ static void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include <acpi/processor.h> -int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; @@ -1560,12 +1565,6 @@ int __init early_acpi_boot_init(void) return 0; } -static int __init acpi_parse_bgrt(struct acpi_table_header *table) -{ - efi_bgrt_init(table); - return 0; -} - int __init acpi_boot_init(void) { /* those are executed after early-quirks are executed */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4eca103a1e01..847650b14558 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -731,8 +731,10 @@ static int __init calibrate_APIC_clock(void) TICK_NSEC, lapic_clockevent.shift); lapic_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.max_delta_ticks = 0x7FFFFF; lapic_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.min_delta_ticks = 0xF; lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; return 0; } @@ -778,8 +780,10 @@ static int __init calibrate_APIC_clock(void) lapic_clockevent.shift); lapic_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); + lapic_clockevent.max_delta_ticks = 0x7FFFFFFF; lapic_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.min_delta_ticks = 0xF; lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; @@ -1610,24 +1614,15 @@ static inline void try_to_enable_x2apic(int remap_mode) { } static inline void __x2apic_enable(void) { } #endif /* !CONFIG_X86_X2APIC */ -static int __init try_to_enable_IR(void) -{ -#ifdef CONFIG_X86_IO_APIC - if (!x2apic_enabled() && skip_ioapic_setup) { - pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n"); - return -1; - } -#endif - return irq_remapping_enable(); -} - void __init enable_IR_x2apic(void) { unsigned long flags; int ret, ir_stat; - if (skip_ioapic_setup) + if (skip_ioapic_setup) { + pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n"); return; + } ir_stat = irq_remapping_prepare(); if (ir_stat < 0 && !x2apic_supported()) @@ -1645,7 +1640,7 @@ void __init enable_IR_x2apic(void) /* If irq_remapping_prepare() succeeded, try to enable it */ if (ir_stat >= 0) - ir_stat = try_to_enable_IR(); + ir_stat = irq_remapping_enable(); /* ir_stat contains the remap mode or an error code */ try_to_enable_x2apic(ir_stat); @@ -2062,17 +2057,17 @@ static int allocate_logical_cpuid(int apicid) /* Allocate a new cpuid. */ if (nr_logical_cpuids >= nr_cpu_ids) { - WARN_ONCE(1, "Only %d processors supported." + WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %i reached. " "Processor %d/0x%x and the rest are ignored.\n", - nr_cpu_ids - 1, nr_logical_cpuids, apicid); - return -1; + nr_cpu_ids, nr_logical_cpuids, apicid); + return -EINVAL; } cpuid_to_apicid[nr_logical_cpuids] = apicid; return nr_logical_cpuids++; } -int __generic_processor_info(int apicid, int version, bool enabled) +int generic_processor_info(int apicid, int version) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, @@ -2130,11 +2125,9 @@ int __generic_processor_info(int apicid, int version, bool enabled) if (num_processors >= nr_cpu_ids) { int thiscpu = max + disabled_cpus; - if (enabled) { - pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " - "reached. Processor %d/0x%x ignored.\n", - max, thiscpu, apicid); - } + pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " + "reached. Processor %d/0x%x ignored.\n", + max, thiscpu, apicid); disabled_cpus++; return -EINVAL; @@ -2186,23 +2179,13 @@ int __generic_processor_info(int apicid, int version, bool enabled) apic->x86_32_early_logical_apicid(cpu); #endif set_cpu_possible(cpu, true); - - if (enabled) { - num_processors++; - physid_set(apicid, phys_cpu_present_map); - set_cpu_present(cpu, true); - } else { - disabled_cpus++; - } + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + num_processors++; return cpu; } -int generic_processor_info(int apicid, int version) -{ - return __generic_processor_info(apicid, version, true); -} - int hard_smp_processor_id(void) { return read_apic_id(); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 7f179f1c467c..b487b3a01615 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -1106,7 +1106,8 @@ void __init uv_init_hub_info(struct uv_hub_info_s *hi) node_id.v = uv_read_local_mmr(UVH_NODE_ID); uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val); hi->gnode_extra = (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; - hi->gnode_upper = (unsigned long)hi->gnode_extra << mn.m_val; + if (mn.m_val) + hi->gnode_upper = (u64)hi->gnode_extra << mn.m_val; if (uv_gp_table) { hi->global_mmr_base = uv_gp_table->mmr_base; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 35a5d5dca2fa..c36140d788fe 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -556,10 +556,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (check_tsc_unstable()) - clear_sched_clock_stable(); - } else { - clear_sched_clock_stable(); } /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 106bd3318121..44207b71fee1 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -105,8 +105,6 @@ static void early_init_centaur(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #endif - - clear_sched_clock_stable(); } static void init_centaur(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b11b38c3b0bd..58094a1f9e9d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -88,7 +88,6 @@ static void default_init(struct cpuinfo_x86 *c) strcpy(c->x86_model_id, "386"); } #endif - clear_sched_clock_stable(); } static const struct cpu_dev default_cpu = { @@ -1077,8 +1076,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) */ if (this_cpu->c_init) this_cpu->c_init(c); - else - clear_sched_clock_stable(); /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 0a3bc19de017..a70fd61095f8 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -185,7 +185,6 @@ static void early_init_cyrix(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); break; } - clear_sched_clock_stable(); } static void init_cyrix(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fe0a615a051b..dfa90a3a5145 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -90,16 +90,12 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) return; } - if (ring3mwait_disabled) { - msr_clear_bit(MSR_MISC_FEATURE_ENABLES, - MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT); + if (ring3mwait_disabled) return; - } - - msr_set_bit(MSR_MISC_FEATURE_ENABLES, - MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT); set_cpu_cap(c, X86_FEATURE_RING3MWAIT); + this_cpu_or(msr_misc_features_shadow, + 1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT); if (c == &boot_cpu_data) ELF_HWCAP2 |= HWCAP2_RING3MWAIT; @@ -162,10 +158,6 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (check_tsc_unstable()) - clear_sched_clock_stable(); - } else { - clear_sched_clock_stable(); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ @@ -492,6 +484,34 @@ static void intel_bsp_resume(struct cpuinfo_x86 *c) init_intel_energy_perf(c); } +static void init_cpuid_fault(struct cpuinfo_x86 *c) +{ + u64 msr; + + if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) { + if (msr & MSR_PLATFORM_INFO_CPUID_FAULT) + set_cpu_cap(c, X86_FEATURE_CPUID_FAULT); + } +} + +static void init_intel_misc_features(struct cpuinfo_x86 *c) +{ + u64 msr; + + if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr)) + return; + + /* Clear all MISC features */ + this_cpu_write(msr_misc_features_shadow, 0); + + /* Check features and update capabilities and shadow control bits */ + init_cpuid_fault(c); + probe_xeon_phi_r3mwait(c); + + msr = this_cpu_read(msr_misc_features_shadow); + wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); +} + static void init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; @@ -606,7 +626,7 @@ static void init_intel(struct cpuinfo_x86 *c) init_intel_energy_perf(c); - probe_xeon_phi_r3mwait(c); + init_intel_misc_features(c); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 0bbe0f3a039f..9ac2a5cdd9c2 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -28,7 +28,6 @@ #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/slab.h> -#include <linux/cpu.h> #include <linux/task_work.h> #include <uapi/linux/magic.h> @@ -728,7 +727,7 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { kernfs_unbreak_active_protection(kn); - kernfs_put(kn); + kernfs_put(rdtgrp->kn); kfree(rdtgrp); } else { kernfs_unbreak_active_protection(kn); diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c index f369cb8db0d5..badd2b31a560 100644 --- a/arch/x86/kernel/cpu/intel_rdt_schemata.c +++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c @@ -200,11 +200,11 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, } out: - rdtgroup_kn_unlock(of->kn); for_each_enabled_rdt_resource(r) { kfree(r->tmp_cbms); r->tmp_cbms = NULL; } + rdtgroup_kn_unlock(of->kn); return ret ?: nbytes; } diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index a3311c886194..43051f0777d4 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -9,3 +9,5 @@ obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o obj-$(CONFIG_ACPI_APEI) += mce-apei.o + +obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c new file mode 100644 index 000000000000..9c632cb88546 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c @@ -0,0 +1,397 @@ +/* + * /dev/mcelog driver + * + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/miscdevice.h> +#include <linux/slab.h> +#include <linux/kmod.h> +#include <linux/poll.h> + +#include "mce-internal.h" + +static DEFINE_MUTEX(mce_chrdev_read_mutex); + +static char mce_helper[128]; +static char *mce_helper_argv[2] = { mce_helper, NULL }; + +#define mce_log_get_idx_check(p) \ +({ \ + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&mce_chrdev_read_mutex), \ + "suspicious mce_log_get_idx_check() usage"); \ + smp_load_acquire(&(p)); \ +}) + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +static struct mce_log_buffer mcelog = { + .signature = MCE_LOG_SIGNATURE, + .len = MCE_LOG_LEN, + .recordlen = sizeof(struct mce), +}; + +static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); + +/* User mode helper program triggered by machine check event */ +extern char mce_helper[128]; + +static int dev_mce_log(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *mce = (struct mce *)data; + unsigned int next, entry; + + wmb(); + for (;;) { + entry = mce_log_get_idx_check(mcelog.next); + for (;;) { + + /* + * When the buffer fills up discard new entries. + * Assume that the earlier errors are the more + * interesting ones: + */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, + (unsigned long *)&mcelog.flags); + return NOTIFY_OK; + } + /* Old left over entry. Skip: */ + if (mcelog.entry[entry].finished) { + entry++; + continue; + } + break; + } + smp_rmb(); + next = entry + 1; + if (cmpxchg(&mcelog.next, entry, next) == entry) + break; + } + memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); + wmb(); + mcelog.entry[entry].finished = 1; + wmb(); + + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&mce_chrdev_wait); + + return NOTIFY_OK; +} + +static struct notifier_block dev_mcelog_nb = { + .notifier_call = dev_mce_log, + .priority = MCE_PRIO_MCELOG, +}; + +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); +} + +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + + +void mce_work_trigger(void) +{ + if (mce_helper[0]) + schedule_work(&mce_trigger_work); +} + +static ssize_t +show_trigger(struct device *s, struct device_attribute *attr, char *buf) +{ + strcpy(buf, mce_helper); + strcat(buf, "\n"); + return strlen(mce_helper) + 1; +} + +static ssize_t set_trigger(struct device *s, struct device_attribute *attr, + const char *buf, size_t siz) +{ + char *p; + + strncpy(mce_helper, buf, sizeof(mce_helper)); + mce_helper[sizeof(mce_helper)-1] = 0; + p = strchr(mce_helper, '\n'); + + if (p) + *p = 0; + + return strlen(mce_helper) + !!p; +} + +DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); + +/* + * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. + */ + +static DEFINE_SPINLOCK(mce_chrdev_state_lock); +static int mce_chrdev_open_count; /* #times opened */ +static int mce_chrdev_open_exclu; /* already open exclusive? */ + +static int mce_chrdev_open(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + if (mce_chrdev_open_exclu || + (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_chrdev_state_lock); + + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + mce_chrdev_open_exclu = 1; + mce_chrdev_open_count++; + + spin_unlock(&mce_chrdev_state_lock); + + return nonseekable_open(inode, file); +} + +static int mce_chrdev_release(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + mce_chrdev_open_count--; + mce_chrdev_open_exclu = 0; + + spin_unlock(&mce_chrdev_state_lock); + + return 0; +} + +static void collect_tscs(void *data) +{ + unsigned long *cpu_tsc = (unsigned long *)data; + + cpu_tsc[smp_processor_id()] = rdtsc(); +} + +static int mce_apei_read_done; + +/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ +static int __mce_read_apei(char __user **ubuf, size_t usize) +{ + int rc; + u64 record_id; + struct mce m; + + if (usize < sizeof(struct mce)) + return -EINVAL; + + rc = apei_read_mce(&m, &record_id); + /* Error or no more MCE record */ + if (rc <= 0) { + mce_apei_read_done = 1; + /* + * When ERST is disabled, mce_chrdev_read() should return + * "no record" instead of "no device." + */ + if (rc == -ENODEV) + return 0; + return rc; + } + rc = -EFAULT; + if (copy_to_user(*ubuf, &m, sizeof(struct mce))) + return rc; + /* + * In fact, we should have cleared the record after that has + * been flushed to the disk or sent to network in + * /sbin/mcelog, but we have no interface to support that now, + * so just clear it to avoid duplication. + */ + rc = apei_clear_mce(record_id); + if (rc) { + mce_apei_read_done = 1; + return rc; + } + *ubuf += sizeof(struct mce); + + return 0; +} + +static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, + size_t usize, loff_t *off) +{ + char __user *buf = ubuf; + unsigned long *cpu_tsc; + unsigned prev, next; + int i, err; + + cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); + if (!cpu_tsc) + return -ENOMEM; + + mutex_lock(&mce_chrdev_read_mutex); + + if (!mce_apei_read_done) { + err = __mce_read_apei(&buf, usize); + if (err || buf != ubuf) + goto out; + } + + next = mce_log_get_idx_check(mcelog.next); + + /* Only supports full reads right now */ + err = -EINVAL; + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) + goto out; + + err = 0; + prev = 0; + do { + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + struct mce *m = &mcelog.entry[i]; + + while (!m->finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(m, 0, sizeof(*m)); + goto timeout; + } + cpu_relax(); + } + smp_rmb(); + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); +timeout: + ; + } + + memset(mcelog.entry + prev, 0, + (next - prev) * sizeof(struct mce)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); + + synchronize_sched(); + + /* + * Collect entries that were still getting written before the + * synchronize. + */ + on_each_cpu(collect_tscs, cpu_tsc, 1); + + for (i = next; i < MCE_LOG_LEN; i++) { + struct mce *m = &mcelog.entry[i]; + + if (m->finished && m->tsc < cpu_tsc[m->cpu]) { + err |= copy_to_user(buf, m, sizeof(*m)); + smp_rmb(); + buf += sizeof(*m); + memset(m, 0, sizeof(*m)); + } + } + + if (err) + err = -EFAULT; + +out: + mutex_unlock(&mce_chrdev_read_mutex); + kfree(cpu_tsc); + + return err ? err : buf - ubuf; +} + +static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &mce_chrdev_wait, wait); + if (READ_ONCE(mcelog.next)) + return POLLIN | POLLRDNORM; + if (!mce_apei_read_done && apei_check_mce()) + return POLLIN | POLLRDNORM; + return 0; +} + +static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, + unsigned long arg) +{ + int __user *p = (int __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct mce), p); + case MCE_GET_LOG_LEN: + return put_user(MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + + do { + flags = mcelog.flags; + } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off); + +void register_mce_write_callback(ssize_t (*fn)(struct file *filp, + const char __user *ubuf, + size_t usize, loff_t *off)) +{ + mce_write = fn; +} +EXPORT_SYMBOL_GPL(register_mce_write_callback); + +static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + if (mce_write) + return mce_write(filp, ubuf, usize, off); + else + return -EINVAL; +} + +static const struct file_operations mce_chrdev_ops = { + .open = mce_chrdev_open, + .release = mce_chrdev_release, + .read = mce_chrdev_read, + .write = mce_chrdev_write, + .poll = mce_chrdev_poll, + .unlocked_ioctl = mce_chrdev_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice mce_chrdev_device = { + MISC_MCELOG_MINOR, + "mcelog", + &mce_chrdev_ops, +}; + +static __init int dev_mcelog_init_device(void) +{ + int err; + + /* register character device /dev/mcelog */ + err = misc_register(&mce_chrdev_device); + if (err) { + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + return err; + } + mce_register_decode_chain(&dev_mcelog_nb); + return 0; +} +device_initcall_sync(dev_mcelog_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c index 1e5a50c11d3c..217cd4449bc9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -85,7 +85,7 @@ void mce_gen_pool_process(struct work_struct *__unused) head = llist_reverse_order(head); llist_for_each_entry_safe(node, tmp, head, llnode) { mce = &node->mce; - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); } } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 903043e6a62b..654ad0668d72 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -13,7 +13,7 @@ enum severity_level { MCE_PANIC_SEVERITY, }; -extern struct atomic_notifier_head x86_mce_decoder_chain; +extern struct blocking_notifier_head x86_mce_decoder_chain; #define ATTR_LEN 16 #define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ @@ -96,3 +96,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2) m1->addr != m2->addr || m1->misc != m2->misc; } + +extern struct device_attribute dev_attr_trigger; + +#ifdef CONFIG_X86_MCELOG_LEGACY +extern void mce_work_trigger(void); +#else +static inline void mce_work_trigger(void) { } +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 8e9725c607ea..5abd4bf73d6e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -35,6 +35,7 @@ #include <linux/poll.h> #include <linux/nmi.h> #include <linux/cpu.h> +#include <linux/ras.h> #include <linux/smp.h> #include <linux/fs.h> #include <linux/mm.h> @@ -49,18 +50,11 @@ #include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> +#include <asm/reboot.h> #include "mce-internal.h" -static DEFINE_MUTEX(mce_chrdev_read_mutex); - -#define mce_log_get_idx_check(p) \ -({ \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ - !lockdep_is_held(&mce_chrdev_read_mutex), \ - "suspicious mce_log_get_idx_check() usage"); \ - smp_load_acquire(&(p)); \ -}) +static DEFINE_MUTEX(mce_log_mutex); #define CREATE_TRACE_POINTS #include <trace/events/mce.h> @@ -85,15 +79,9 @@ struct mca_config mca_cfg __read_mostly = { .monarch_timeout = -1 }; -/* User mode helper program triggered by machine check event */ -static unsigned long mce_need_notify; -static char mce_helper[128]; -static char *mce_helper_argv[2] = { mce_helper, NULL }; - -static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); - static DEFINE_PER_CPU(struct mce, mces_seen); -static int cpu_missing; +static unsigned long mce_need_notify; +static int cpu_missing; /* * MCA banks polled by the period polling timer for corrected events. @@ -121,7 +109,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. */ -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) @@ -143,82 +131,38 @@ void mce_setup(struct mce *m) DEFINE_PER_CPU(struct mce, injectm); EXPORT_PER_CPU_SYMBOL_GPL(injectm); -/* - * Lockless MCE logging infrastructure. - * This avoids deadlocks on printk locks without having to break locks. Also - * separate MCEs from kernel messages to avoid bogus bug reports. - */ - -static struct mce_log mcelog = { - .signature = MCE_LOG_SIGNATURE, - .len = MCE_LOG_LEN, - .recordlen = sizeof(struct mce), -}; - -void mce_log(struct mce *mce) +void mce_log(struct mce *m) { - unsigned next, entry; - - /* Emit the trace record: */ - trace_mce_record(mce); - - if (!mce_gen_pool_add(mce)) + if (!mce_gen_pool_add(m)) irq_work_queue(&mce_irq_work); - - wmb(); - for (;;) { - entry = mce_log_get_idx_check(mcelog.next); - for (;;) { - - /* - * When the buffer fills up discard new entries. - * Assume that the earlier errors are the more - * interesting ones: - */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, - (unsigned long *)&mcelog.flags); - return; - } - /* Old left over entry. Skip: */ - if (mcelog.entry[entry].finished) { - entry++; - continue; - } - break; - } - smp_rmb(); - next = entry + 1; - if (cmpxchg(&mcelog.next, entry, next) == entry) - break; - } - memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - wmb(); - mcelog.entry[entry].finished = 1; - wmb(); - - set_bit(0, &mce_need_notify); } void mce_inject_log(struct mce *m) { - mutex_lock(&mce_chrdev_read_mutex); + mutex_lock(&mce_log_mutex); mce_log(m); - mutex_unlock(&mce_chrdev_read_mutex); + mutex_unlock(&mce_log_mutex); } EXPORT_SYMBOL_GPL(mce_inject_log); static struct notifier_block mce_srao_nb; +/* + * We run the default notifier if we have only the SRAO, the first and the + * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS + * notifiers registered on the chain. + */ +#define NUM_DEFAULT_NOTIFIERS 3 static atomic_t num_notifiers; void mce_register_decode_chain(struct notifier_block *nb) { - atomic_inc(&num_notifiers); + if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC)) + return; - WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC); + atomic_inc(&num_notifiers); - atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); + blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_register_decode_chain); @@ -226,7 +170,7 @@ void mce_unregister_decode_chain(struct notifier_block *nb) { atomic_dec(&num_notifiers); - atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); + blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); @@ -319,18 +263,7 @@ static void __print_mce(struct mce *m) static void print_mce(struct mce *m) { - int ret = 0; - __print_mce(m); - - /* - * Print out human-readable details about the MCE error, - * (if the CPU has an implementation for that) - */ - ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); - if (ret == NOTIFY_STOP) - return; - pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); } @@ -519,7 +452,6 @@ static void mce_schedule_work(void) static void mce_irq_work_cb(struct irq_work *entry) { - mce_notify_irq(); mce_schedule_work(); } @@ -548,20 +480,97 @@ static void mce_report_event(struct pt_regs *regs) */ static int mce_usable_address(struct mce *m) { - if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) + if (!(m->status & MCI_STATUS_ADDRV)) return 0; /* Checks after this one are Intel-specific: */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 1; + if (!(m->status & MCI_STATUS_MISCV)) + return 0; + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) return 0; + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) return 0; + return 1; } +static bool memory_error(struct mce *m) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86_vendor == X86_VENDOR_AMD) { + /* ErrCodeExt[20:16] */ + u8 xec = (m->status >> 16) & 0x1f; + + return (xec == 0x0 || xec == 0x8); + } else if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * Intel SDM Volume 3B - 15.9.2 Compound Error Codes + * + * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for + * indicating a memory error. Bit 8 is used for indicating a + * cache hierarchy error. The combination of bit 2 and bit 3 + * is used for indicating a `generic' cache hierarchy error + * But we can't just blindly check the above bits, because if + * bit 11 is set, then it is a bus/interconnect error - and + * either way the above bits just gives more detail on what + * bus/interconnect error happened. Note that bit 12 can be + * ignored, as it's the "filter" bit. + */ + return (m->status & 0xef80) == BIT(7) || + (m->status & 0xef00) == BIT(8) || + (m->status & 0xeffc) == 0xc; + } + + return false; +} + +static bool cec_add_mce(struct mce *m) +{ + if (!m) + return false; + + /* We eat only correctable DRAM errors with usable addresses. */ + if (memory_error(m) && + !(m->status & MCI_STATUS_UC) && + mce_usable_address(m)) + if (!cec_add_elem(m->addr >> PAGE_SHIFT)) + return true; + + return false; +} + +static int mce_first_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + if (cec_add_mce(m)) + return NOTIFY_STOP; + + /* Emit the trace record: */ + trace_mce_record(m); + + set_bit(0, &mce_need_notify); + + mce_notify_irq(); + + return NOTIFY_DONE; +} + +static struct notifier_block first_nb = { + .notifier_call = mce_first_notifier, + .priority = MCE_PRIO_FIRST, +}; + static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -591,11 +600,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val, if (!m) return NOTIFY_DONE; - /* - * Run the default notifier if we have only the SRAO - * notifier and us registered. - */ - if (atomic_read(&num_notifiers) > 2) + if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS) return NOTIFY_DONE; __print_mce(m); @@ -648,37 +653,6 @@ static void mce_read_aux(struct mce *m, int i) } } -static bool memory_error(struct mce *m) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - if (c->x86_vendor == X86_VENDOR_AMD) { - /* ErrCodeExt[20:16] */ - u8 xec = (m->status >> 16) & 0x1f; - - return (xec == 0x0 || xec == 0x8); - } else if (c->x86_vendor == X86_VENDOR_INTEL) { - /* - * Intel SDM Volume 3B - 15.9.2 Compound Error Codes - * - * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for - * indicating a memory error. Bit 8 is used for indicating a - * cache hierarchy error. The combination of bit 2 and bit 3 - * is used for indicating a `generic' cache hierarchy error - * But we can't just blindly check the above bits, because if - * bit 11 is set, then it is a bus/interconnect error - and - * either way the above bits just gives more detail on what - * bus/interconnect error happened. Note that bit 12 can be - * ignored, as it's the "filter" bit. - */ - return (m->status & 0xef80) == BIT(7) || - (m->status & 0xef00) == BIT(8) || - (m->status & 0xeffc) == 0xc; - } - - return false; -} - DEFINE_PER_CPU(unsigned, mce_poll_count); /* @@ -1127,9 +1101,22 @@ void do_machine_check(struct pt_regs *regs, long error_code) * on Intel. */ int lmce = 1; + int cpu = smp_processor_id(); - /* If this CPU is offline, just bail out. */ - if (cpu_is_offline(smp_processor_id())) { + /* + * Cases where we avoid rendezvous handler timeout: + * 1) If this CPU is offline. + * + * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to + * skip those CPUs which remain looping in the 1st kernel - see + * crash_nmi_callback(). + * + * Note: there still is a small window between kexec-ing and the new, + * kdump kernel establishing a new #MC handler where a broadcasted MCE + * might not get handled properly. + */ + if (cpu_is_offline(cpu) || + (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); @@ -1399,13 +1386,6 @@ static void mce_timer_delete_all(void) del_timer_sync(&per_cpu(mce_timer, cpu)); } -static void mce_do_trigger(struct work_struct *work) -{ - call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); -} - -static DECLARE_WORK(mce_trigger_work, mce_do_trigger); - /* * Notify the user(s) about new machine check events. * Can be called from interrupt context, but not from machine check/NMI @@ -1417,11 +1397,7 @@ int mce_notify_irq(void) static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); if (test_and_clear_bit(0, &mce_need_notify)) { - /* wake processes polling /dev/mcelog */ - wake_up_interruptible(&mce_chrdev_wait); - - if (mce_helper[0]) - schedule_work(&mce_trigger_work); + mce_work_trigger(); if (__ratelimit(&ratelimit)) pr_info(HW_ERR "Machine check events logged\n"); @@ -1688,30 +1664,35 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) return 0; } -static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) +/* + * Init basic CPU features needed for early decoding of MCEs. + */ +static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) { - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - mce_intel_feature_init(c); - mce_adjust_timer = cmci_intel_adjust_timer; - break; - - case X86_VENDOR_AMD: { + if (c->x86_vendor == X86_VENDOR_AMD) { mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - /* - * Install proper ops for Scalable MCA enabled processors - */ if (mce_flags.smca) { msr_ops.ctl = smca_ctl_reg; msr_ops.status = smca_status_reg; msr_ops.addr = smca_addr_reg; msr_ops.misc = smca_misc_reg; } - mce_amd_feature_init(c); + } +} + +static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_init(c); + mce_adjust_timer = cmci_intel_adjust_timer; + break; + case X86_VENDOR_AMD: { + mce_amd_feature_init(c); break; } @@ -1798,6 +1779,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) machine_check_vector = do_machine_check; + __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_clear_banks(); @@ -1823,252 +1805,6 @@ void mcheck_cpu_clear(struct cpuinfo_x86 *c) } -/* - * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. - */ - -static DEFINE_SPINLOCK(mce_chrdev_state_lock); -static int mce_chrdev_open_count; /* #times opened */ -static int mce_chrdev_open_exclu; /* already open exclusive? */ - -static int mce_chrdev_open(struct inode *inode, struct file *file) -{ - spin_lock(&mce_chrdev_state_lock); - - if (mce_chrdev_open_exclu || - (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { - spin_unlock(&mce_chrdev_state_lock); - - return -EBUSY; - } - - if (file->f_flags & O_EXCL) - mce_chrdev_open_exclu = 1; - mce_chrdev_open_count++; - - spin_unlock(&mce_chrdev_state_lock); - - return nonseekable_open(inode, file); -} - -static int mce_chrdev_release(struct inode *inode, struct file *file) -{ - spin_lock(&mce_chrdev_state_lock); - - mce_chrdev_open_count--; - mce_chrdev_open_exclu = 0; - - spin_unlock(&mce_chrdev_state_lock); - - return 0; -} - -static void collect_tscs(void *data) -{ - unsigned long *cpu_tsc = (unsigned long *)data; - - cpu_tsc[smp_processor_id()] = rdtsc(); -} - -static int mce_apei_read_done; - -/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ -static int __mce_read_apei(char __user **ubuf, size_t usize) -{ - int rc; - u64 record_id; - struct mce m; - - if (usize < sizeof(struct mce)) - return -EINVAL; - - rc = apei_read_mce(&m, &record_id); - /* Error or no more MCE record */ - if (rc <= 0) { - mce_apei_read_done = 1; - /* - * When ERST is disabled, mce_chrdev_read() should return - * "no record" instead of "no device." - */ - if (rc == -ENODEV) - return 0; - return rc; - } - rc = -EFAULT; - if (copy_to_user(*ubuf, &m, sizeof(struct mce))) - return rc; - /* - * In fact, we should have cleared the record after that has - * been flushed to the disk or sent to network in - * /sbin/mcelog, but we have no interface to support that now, - * so just clear it to avoid duplication. - */ - rc = apei_clear_mce(record_id); - if (rc) { - mce_apei_read_done = 1; - return rc; - } - *ubuf += sizeof(struct mce); - - return 0; -} - -static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, - size_t usize, loff_t *off) -{ - char __user *buf = ubuf; - unsigned long *cpu_tsc; - unsigned prev, next; - int i, err; - - cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); - if (!cpu_tsc) - return -ENOMEM; - - mutex_lock(&mce_chrdev_read_mutex); - - if (!mce_apei_read_done) { - err = __mce_read_apei(&buf, usize); - if (err || buf != ubuf) - goto out; - } - - next = mce_log_get_idx_check(mcelog.next); - - /* Only supports full reads right now */ - err = -EINVAL; - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) - goto out; - - err = 0; - prev = 0; - do { - for (i = prev; i < next; i++) { - unsigned long start = jiffies; - struct mce *m = &mcelog.entry[i]; - - while (!m->finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(m, 0, sizeof(*m)); - goto timeout; - } - cpu_relax(); - } - smp_rmb(); - err |= copy_to_user(buf, m, sizeof(*m)); - buf += sizeof(*m); -timeout: - ; - } - - memset(mcelog.entry + prev, 0, - (next - prev) * sizeof(struct mce)); - prev = next; - next = cmpxchg(&mcelog.next, prev, 0); - } while (next != prev); - - synchronize_sched(); - - /* - * Collect entries that were still getting written before the - * synchronize. - */ - on_each_cpu(collect_tscs, cpu_tsc, 1); - - for (i = next; i < MCE_LOG_LEN; i++) { - struct mce *m = &mcelog.entry[i]; - - if (m->finished && m->tsc < cpu_tsc[m->cpu]) { - err |= copy_to_user(buf, m, sizeof(*m)); - smp_rmb(); - buf += sizeof(*m); - memset(m, 0, sizeof(*m)); - } - } - - if (err) - err = -EFAULT; - -out: - mutex_unlock(&mce_chrdev_read_mutex); - kfree(cpu_tsc); - - return err ? err : buf - ubuf; -} - -static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) -{ - poll_wait(file, &mce_chrdev_wait, wait); - if (READ_ONCE(mcelog.next)) - return POLLIN | POLLRDNORM; - if (!mce_apei_read_done && apei_check_mce()) - return POLLIN | POLLRDNORM; - return 0; -} - -static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, - unsigned long arg) -{ - int __user *p = (int __user *)arg; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { - case MCE_GET_RECORD_LEN: - return put_user(sizeof(struct mce), p); - case MCE_GET_LOG_LEN: - return put_user(MCE_LOG_LEN, p); - case MCE_GETCLEAR_FLAGS: { - unsigned flags; - - do { - flags = mcelog.flags; - } while (cmpxchg(&mcelog.flags, flags, 0) != flags); - - return put_user(flags, p); - } - default: - return -ENOTTY; - } -} - -static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off); - -void register_mce_write_callback(ssize_t (*fn)(struct file *filp, - const char __user *ubuf, - size_t usize, loff_t *off)) -{ - mce_write = fn; -} -EXPORT_SYMBOL_GPL(register_mce_write_callback); - -static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off) -{ - if (mce_write) - return mce_write(filp, ubuf, usize, off); - else - return -EINVAL; -} - -static const struct file_operations mce_chrdev_ops = { - .open = mce_chrdev_open, - .release = mce_chrdev_release, - .read = mce_chrdev_read, - .write = mce_chrdev_write, - .poll = mce_chrdev_poll, - .unlocked_ioctl = mce_chrdev_ioctl, - .llseek = no_llseek, -}; - -static struct miscdevice mce_chrdev_device = { - MISC_MCELOG_MINOR, - "mcelog", - &mce_chrdev_ops, -}; - static void __mce_disable_bank(void *arg) { int bank = *((int *)arg); @@ -2142,6 +1878,7 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { mcheck_intel_therm_init(); + mce_register_decode_chain(&first_nb); mce_register_decode_chain(&mce_srao_nb); mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); @@ -2286,29 +2023,6 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr, return size; } -static ssize_t -show_trigger(struct device *s, struct device_attribute *attr, char *buf) -{ - strcpy(buf, mce_helper); - strcat(buf, "\n"); - return strlen(mce_helper) + 1; -} - -static ssize_t set_trigger(struct device *s, struct device_attribute *attr, - const char *buf, size_t siz) -{ - char *p; - - strncpy(mce_helper, buf, sizeof(mce_helper)); - mce_helper[sizeof(mce_helper)-1] = 0; - p = strchr(mce_helper, '\n'); - - if (p) - *p = 0; - - return strlen(mce_helper) + !!p; -} - static ssize_t set_ignore_ce(struct device *s, struct device_attribute *attr, const char *buf, size_t size) @@ -2365,7 +2079,6 @@ static ssize_t store_int_with_restart(struct device *s, return ret; } -static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); @@ -2388,7 +2101,9 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = { static struct device_attribute *mce_device_attrs[] = { &dev_attr_tolerant.attr, &dev_attr_check_interval.attr, +#ifdef CONFIG_X86_MCELOG_LEGACY &dev_attr_trigger, +#endif &dev_attr_monarch_timeout.attr, &dev_attr_dont_log_ce.attr, &dev_attr_ignore_ce.attr, @@ -2562,7 +2277,6 @@ static __init void mce_init_banks(void) static __init int mcheck_init_device(void) { - enum cpuhp_state hp_online; int err; if (!mce_available(&boot_cpu_data)) { @@ -2590,21 +2304,11 @@ static __init int mcheck_init_device(void) mce_cpu_online, mce_cpu_pre_down); if (err < 0) goto err_out_online; - hp_online = err; register_syscore_ops(&mce_syscore_ops); - /* register character device /dev/mcelog */ - err = misc_register(&mce_chrdev_device); - if (err) - goto err_register; - return 0; -err_register: - unregister_syscore_ops(&mce_syscore_ops); - cpuhp_remove_state(hp_online); - err_out_online: cpuhp_remove_state(CPUHP_X86_MCE_DEAD); @@ -2612,7 +2316,7 @@ err_out_mem: free_cpumask_var(mce_device_initialized); err_out: - pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + pr_err("Unable to init MCE device (rc: %d)\n", err); return err; } @@ -2691,6 +2395,7 @@ static int __init mcheck_late_init(void) static_branch_inc(&mcsafe_key); mcheck_debugfs_init(); + cec_init(); /* * Flush out everything that has been logged during early boot, now that diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 524cc5780a77..6e4a047e4b68 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -60,7 +60,7 @@ static const char * const th_names[] = { "load_store", "insn_fetch", "combined_unit", - "", + "decode_unit", "northbridge", "execution_unit", }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 190b3e6cef4d..e84db79ef272 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -481,6 +481,9 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) case INTEL_FAM6_BROADWELL_XEON_D: case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_XEON_PHI_KNL: + case INTEL_FAM6_XEON_PHI_KNM: + if (rdmsrl_safe(MSR_PPIN_CTL, &val)) return; diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 8457b4978668..d77d07ab310b 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -16,8 +16,6 @@ static void early_init_transmeta(struct cpuinfo_x86 *c) if (xlvl >= 0x80860001) c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001); } - - clear_sched_clock_stable(); } static void init_transmeta(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 891f4dad7b2c..22403a28caf5 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -30,7 +30,6 @@ #include <asm/hypervisor.h> #include <asm/timer.h> #include <asm/apic.h> -#include <asm/timer.h> #undef pr_fmt #define pr_fmt(fmt) "vmware: " fmt diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8639bb2ae058..5b7153540727 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -535,7 +535,7 @@ static void run_sync(void) { int enable_irqs = irqs_disabled(); - /* We may be called with interrupts disbled (on bootup). */ + /* We may be called with interrupts disabled (on bootup). */ if (enable_irqs) local_irq_enable(); on_each_cpu(do_sync_core, NULL, 1); @@ -983,6 +983,18 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, unsigned long return_hooker = (unsigned long) &return_to_handler; + /* + * When resuming from suspend-to-ram, this function can be indirectly + * called from early CPU startup code while the CPU is in real mode, + * which would fail miserably. Make sure the stack pointer is a + * virtual address. + * + * This check isn't as accurate as virt_addr_valid(), but it should be + * good enough for this purpose, and it's fast. + */ + if (unlikely((long)__builtin_frame_address(0) >= 0)) + return; + if (unlikely(ftrace_graph_is_dead())) return; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index baa0e7b78d80..43b7002f44fb 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -4,6 +4,7 @@ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE */ +#define DISABLE_BRANCH_PROFILING #include <linux/init.h> #include <linux/linkage.h> #include <linux/types.h> diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index dc6ba5bda9fc..89ff7af2de50 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -354,7 +354,7 @@ static int hpet_resume(struct clock_event_device *evt, int timer) irq_domain_deactivate_irq(irq_get_irq_data(hdev->irq)); irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); - disable_irq(hdev->irq); + disable_hardirq(hdev->irq); irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); enable_irq(hdev->irq); } diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index bdb83e431d89..38b64587b31b 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -167,7 +167,7 @@ static int __init boot_params_kdebugfs_init(void) struct dentry *dbp, *version, *data; int error = -ENOMEM; - dbp = debugfs_create_dir("boot_params", NULL); + dbp = debugfs_create_dir("boot_params", arch_debugfs_dir); if (!dbp) return -ENOMEM; diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index c6ee63f927ab..db2182d63ed0 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -67,7 +67,7 @@ #endif /* Ensure if the instruction can be boostable */ -extern int can_boost(kprobe_opcode_t *instruction); +extern int can_boost(struct insn *insn, void *orig_addr); /* Recover instruction if given address is probed */ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr); @@ -75,7 +75,7 @@ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, * Copy an instruction and adjust the displacement if the instruction * uses the %rip-relative addressing mode. */ -extern int __copy_instruction(u8 *dest, u8 *src); +extern int __copy_instruction(u8 *dest, u8 *src, struct insn *insn); /* Generate a relative-jump/call instruction */ extern void synthesize_reljump(void *from, void *to); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6384eb754a58..19e1f2a6d7b0 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -164,42 +164,38 @@ static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn) NOKPROBE_SYMBOL(skip_prefixes); /* - * Returns non-zero if opcode is boostable. + * Returns non-zero if INSN is boostable. * RIP relative instructions are adjusted at copying time in 64 bits mode */ -int can_boost(kprobe_opcode_t *opcodes) +int can_boost(struct insn *insn, void *addr) { kprobe_opcode_t opcode; - kprobe_opcode_t *orig_opcodes = opcodes; - if (search_exception_tables((unsigned long)opcodes)) + if (search_exception_tables((unsigned long)addr)) return 0; /* Page fault may occur on this address. */ -retry: - if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) - return 0; - opcode = *(opcodes++); - /* 2nd-byte opcode */ - if (opcode == 0x0f) { - if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) - return 0; - return test_bit(*opcodes, + if (insn->opcode.nbytes == 2) + return test_bit(insn->opcode.bytes[1], (unsigned long *)twobyte_is_boostable); - } + + if (insn->opcode.nbytes != 1) + return 0; + + /* Can't boost Address-size override prefix */ + if (unlikely(inat_is_address_size_prefix(insn->attr))) + return 0; + + opcode = insn->opcode.bytes[0]; switch (opcode & 0xf0) { -#ifdef CONFIG_X86_64 - case 0x40: - goto retry; /* REX prefix is boostable */ -#endif case 0x60: - if (0x63 < opcode && opcode < 0x67) - goto retry; /* prefixes */ - /* can't boost Address-size override and bound */ - return (opcode != 0x62 && opcode != 0x67); + /* can't boost "bound" */ + return (opcode != 0x62); case 0x70: return 0; /* can't boost conditional jump */ + case 0x90: + return opcode != 0x9a; /* can't boost call far */ case 0xc0: /* can't boost software-interruptions */ return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; @@ -210,14 +206,9 @@ retry: /* can boost in/out and absolute jmps */ return ((opcode & 0x04) || opcode == 0xea); case 0xf0: - if ((opcode & 0x0c) == 0 && opcode != 0xf1) - goto retry; /* lock/rep(ne) prefix */ /* clear and set flags are boostable */ return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); default: - /* segment override prefixes are boostable */ - if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e) - goto retry; /* prefixes */ /* CS override prefix and call are not boostable */ return (opcode != 0x2e && opcode != 0x9a); } @@ -264,7 +255,10 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) * Fortunately, we know that the original code is the ideal 5-byte * long NOP. */ - memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (probe_kernel_read(buf, (void *)addr, + MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) + return 0UL; + if (faddr) memcpy(buf, ideal_nops[NOP_ATOMIC5], 5); else @@ -276,7 +270,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) * Recover the probed instruction at addr for further analysis. * Caller must lock kprobes by kprobe_mutex, or disable preemption * for preventing to release referencing kprobes. - * Returns zero if the instruction can not get recovered. + * Returns zero if the instruction can not get recovered (or access failed). */ unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) { @@ -348,37 +342,36 @@ static int is_IF_modifier(kprobe_opcode_t *insn) } /* - * Copy an instruction and adjust the displacement if the instruction - * uses the %rip-relative addressing mode. - * If it does, Return the address of the 32-bit displacement word. - * If not, return null. - * Only applicable to 64-bit x86. + * Copy an instruction with recovering modified instruction by kprobes + * and adjust the displacement if the instruction uses the %rip-relative + * addressing mode. + * This returns the length of copied instruction, or 0 if it has an error. */ -int __copy_instruction(u8 *dest, u8 *src) +int __copy_instruction(u8 *dest, u8 *src, struct insn *insn) { - struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - int length; unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src); - if (!recovered_insn) + if (!recovered_insn || !insn) + return 0; + + /* This can access kernel text if given address is not recovered */ + if (probe_kernel_read(dest, (void *)recovered_insn, MAX_INSN_SIZE)) return 0; - kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); - insn_get_length(&insn); - length = insn.length; + + kernel_insn_init(insn, dest, MAX_INSN_SIZE); + insn_get_length(insn); /* Another subsystem puts a breakpoint, failed to recover */ - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION) return 0; - memcpy(dest, insn.kaddr, length); #ifdef CONFIG_X86_64 - if (insn_rip_relative(&insn)) { + /* Only x86_64 has RIP relative instructions */ + if (insn_rip_relative(insn)) { s64 newdisp; u8 *disp; - kernel_insn_init(&insn, dest, length); - insn_get_displacement(&insn); /* * The copied instruction uses the %rip-relative addressing * mode. Adjust the displacement for the difference between @@ -391,36 +384,57 @@ int __copy_instruction(u8 *dest, u8 *src) * extension of the original signed 32-bit displacement would * have given. */ - newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest; + newdisp = (u8 *) src + (s64) insn->displacement.value + - (u8 *) dest; if ((s64) (s32) newdisp != newdisp) { pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp); - pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", src, dest, insn.displacement.value); + pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", + src, dest, insn->displacement.value); return 0; } - disp = (u8 *) dest + insn_offset_displacement(&insn); + disp = (u8 *) dest + insn_offset_displacement(insn); *(s32 *) disp = (s32) newdisp; } #endif - return length; + return insn->length; +} + +/* Prepare reljump right after instruction to boost */ +static void prepare_boost(struct kprobe *p, struct insn *insn) +{ + if (can_boost(insn, p->addr) && + MAX_INSN_SIZE - insn->length >= RELATIVEJUMP_SIZE) { + /* + * These instructions can be executed directly if it + * jumps back to correct address. + */ + synthesize_reljump(p->ainsn.insn + insn->length, + p->addr + insn->length); + p->ainsn.boostable = true; + } else { + p->ainsn.boostable = false; + } } static int arch_copy_kprobe(struct kprobe *p) { - int ret; + struct insn insn; + int len; + + set_memory_rw((unsigned long)p->ainsn.insn & PAGE_MASK, 1); /* Copy an instruction with recovering if other optprobe modifies it.*/ - ret = __copy_instruction(p->ainsn.insn, p->addr); - if (!ret) + len = __copy_instruction(p->ainsn.insn, p->addr, &insn); + if (!len) return -EINVAL; /* * __copy_instruction can modify the displacement of the instruction, * but it doesn't affect boostable check. */ - if (can_boost(p->ainsn.insn)) - p->ainsn.boostable = 0; - else - p->ainsn.boostable = -1; + prepare_boost(p, &insn); + + set_memory_ro((unsigned long)p->ainsn.insn & PAGE_MASK, 1); /* Check whether the instruction modifies Interrupt Flag or not */ p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn); @@ -459,7 +473,7 @@ void arch_disarm_kprobe(struct kprobe *p) void arch_remove_kprobe(struct kprobe *p) { if (p->ainsn.insn) { - free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); + free_insn_slot(p->ainsn.insn, p->ainsn.boostable); p->ainsn.insn = NULL; } } @@ -531,7 +545,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, return; #if !defined(CONFIG_PREEMPT) - if (p->ainsn.boostable == 1 && !p->post_handler) { + if (p->ainsn.boostable && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ if (!reenter) reset_current_kprobe(); @@ -851,7 +865,7 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs, case 0xcf: case 0xea: /* jmp absolute -- ip is correct */ /* ip is already adjusted, no more changes required */ - p->ainsn.boostable = 1; + p->ainsn.boostable = true; goto no_change; case 0xe8: /* call relative - Fix return addr */ *tos = orig_ip + (*tos - copy_ip); @@ -876,28 +890,13 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs, * jmp near and far, absolute indirect * ip is correct. And this is boostable */ - p->ainsn.boostable = 1; + p->ainsn.boostable = true; goto no_change; } default: break; } - if (p->ainsn.boostable == 0) { - if ((regs->ip > copy_ip) && - (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) { - /* - * These instructions can be executed directly if it - * jumps back to correct address. - */ - synthesize_reljump((void *)regs->ip, - (void *)orig_ip + (regs->ip - copy_ip)); - p->ainsn.boostable = 1; - } else { - p->ainsn.boostable = -1; - } - } - regs->ip += orig_ip - copy_ip; no_change: diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 5f8f0b3cc674..041f7b6dfa0f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -94,6 +94,6 @@ NOKPROBE_SYMBOL(kprobe_ftrace_handler); int arch_prepare_kprobe_ftrace(struct kprobe *p) { p->ainsn.insn = NULL; - p->ainsn.boostable = -1; + p->ainsn.boostable = false; return 0; } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 3d1bee9d6a72..9aadff3d0902 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -65,7 +65,10 @@ found: * overwritten by jump destination address. In this case, original * bytes must be recovered from op->optinsn.copied_insn buffer. */ - memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (probe_kernel_read(buf, (void *)addr, + MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) + return 0UL; + if (addr == (unsigned long)kp->addr) { buf[0] = kp->opcode; memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); @@ -174,11 +177,12 @@ NOKPROBE_SYMBOL(optimized_callback); static int copy_optimized_instructions(u8 *dest, u8 *src) { + struct insn insn; int len = 0, ret; while (len < RELATIVEJUMP_SIZE) { - ret = __copy_instruction(dest + len, src + len); - if (!ret || !can_boost(dest + len)) + ret = __copy_instruction(dest + len, src + len, &insn); + if (!ret || !can_boost(&insn, src + len)) return -EINVAL; len += ret; } @@ -350,6 +354,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, } buf = (u8 *)op->optinsn.insn; + set_memory_rw((unsigned long)buf & PAGE_MASK, 1); /* Copy instructions into the out-of-line buffer */ ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); @@ -372,6 +377,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, (u8 *)op->kp.addr + op->optinsn.size); + set_memory_ro((unsigned long)buf & PAGE_MASK, 1); + flush_icache_range((unsigned long) buf, (unsigned long) buf + TMPL_END_IDX + op->optinsn.size + RELATIVEJUMP_SIZE); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 307b1f4543de..857cdbd02867 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -194,19 +194,22 @@ static int arch_update_purgatory(struct kimage *image) /* Setup copying of backup region */ if (image->type == KEXEC_TYPE_CRASH) { - ret = kexec_purgatory_get_set_symbol(image, "backup_dest", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_dest", &image->arch.backup_load_addr, sizeof(image->arch.backup_load_addr), 0); if (ret) return ret; - ret = kexec_purgatory_get_set_symbol(image, "backup_src", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_src", &image->arch.backup_src_start, sizeof(image->arch.backup_src_start), 0); if (ret) return ret; - ret = kexec_purgatory_get_set_symbol(image, "backup_sz", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_sz", &image->arch.backup_src_sz, sizeof(image->arch.backup_src_sz), 0); if (ret) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index f088ea4c66e7..446c8aa09b9b 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -166,11 +166,9 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) spin_lock_irqsave(&desc->lock, flags); /* - * most handlers of type NMI_UNKNOWN never return because - * they just assume the NMI is theirs. Just a sanity check - * to manage expectations + * Indicate if there are multiple registrations on the + * internal NMI handler call chains (SERR and IO_CHECK). */ - WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head)); WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head)); WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head)); @@ -224,17 +222,6 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", reason, smp_processor_id()); - /* - * On some machines, PCI SERR line is used to report memory - * errors. EDAC makes use of it. - */ -#if defined(CONFIG_EDAC) - if (edac_handler_set()) { - edac_atomic_assert_error(); - return; - } -#endif - if (panic_on_unrecovered_nmi) nmi_panic(regs, "NMI: Not continuing"); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f67591561711..0bb88428cbf2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -37,6 +37,7 @@ #include <asm/vm86.h> #include <asm/switch_to.h> #include <asm/desc.h> +#include <asm/prctl.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -124,11 +125,6 @@ void flush_thread(void) fpu__clear(&tsk->thread.fpu); } -static void hard_disable_TSC(void) -{ - cr4_set_bits(X86_CR4_TSD); -} - void disable_TSC(void) { preempt_disable(); @@ -137,15 +133,10 @@ void disable_TSC(void) * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ - hard_disable_TSC(); + cr4_set_bits(X86_CR4_TSD); preempt_enable(); } -static void hard_enable_TSC(void) -{ - cr4_clear_bits(X86_CR4_TSD); -} - static void enable_TSC(void) { preempt_disable(); @@ -154,7 +145,7 @@ static void enable_TSC(void) * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ - hard_enable_TSC(); + cr4_clear_bits(X86_CR4_TSD); preempt_enable(); } @@ -182,54 +173,129 @@ int set_tsc_mode(unsigned int val) return 0; } -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, - struct tss_struct *tss) -{ - struct thread_struct *prev, *next; - - prev = &prev_p->thread; - next = &next_p->thread; +DEFINE_PER_CPU(u64, msr_misc_features_shadow); - if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^ - test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); +static void set_cpuid_faulting(bool on) +{ + u64 msrval; - debugctl &= ~DEBUGCTLMSR_BTF; - if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) - debugctl |= DEBUGCTLMSR_BTF; + msrval = this_cpu_read(msr_misc_features_shadow); + msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; + msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); + this_cpu_write(msr_misc_features_shadow, msrval); + wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval); +} - update_debugctlmsr(debugctl); +static void disable_cpuid(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(true); } + preempt_enable(); +} - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ - test_tsk_thread_flag(next_p, TIF_NOTSC)) { - /* prev and next are different */ - if (test_tsk_thread_flag(next_p, TIF_NOTSC)) - hard_disable_TSC(); - else - hard_enable_TSC(); +static void enable_cpuid(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(false); } + preempt_enable(); +} + +static int get_cpuid_mode(void) +{ + return !test_thread_flag(TIF_NOCPUID); +} + +static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) +{ + if (!static_cpu_has(X86_FEATURE_CPUID_FAULT)) + return -ENODEV; + + if (cpuid_enabled) + enable_cpuid(); + else + disable_cpuid(); + + return 0; +} + +/* + * Called immediately after a successful exec. + */ +void arch_setup_new_exec(void) +{ + /* If cpuid was previously disabled for this task, re-enable it. */ + if (test_thread_flag(TIF_NOCPUID)) + enable_cpuid(); +} - if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { +static inline void switch_to_bitmap(struct tss_struct *tss, + struct thread_struct *prev, + struct thread_struct *next, + unsigned long tifp, unsigned long tifn) +{ + if (tifn & _TIF_IO_BITMAP) { /* * Copy the relevant range of the IO bitmap. * Normally this is 128 bytes or less: */ memcpy(tss->io_bitmap, next->io_bitmap_ptr, max(prev->io_bitmap_max, next->io_bitmap_max)); - /* * Make sure that the TSS limit is correct for the CPU * to notice the IO bitmap. */ refresh_tss_limit(); - } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { + } else if (tifp & _TIF_IO_BITMAP) { /* * Clear any possible leftover bits: */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } +} + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss) +{ + struct thread_struct *prev, *next; + unsigned long tifp, tifn; + + prev = &prev_p->thread; + next = &next_p->thread; + + tifn = READ_ONCE(task_thread_info(next_p)->flags); + tifp = READ_ONCE(task_thread_info(prev_p)->flags); + switch_to_bitmap(tss, prev, next, tifp, tifn); + propagate_user_return_notify(prev_p, next_p); + + if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) && + arch_has_block_step()) { + unsigned long debugctl, msk; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl &= ~DEBUGCTLMSR_BTF; + msk = tifn & _TIF_BLOCKSTEP; + debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + } + + if ((tifp ^ tifn) & _TIF_NOTSC) + cr4_toggle_bits(X86_CR4_TSD); + + if ((tifp ^ tifn) & _TIF_NOCPUID) + set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); } /* @@ -550,3 +616,16 @@ out: put_task_stack(p); return ret; } + +long do_arch_prctl_common(struct task_struct *task, int option, + unsigned long cpuid_enabled) +{ + switch (option) { + case ARCH_GET_CPUID: + return get_cpuid_mode(); + case ARCH_SET_CPUID: + return set_cpuid_mode(task, cpuid_enabled); + } + + return -EINVAL; +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4c818f8bc135..ff40e74c9181 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -37,6 +37,7 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/kdebug.h> +#include <linux/syscalls.h> #include <asm/pgtable.h> #include <asm/ldt.h> @@ -56,6 +57,7 @@ #include <asm/switch_to.h> #include <asm/vm86.h> #include <asm/intel_rdt.h> +#include <asm/proto.h> void __show_regs(struct pt_regs *regs, int all) { @@ -304,3 +306,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } + +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return do_arch_prctl_common(current, option, arg2); +} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index d6b784a5520d..ea1a6180bf39 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -37,6 +37,7 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/ftrace.h> +#include <linux/syscalls.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -204,7 +205,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, (struct user_desc __user *)tls, 0); else #endif - err = do_arch_prctl(p, ARCH_SET_FS, tls); + err = do_arch_prctl_64(p, ARCH_SET_FS, tls); if (err) goto out; } @@ -547,70 +548,72 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) } #endif -long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) +long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) { int ret = 0; int doit = task == current; int cpu; - switch (code) { + switch (option) { case ARCH_SET_GS: - if (addr >= TASK_SIZE_MAX) + if (arg2 >= TASK_SIZE_MAX) return -EPERM; cpu = get_cpu(); task->thread.gsindex = 0; - task->thread.gsbase = addr; + task->thread.gsbase = arg2; if (doit) { load_gs_index(0); - ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); + ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2); } put_cpu(); break; case ARCH_SET_FS: /* Not strictly needed for fs, but do it for symmetry with gs */ - if (addr >= TASK_SIZE_MAX) + if (arg2 >= TASK_SIZE_MAX) return -EPERM; cpu = get_cpu(); task->thread.fsindex = 0; - task->thread.fsbase = addr; + task->thread.fsbase = arg2; if (doit) { /* set the selector to 0 to not confuse __switch_to */ loadsegment(fs, 0); - ret = wrmsrl_safe(MSR_FS_BASE, addr); + ret = wrmsrl_safe(MSR_FS_BASE, arg2); } put_cpu(); break; case ARCH_GET_FS: { unsigned long base; + if (doit) rdmsrl(MSR_FS_BASE, base); else base = task->thread.fsbase; - ret = put_user(base, (unsigned long __user *)addr); + ret = put_user(base, (unsigned long __user *)arg2); break; } case ARCH_GET_GS: { unsigned long base; + if (doit) rdmsrl(MSR_KERNEL_GS_BASE, base); else base = task->thread.gsbase; - ret = put_user(base, (unsigned long __user *)addr); + ret = put_user(base, (unsigned long __user *)arg2); break; } #ifdef CONFIG_CHECKPOINT_RESTORE # ifdef CONFIG_X86_X32_ABI case ARCH_MAP_VDSO_X32: - return prctl_map_vdso(&vdso_image_x32, addr); + return prctl_map_vdso(&vdso_image_x32, arg2); # endif # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case ARCH_MAP_VDSO_32: - return prctl_map_vdso(&vdso_image_32, addr); + return prctl_map_vdso(&vdso_image_32, arg2); # endif case ARCH_MAP_VDSO_64: - return prctl_map_vdso(&vdso_image_64, addr); + return prctl_map_vdso(&vdso_image_64, arg2); #endif default: @@ -621,10 +624,23 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) return ret; } -long sys_arch_prctl(int code, unsigned long addr) +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + long ret; + + ret = do_arch_prctl_64(current, option, arg2); + if (ret == -EINVAL) + ret = do_arch_prctl_common(current, option, arg2); + + return ret; +} + +#ifdef CONFIG_IA32_EMULATION +COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return do_arch_prctl(current, code, addr); + return do_arch_prctl_common(current, option, arg2); } +#endif unsigned long KSTK_ESP(struct task_struct *task) { diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2364b23ea3e5..f37d18124648 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -396,12 +396,12 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; /* - * When changing the segment base, use do_arch_prctl + * When changing the segment base, use do_arch_prctl_64 * to set either thread.fs or thread.fsindex and the * corresponding GDT slot. */ if (child->thread.fsbase != value) - return do_arch_prctl(child, ARCH_SET_FS, value); + return do_arch_prctl_64(child, ARCH_SET_FS, value); return 0; case offsetof(struct user_regs_struct,gs_base): /* @@ -410,7 +410,7 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; if (child->thread.gsbase != value) - return do_arch_prctl(child, ARCH_SET_GS, value); + return do_arch_prctl_64(child, ARCH_SET_GS, value); return 0; #endif } @@ -869,7 +869,7 @@ long arch_ptrace(struct task_struct *child, long request, Works just like arch_prctl, except that the arguments are reversed. */ case PTRACE_ARCH_PRCTL: - ret = do_arch_prctl(child, data, addr); + ret = do_arch_prctl_64(child, data, addr); break; #endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index e244c19a2451..2544700a2a87 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -223,6 +223,22 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "P4S800"), }, }, + { /* Handle problems with rebooting on ASUS EeeBook X205TA */ + .callback = set_acpi_reboot, + .ident = "ASUS EeeBook X205TA", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "X205TA"), + }, + }, + { /* Handle problems with rebooting on ASUS EeeBook X205TAW */ + .callback = set_acpi_reboot, + .ident = "ASUS EeeBook X205TAW", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "X205TAW"), + }, + }, /* Certec */ { /* Handle problems with rebooting on Certec BPC600 */ @@ -749,10 +765,11 @@ void machine_crash_shutdown(struct pt_regs *regs) #endif +/* This is the CPU performing the emergency shutdown work. */ +int crashing_cpu = -1; + #if defined(CONFIG_SMP) -/* This keeps a track of which one is crashing cpu. */ -static int crashing_cpu; static nmi_shootdown_cb shootdown_callback; static atomic_t waiting_for_crash_ipi; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 396c042e9d0e..cc30a74e4adb 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -846,7 +846,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, me->comm, me->pid, where, frame, regs->ip, regs->sp, regs->orig_ax); - print_vma_addr(" in ", regs->ip); + print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index ec1f756f9dc9..71beb28600d4 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -151,8 +151,8 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, if (from->si_signo == SIGSEGV) { if (from->si_code == SEGV_BNDERR) { - compat_uptr_t lower = (unsigned long)&to->si_lower; - compat_uptr_t upper = (unsigned long)&to->si_upper; + compat_uptr_t lower = (unsigned long)from->si_lower; + compat_uptr_t upper = (unsigned long)from->si_upper; put_user_ex(lower, &to->si_lower); put_user_ex(upper, &to->si_upper); } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 9d7223cad389..d798c0da451c 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -125,7 +125,7 @@ static bool smp_no_nmi_ipi = false; static void native_smp_send_reschedule(int cpu) { if (unlikely(cpu_is_offline(cpu))) { - WARN_ON(1); + WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu); return; } apic->send_IPI(cpu, RESCHEDULE_VECTOR); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 948443e115c1..4e496379a871 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -255,7 +255,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", tsk->comm, tsk->pid, str, regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); + print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } @@ -519,7 +519,7 @@ do_general_protection(struct pt_regs *regs, long error_code) pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx", tsk->comm, task_pid_nr(tsk), regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); + print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 46bcda4cb1c2..714dfba6a1e7 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -327,9 +327,16 @@ unsigned long long sched_clock(void) { return paravirt_sched_clock(); } + +bool using_native_sched_clock(void) +{ + return pv_time_ops.sched_clock == native_sched_clock; +} #else unsigned long long sched_clock(void) __attribute__((alias("native_sched_clock"))); + +bool using_native_sched_clock(void) { return true; } #endif int check_tsc_unstable(void) @@ -1112,8 +1119,10 @@ static void tsc_cs_mark_unstable(struct clocksource *cs) { if (tsc_unstable) return; + tsc_unstable = 1; - clear_sched_clock_stable(); + if (using_native_sched_clock()) + clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to clocksource watchdog\n"); } @@ -1135,18 +1144,20 @@ static struct clocksource clocksource_tsc = { void mark_tsc_unstable(char *reason) { - if (!tsc_unstable) { - tsc_unstable = 1; + if (tsc_unstable) + return; + + tsc_unstable = 1; + if (using_native_sched_clock()) clear_sched_clock_stable(); - disable_sched_clock_irqtime(); - pr_info("Marking TSC unstable due to %s\n", reason); - /* Change only the rating, when not registered */ - if (clocksource_tsc.mult) - clocksource_mark_unstable(&clocksource_tsc); - else { - clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; - clocksource_tsc.rating = 0; - } + disable_sched_clock_irqtime(); + pr_info("Marking TSC unstable due to %s\n", reason); + /* Change only the rating, when not registered */ + if (clocksource_tsc.mult) { + clocksource_mark_unstable(&clocksource_tsc); + } else { + clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; + clocksource_tsc.rating = 0; } } @@ -1322,6 +1333,8 @@ static int __init init_tsc_clocksource(void) * the refined calibration and directly register it as a clocksource. */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { + if (boot_cpu_has(X86_FEATURE_ART)) + art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); return 0; } diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 478d15dbaee4..08339262b666 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -82,19 +82,43 @@ static size_t regs_size(struct pt_regs *regs) return sizeof(*regs); } +#ifdef CONFIG_X86_32 +#define GCC_REALIGN_WORDS 3 +#else +#define GCC_REALIGN_WORDS 1 +#endif + static bool is_last_task_frame(struct unwind_state *state) { - unsigned long bp = (unsigned long)state->bp; - unsigned long regs = (unsigned long)task_pt_regs(state->task); + unsigned long *last_bp = (unsigned long *)task_pt_regs(state->task) - 2; + unsigned long *aligned_bp = last_bp - GCC_REALIGN_WORDS; /* * We have to check for the last task frame at two different locations * because gcc can occasionally decide to realign the stack pointer and - * change the offset of the stack frame by a word in the prologue of a - * function called by head/entry code. + * change the offset of the stack frame in the prologue of a function + * called by head/entry code. Examples: + * + * <start_secondary>: + * push %edi + * lea 0x8(%esp),%edi + * and $0xfffffff8,%esp + * pushl -0x4(%edi) + * push %ebp + * mov %esp,%ebp + * + * <x86_64_start_kernel>: + * lea 0x8(%rsp),%r10 + * and $0xfffffffffffffff0,%rsp + * pushq -0x8(%r10) + * push %rbp + * mov %rsp,%rbp + * + * Note that after aligning the stack, it pushes a duplicate copy of + * the return address before pushing the frame pointer. */ - return bp == regs - FRAME_HEADER_SIZE || - bp == regs - FRAME_HEADER_SIZE - sizeof(long); + return (state->bp == last_bp || + (state->bp == aligned_bp && *(aligned_bp+1) == *(last_bp+1))); } /* diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 73ea24d4f119..047b17a26269 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -657,6 +657,9 @@ void kvm_pic_destroy(struct kvm *kvm) { struct kvm_pic *vpic = kvm->arch.vpic; + if (!vpic) + return; + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 6e219e5c07d2..289270a6aecb 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -635,6 +635,9 @@ void kvm_ioapic_destroy(struct kvm *kvm) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; + if (!ioapic) + return; + cancel_delayed_work_sync(&ioapic->eoi_inject); kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); kvm->arch.vioapic = NULL; diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 37942e419c32..60168cdd0546 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -160,6 +160,14 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); } +void kvm_page_track_cleanup(struct kvm *kvm) +{ + struct kvm_page_track_notifier_head *head; + + head = &kvm->arch.track_notifier_head; + cleanup_srcu_struct(&head->track_srcu); +} + void kvm_page_track_init(struct kvm *kvm) { struct kvm_page_track_notifier_head *head; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d1efe2c62b3f..5fba70646c32 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1379,6 +1379,9 @@ static void avic_vm_destroy(struct kvm *kvm) unsigned long flags; struct kvm_arch *vm_data = &kvm->arch; + if (!avic) + return; + avic_free_vm_id(vm_data->avic_vm_id); if (vm_data->avic_logical_id_table_page) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 283aa8601833..259e9b28ccf8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1239,6 +1239,11 @@ static inline bool cpu_has_vmx_invvpid_global(void) return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; } +static inline bool cpu_has_vmx_invvpid(void) +{ + return vmx_capability.vpid & VMX_VPID_INVVPID_BIT; +} + static inline bool cpu_has_vmx_ept(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -2753,7 +2758,6 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_DESC | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | @@ -2781,10 +2785,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * though it is treated as global context. The alternative is * not failing the single-context invvpid, and it is worse. */ - if (enable_vpid) + if (enable_vpid) { + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_VPID; vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | VMX_VPID_EXTENT_SUPPORTED_MASK; - else + } else vmx->nested.nested_vmx_vpid_caps = 0; if (enable_unrestricted_guest) @@ -4024,6 +4030,12 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid); } +static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu) +{ + if (enable_ept) + vmx_flush_tlb(vcpu); +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -6517,8 +6529,10 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); - if (!cpu_has_vmx_vpid()) + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || + !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; + if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) @@ -7258,9 +7272,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) static int handle_vmclear(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 zero = 0; gpa_t vmptr; - struct vmcs12 *vmcs12; - struct page *page; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -7271,22 +7284,9 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vmx); - page = nested_get_page(vcpu, vmptr); - if (page == NULL) { - /* - * For accurate processor emulation, VMCLEAR beyond available - * physical memory should do nothing at all. However, it is - * possible that a nested vmx bug, not a guest hypervisor bug, - * resulted in this case, so let's shut down before doing any - * more damage: - */ - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - return 1; - } - vmcs12 = kmap(page); - vmcs12->launch_state = 0; - kunmap(page); - nested_release_page(page); + kvm_vcpu_write_guest(vcpu, + vmptr + offsetof(struct vmcs12, launch_state), + &zero, sizeof(zero)); nested_free_vmcs02(vmx, vmptr); @@ -8198,6 +8198,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); case EXIT_REASON_PREEMPTION_TIMER: return false; + case EXIT_REASON_PML_FULL: + /* We don't expose PML support to L1. */ + return false; default: return true; } @@ -8515,7 +8518,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu); else { - WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); + vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", + exit_reason); kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -8561,6 +8565,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) } else { sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmx_flush_tlb_ept_only(vcpu); } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); @@ -8586,8 +8591,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) */ if (!is_guest_mode(vcpu) || !nested_cpu_has2(get_vmcs12(&vmx->vcpu), - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { vmcs_write64(APIC_ACCESS_ADDR, hpa); + vmx_flush_tlb_ept_only(vcpu); + } } static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) @@ -9694,10 +9701,8 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, return false; page = nested_get_page(vcpu, vmcs12->msr_bitmap); - if (!page) { - WARN_ON(1); + if (!page) return false; - } msr_bitmap_l1 = (unsigned long *)kmap(page); memset(msr_bitmap_l0, 0xff, PAGE_SIZE); @@ -9990,7 +9995,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; - bool nested_ept_enabled = false; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -10137,8 +10141,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_intr_status); } - nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0; - /* * Write an illegal value to APIC_ACCESS_ADDR. Later, * nested_get_vmcs12_pages will either fix it up or @@ -10268,9 +10270,24 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } + if (enable_pml) { + /* + * Conceptually we want to copy the PML address and index from + * vmcs01 here, and then back to vmcs01 on nested vmexit. But, + * since we always flush the log on each vmexit, this happens + * to be equivalent to simply resetting the fields in vmcs02. + */ + ASSERT(vmx->pml_pg); + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + } + if (nested_cpu_has_ept(vmcs12)) { kvm_mmu_unload(vcpu); nested_ept_init_mmu_context(vcpu); + } else if (nested_cpu_has2(vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + vmx_flush_tlb_ept_only(vcpu); } /* @@ -10298,12 +10315,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmx_set_efer(vcpu, vcpu->arch.efer); /* Shadow page tables on either EPT or shadow page tables. */ - if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_ept_enabled, + if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), entry_failure_code)) return 1; - kvm_mmu_reset_context(vcpu); - if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; @@ -11072,6 +11087,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmx->nested.change_vmcs01_virtual_x2apic_mode = false; vmx_set_virtual_x2apic_mode(vcpu, vcpu->arch.apic_base & X2APIC_ENABLE); + } else if (!nested_cpu_has_ept(vmcs12) && + nested_cpu_has2(vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + vmx_flush_tlb_ept_only(vcpu); } /* This is needed for same reason as it was needed in prepare_vmcs02 */ @@ -11121,8 +11140,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ static void vmx_leave_nested(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) + if (is_guest_mode(vcpu)) { + to_vmx(vcpu)->nested.nested_run_pending = 0; nested_vmx_vmexit(vcpu, -1, 0, 0); + } free_nested(to_vmx(vcpu)); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1faf620a6fdc..ccbd45ecd41a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8153,11 +8153,12 @@ void kvm_arch_destroy_vm(struct kvm *kvm) if (kvm_x86_ops->vm_destroy) kvm_x86_ops->vm_destroy(kvm); kvm_iommu_unmap_guest(kvm); - kfree(kvm->arch.vpic); - kfree(kvm->arch.vioapic); + kvm_pic_destroy(kvm); + kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kvm_mmu_uninit_vm(kvm); + kvm_page_track_cleanup(kvm); } void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, @@ -8566,11 +8567,11 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, { struct x86_exception fault; - trace_kvm_async_pf_ready(work->arch.token, work->gva); if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ else kvm_del_async_pf_gfn(vcpu, work->arch.gfn); + trace_kvm_async_pf_ready(work->arch.token, work->gva); if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 7aa633686295..99472698c931 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -994,7 +994,9 @@ static struct clock_event_device lguest_clockevent = { .mult = 1, .shift = 0, .min_delta_ns = LG_CLOCK_MIN_DELTA, + .min_delta_ticks = LG_CLOCK_MIN_DELTA, .max_delta_ns = LG_CLOCK_MAX_DELTA, + .max_delta_ticks = LG_CLOCK_MAX_DELTA, }; /* diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index a8e91ae89fb3..29df077cb089 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -93,6 +93,13 @@ static void delay_mwaitx(unsigned long __loops) { u64 start, end, delay, loops = __loops; + /* + * Timer value of 0 causes MWAITX to wait indefinitely, unless there + * is a store on the memory monitored by MONITORX. + */ + if (loops == 0) + return; + start = rdtsc_ordered(); for (;;) { diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 779782f58324..9a53a06e5a3e 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -290,7 +290,7 @@ EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled) _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail) - _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail) _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail) diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index c074799bddae..c8c6ad0d58b8 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -4,12 +4,9 @@ * For licencing details see kernel-base/COPYING */ -#include <linux/highmem.h> +#include <linux/uaccess.h> #include <linux/export.h> -#include <asm/word-at-a-time.h> -#include <linux/sched.h> - /* * We rely on the nested NMI work to allow atomic faults from the NMI path; the * nested NMI paths are careful to preserve CR2. @@ -34,52 +31,3 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) return ret; } EXPORT_SYMBOL_GPL(copy_from_user_nmi); - -/** - * copy_to_user: - Copy a block of data into user space. - * @to: Destination address, in user space. - * @from: Source address, in kernel space. - * @n: Number of bytes to copy. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Copy data from kernel space to user space. - * - * Returns number of bytes that could not be copied. - * On success, this will be zero. - */ -unsigned long _copy_to_user(void __user *to, const void *from, unsigned n) -{ - if (access_ok(VERIFY_WRITE, to, n)) - n = __copy_to_user(to, from, n); - return n; -} -EXPORT_SYMBOL(_copy_to_user); - -/** - * copy_from_user: - Copy a block of data from user space. - * @to: Destination address, in kernel space. - * @from: Source address, in user space. - * @n: Number of bytes to copy. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Copy data from user space to kernel space. - * - * Returns number of bytes that could not be copied. - * On success, this will be zero. - * - * If some data could not be copied, this function will pad the copied - * data to the requested size using zero bytes. - */ -unsigned long _copy_from_user(void *to, const void __user *from, unsigned n) -{ - if (access_ok(VERIFY_READ, from, n)) - n = __copy_from_user(to, from, n); - else - memset(to, 0, n); - return n; -} -EXPORT_SYMBOL(_copy_from_user); diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 1f65ff6540f0..bd057a4ffe6e 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -5,12 +5,7 @@ * Copyright 1997 Andi Kleen <ak@muc.de> * Copyright 1997 Linus Torvalds */ -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/blkdev.h> #include <linux/export.h> -#include <linux/backing-dev.h> -#include <linux/interrupt.h> #include <linux/uaccess.h> #include <asm/mmx.h> #include <asm/asm.h> @@ -201,197 +196,6 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size) return size; } -static unsigned long -__copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size) -{ - int d0, d1; - __asm__ __volatile__( - " .align 2,0x90\n" - "0: movl 32(%4), %%eax\n" - " cmpl $67, %0\n" - " jbe 2f\n" - "1: movl 64(%4), %%eax\n" - " .align 2,0x90\n" - "2: movl 0(%4), %%eax\n" - "21: movl 4(%4), %%edx\n" - " movl %%eax, 0(%3)\n" - " movl %%edx, 4(%3)\n" - "3: movl 8(%4), %%eax\n" - "31: movl 12(%4),%%edx\n" - " movl %%eax, 8(%3)\n" - " movl %%edx, 12(%3)\n" - "4: movl 16(%4), %%eax\n" - "41: movl 20(%4), %%edx\n" - " movl %%eax, 16(%3)\n" - " movl %%edx, 20(%3)\n" - "10: movl 24(%4), %%eax\n" - "51: movl 28(%4), %%edx\n" - " movl %%eax, 24(%3)\n" - " movl %%edx, 28(%3)\n" - "11: movl 32(%4), %%eax\n" - "61: movl 36(%4), %%edx\n" - " movl %%eax, 32(%3)\n" - " movl %%edx, 36(%3)\n" - "12: movl 40(%4), %%eax\n" - "71: movl 44(%4), %%edx\n" - " movl %%eax, 40(%3)\n" - " movl %%edx, 44(%3)\n" - "13: movl 48(%4), %%eax\n" - "81: movl 52(%4), %%edx\n" - " movl %%eax, 48(%3)\n" - " movl %%edx, 52(%3)\n" - "14: movl 56(%4), %%eax\n" - "91: movl 60(%4), %%edx\n" - " movl %%eax, 56(%3)\n" - " movl %%edx, 60(%3)\n" - " addl $-64, %0\n" - " addl $64, %4\n" - " addl $64, %3\n" - " cmpl $63, %0\n" - " ja 0b\n" - "5: movl %0, %%eax\n" - " shrl $2, %0\n" - " andl $3, %%eax\n" - " cld\n" - "6: rep; movsl\n" - " movl %%eax,%0\n" - "7: rep; movsb\n" - "8:\n" - ".section .fixup,\"ax\"\n" - "9: lea 0(%%eax,%0,4),%0\n" - "16: pushl %0\n" - " pushl %%eax\n" - " xorl %%eax,%%eax\n" - " rep; stosb\n" - " popl %%eax\n" - " popl %0\n" - " jmp 8b\n" - ".previous\n" - _ASM_EXTABLE(0b,16b) - _ASM_EXTABLE(1b,16b) - _ASM_EXTABLE(2b,16b) - _ASM_EXTABLE(21b,16b) - _ASM_EXTABLE(3b,16b) - _ASM_EXTABLE(31b,16b) - _ASM_EXTABLE(4b,16b) - _ASM_EXTABLE(41b,16b) - _ASM_EXTABLE(10b,16b) - _ASM_EXTABLE(51b,16b) - _ASM_EXTABLE(11b,16b) - _ASM_EXTABLE(61b,16b) - _ASM_EXTABLE(12b,16b) - _ASM_EXTABLE(71b,16b) - _ASM_EXTABLE(13b,16b) - _ASM_EXTABLE(81b,16b) - _ASM_EXTABLE(14b,16b) - _ASM_EXTABLE(91b,16b) - _ASM_EXTABLE(6b,9b) - _ASM_EXTABLE(7b,16b) - : "=&c"(size), "=&D" (d0), "=&S" (d1) - : "1"(to), "2"(from), "0"(size) - : "eax", "edx", "memory"); - return size; -} - -/* - * Non Temporal Hint version of __copy_user_zeroing_intel. It is cache aware. - * hyoshiok@miraclelinux.com - */ - -static unsigned long __copy_user_zeroing_intel_nocache(void *to, - const void __user *from, unsigned long size) -{ - int d0, d1; - - __asm__ __volatile__( - " .align 2,0x90\n" - "0: movl 32(%4), %%eax\n" - " cmpl $67, %0\n" - " jbe 2f\n" - "1: movl 64(%4), %%eax\n" - " .align 2,0x90\n" - "2: movl 0(%4), %%eax\n" - "21: movl 4(%4), %%edx\n" - " movnti %%eax, 0(%3)\n" - " movnti %%edx, 4(%3)\n" - "3: movl 8(%4), %%eax\n" - "31: movl 12(%4),%%edx\n" - " movnti %%eax, 8(%3)\n" - " movnti %%edx, 12(%3)\n" - "4: movl 16(%4), %%eax\n" - "41: movl 20(%4), %%edx\n" - " movnti %%eax, 16(%3)\n" - " movnti %%edx, 20(%3)\n" - "10: movl 24(%4), %%eax\n" - "51: movl 28(%4), %%edx\n" - " movnti %%eax, 24(%3)\n" - " movnti %%edx, 28(%3)\n" - "11: movl 32(%4), %%eax\n" - "61: movl 36(%4), %%edx\n" - " movnti %%eax, 32(%3)\n" - " movnti %%edx, 36(%3)\n" - "12: movl 40(%4), %%eax\n" - "71: movl 44(%4), %%edx\n" - " movnti %%eax, 40(%3)\n" - " movnti %%edx, 44(%3)\n" - "13: movl 48(%4), %%eax\n" - "81: movl 52(%4), %%edx\n" - " movnti %%eax, 48(%3)\n" - " movnti %%edx, 52(%3)\n" - "14: movl 56(%4), %%eax\n" - "91: movl 60(%4), %%edx\n" - " movnti %%eax, 56(%3)\n" - " movnti %%edx, 60(%3)\n" - " addl $-64, %0\n" - " addl $64, %4\n" - " addl $64, %3\n" - " cmpl $63, %0\n" - " ja 0b\n" - " sfence \n" - "5: movl %0, %%eax\n" - " shrl $2, %0\n" - " andl $3, %%eax\n" - " cld\n" - "6: rep; movsl\n" - " movl %%eax,%0\n" - "7: rep; movsb\n" - "8:\n" - ".section .fixup,\"ax\"\n" - "9: lea 0(%%eax,%0,4),%0\n" - "16: pushl %0\n" - " pushl %%eax\n" - " xorl %%eax,%%eax\n" - " rep; stosb\n" - " popl %%eax\n" - " popl %0\n" - " jmp 8b\n" - ".previous\n" - _ASM_EXTABLE(0b,16b) - _ASM_EXTABLE(1b,16b) - _ASM_EXTABLE(2b,16b) - _ASM_EXTABLE(21b,16b) - _ASM_EXTABLE(3b,16b) - _ASM_EXTABLE(31b,16b) - _ASM_EXTABLE(4b,16b) - _ASM_EXTABLE(41b,16b) - _ASM_EXTABLE(10b,16b) - _ASM_EXTABLE(51b,16b) - _ASM_EXTABLE(11b,16b) - _ASM_EXTABLE(61b,16b) - _ASM_EXTABLE(12b,16b) - _ASM_EXTABLE(71b,16b) - _ASM_EXTABLE(13b,16b) - _ASM_EXTABLE(81b,16b) - _ASM_EXTABLE(14b,16b) - _ASM_EXTABLE(91b,16b) - _ASM_EXTABLE(6b,9b) - _ASM_EXTABLE(7b,16b) - : "=&c"(size), "=&D" (d0), "=&S" (d1) - : "1"(to), "2"(from), "0"(size) - : "eax", "edx", "memory"); - return size; -} - static unsigned long __copy_user_intel_nocache(void *to, const void __user *from, unsigned long size) { @@ -486,12 +290,8 @@ static unsigned long __copy_user_intel_nocache(void *to, * Leave these declared but undefined. They should not be any references to * them */ -unsigned long __copy_user_zeroing_intel(void *to, const void __user *from, - unsigned long size); unsigned long __copy_user_intel(void __user *to, const void *from, unsigned long size); -unsigned long __copy_user_zeroing_intel_nocache(void *to, - const void __user *from, unsigned long size); #endif /* CONFIG_X86_INTEL_USERCOPY */ /* Generic arbitrary sized copy. */ @@ -528,47 +328,7 @@ do { \ : "memory"); \ } while (0) -#define __copy_user_zeroing(to, from, size) \ -do { \ - int __d0, __d1, __d2; \ - __asm__ __volatile__( \ - " cmp $7,%0\n" \ - " jbe 1f\n" \ - " movl %1,%0\n" \ - " negl %0\n" \ - " andl $7,%0\n" \ - " subl %0,%3\n" \ - "4: rep; movsb\n" \ - " movl %3,%0\n" \ - " shrl $2,%0\n" \ - " andl $3,%3\n" \ - " .align 2,0x90\n" \ - "0: rep; movsl\n" \ - " movl %3,%0\n" \ - "1: rep; movsb\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "5: addl %3,%0\n" \ - " jmp 6f\n" \ - "3: lea 0(%3,%0,4),%0\n" \ - "6: pushl %0\n" \ - " pushl %%eax\n" \ - " xorl %%eax,%%eax\n" \ - " rep; stosb\n" \ - " popl %%eax\n" \ - " popl %0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(4b,5b) \ - _ASM_EXTABLE(0b,3b) \ - _ASM_EXTABLE(1b,6b) \ - : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \ - : "3"(size), "0"(size), "1"(to), "2"(from) \ - : "memory"); \ -} while (0) - -unsigned long __copy_to_user_ll(void __user *to, const void *from, - unsigned long n) +unsigned long __copy_user_ll(void *to, const void *from, unsigned long n) { stac(); if (movsl_is_ok(to, from, n)) @@ -578,51 +338,7 @@ unsigned long __copy_to_user_ll(void __user *to, const void *from, clac(); return n; } -EXPORT_SYMBOL(__copy_to_user_ll); - -unsigned long __copy_from_user_ll(void *to, const void __user *from, - unsigned long n) -{ - stac(); - if (movsl_is_ok(to, from, n)) - __copy_user_zeroing(to, from, n); - else - n = __copy_user_zeroing_intel(to, from, n); - clac(); - return n; -} -EXPORT_SYMBOL(__copy_from_user_ll); - -unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from, - unsigned long n) -{ - stac(); - if (movsl_is_ok(to, from, n)) - __copy_user(to, from, n); - else - n = __copy_user_intel((void __user *)to, - (const void *)from, n); - clac(); - return n; -} -EXPORT_SYMBOL(__copy_from_user_ll_nozero); - -unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, - unsigned long n) -{ - stac(); -#ifdef CONFIG_X86_INTEL_USERCOPY - if (n > 64 && static_cpu_has(X86_FEATURE_XMM2)) - n = __copy_user_zeroing_intel_nocache(to, from, n); - else - __copy_user_zeroing(to, from, n); -#else - __copy_user_zeroing(to, from, n); -#endif - clac(); - return n; -} -EXPORT_SYMBOL(__copy_from_user_ll_nocache); +EXPORT_SYMBOL(__copy_user_ll); unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, unsigned long n) diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 69873589c0ba..3b7c40a2e3e1 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -54,15 +54,6 @@ unsigned long clear_user(void __user *to, unsigned long n) } EXPORT_SYMBOL(clear_user); -unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len) -{ - if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { - return copy_user_generic((__force void *)to, (__force void *)from, len); - } - return len; -} -EXPORT_SYMBOL(copy_in_user); - /* * Try to copy last bytes and clear the rest if needed. * Since protection fault in copy_from/to_user is not a normal situation, @@ -80,9 +71,5 @@ copy_user_handle_tail(char *to, char *from, unsigned len) break; } clac(); - - /* If the destination is a kernel buffer, we always clear the end */ - if (!__addr_ok(to)) - memset(to, 0, len); return len; } diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 99c7805a9693..1f3b6ef105cd 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -106,32 +106,35 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; - int nr_start = *nr; - pte_t *ptep; + int nr_start = *nr, ret = 0; + pte_t *ptep, *ptem; - ptep = pte_offset_map(&pmd, addr); + /* + * Keep the original mapped PTE value (ptem) around since we + * might increment ptep off the end of the page when finishing + * our loop iteration. + */ + ptem = ptep = pte_offset_map(&pmd, addr); do { pte_t pte = gup_get_pte(ptep); struct page *page; /* Similar to the PMD case, NUMA hinting must take slow path */ - if (pte_protnone(pte)) { - pte_unmap(ptep); - return 0; - } + if (pte_protnone(pte)) + break; + + if (!pte_allows_gup(pte_val(pte), write)) + break; if (pte_devmap(pte)) { pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, pages); - pte_unmap(ptep); - return 0; + break; } - } else if (!pte_allows_gup(pte_val(pte), write) || - pte_special(pte)) { - pte_unmap(ptep); - return 0; - } + } else if (pte_special(pte)) + break; + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); get_page(page); @@ -141,9 +144,11 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, (*nr)++; } while (ptep++, addr += PAGE_SIZE, addr != end); - pte_unmap(ptep - 1); + if (addr == end) + ret = 1; + pte_unmap(ptem); - return 1; + return ret; } static inline void get_head_page_multiple(struct page *page, int nr) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 2193799ca800..138bad2fb6bc 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -643,21 +643,40 @@ void __init init_mem_mapping(void) * devmem_is_allowed() checks to see if /dev/mem access to a certain address * is valid. The argument is a physical page number. * - * - * On x86, access has to be given to the first megabyte of ram because that area - * contains BIOS code and data regions used by X and dosemu and similar apps. - * Access has to be given to non-kernel-ram areas as well, these contain the PCI - * mmio resources as well as potential bios/acpi data regions. + * On x86, access has to be given to the first megabyte of RAM because that + * area traditionally contains BIOS code and data regions used by X, dosemu, + * and similar apps. Since they map the entire memory range, the whole range + * must be allowed (for mapping), but any areas that would otherwise be + * disallowed are flagged as being "zero filled" instead of rejected. + * Access has to be given to non-kernel-ram areas as well, these contain the + * PCI mmio resources as well as potential bios/acpi data regions. */ int devmem_is_allowed(unsigned long pagenr) { - if (pagenr < 256) - return 1; - if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) + if (page_is_ram(pagenr)) { + /* + * For disallowed memory regions in the low 1MB range, + * request that the page be shown as all zeros. + */ + if (pagenr < 256) + return 2; + + return 0; + } + + /* + * This must follow RAM test, since System RAM is considered a + * restricted resource under CONFIG_STRICT_IOMEM. + */ + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) { + /* Low 1MB bypasses iomem restrictions. */ + if (pagenr < 256) + return 1; + return 0; - if (!page_is_ram(pagenr)) - return 1; - return 0; + } + + return 1; } void free_init_pages(char *what, unsigned long begin, unsigned long end) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 66d20174e35f..da92df32d0f1 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -1,3 +1,4 @@ +#define DISABLE_BRANCH_PROFILING #define pr_fmt(fmt) "kasan: " fmt #include <linux/bootmem.h> #include <linux/kasan.h> diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 887e57182716..aed206475aa7 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -48,7 +48,7 @@ static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; #if defined(CONFIG_X86_ESPFIX64) static const unsigned long vaddr_end = ESPFIX_BASE_ADDR; #elif defined(CONFIG_EFI) -static const unsigned long vaddr_end = EFI_VA_START; +static const unsigned long vaddr_end = EFI_VA_END; #else static const unsigned long vaddr_end = __START_KERNEL_map; #endif @@ -105,7 +105,7 @@ void __init kernel_randomize_memory(void) */ BUILD_BUG_ON(vaddr_start >= vaddr_end); BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) && - vaddr_end >= EFI_VA_START); + vaddr_end >= EFI_VA_END); BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) || IS_ENABLED(CONFIG_EFI)) && vaddr_end >= __START_KERNEL_map); diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 5126dfd52b18..cd44ae727df7 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -590,7 +590,7 @@ static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm, * we might run off the end of the bounds table if we are on * a 64-bit kernel and try to get 8 bytes. */ -int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret, +static int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret, long __user *bd_entry_ptr) { u32 bd_entry_32; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 0cb52ae0a8f0..190e718694b1 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -735,6 +735,15 @@ void pcibios_disable_device (struct pci_dev *dev) pcibios_disable_irq(dev); } +#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC +void pcibios_release_device(struct pci_dev *dev) +{ + if (atomic_dec_return(&dev->enable_cnt) >= 0) + pcibios_disable_device(dev); + +} +#endif + int pci_ext_cfg_avail(void) { if (raw_pci_ext_ops) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index e1fb269c87af..292ab0364a89 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -234,23 +234,14 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 1; for_each_pci_msi_entry(msidesc, dev) { - __pci_read_msi_msg(msidesc, &msg); - pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | - ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); - if (msg.data != XEN_PIRQ_MSI_DATA || - xen_irq_from_pirq(pirq) < 0) { - pirq = xen_allocate_pirq_msi(dev, msidesc); - if (pirq < 0) { - irq = -ENODEV; - goto error; - } - xen_msi_compose_msg(dev, pirq, &msg); - __pci_write_msi_msg(msidesc, &msg); - dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq); - } else { - dev_dbg(&dev->dev, - "xen: msi already bound to pirq=%d\n", pirq); + pirq = xen_allocate_pirq_msi(dev, msidesc); + if (pirq < 0) { + irq = -ENODEV; + goto error; } + xen_msi_compose_msg(dev, pirq, &msg); + __pci_write_msi_msg(msidesc, &msg); + dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq); irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, (type == PCI_CAP_ID_MSI) ? nvec : 1, (type == PCI_CAP_ID_MSIX) ? diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile index 066619b0700c..f1d83b34c329 100644 --- a/arch/x86/platform/efi/Makefile +++ b/arch/x86/platform/efi/Makefile @@ -1,6 +1,5 @@ OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o -obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c deleted file mode 100644 index 04ca8764f0c0..000000000000 --- a/arch/x86/platform/efi/efi-bgrt.c +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2012 Intel Corporation - * Author: Josh Triplett <josh@joshtriplett.org> - * - * Based on the bgrt driver: - * Copyright 2012 Red Hat, Inc <mjg@redhat.com> - * Author: Matthew Garrett - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/acpi.h> -#include <linux/efi.h> -#include <linux/efi-bgrt.h> - -struct acpi_table_bgrt bgrt_tab; -size_t __initdata bgrt_image_size; - -struct bmp_header { - u16 id; - u32 size; -} __packed; - -void __init efi_bgrt_init(struct acpi_table_header *table) -{ - void *image; - struct bmp_header bmp_header; - struct acpi_table_bgrt *bgrt = &bgrt_tab; - - if (acpi_disabled) - return; - - if (table->length < sizeof(bgrt_tab)) { - pr_notice("Ignoring BGRT: invalid length %u (expected %zu)\n", - table->length, sizeof(bgrt_tab)); - return; - } - *bgrt = *(struct acpi_table_bgrt *)table; - if (bgrt->version != 1) { - pr_notice("Ignoring BGRT: invalid version %u (expected 1)\n", - bgrt->version); - goto out; - } - if (bgrt->status & 0xfe) { - pr_notice("Ignoring BGRT: reserved status bits are non-zero %u\n", - bgrt->status); - goto out; - } - if (bgrt->image_type != 0) { - pr_notice("Ignoring BGRT: invalid image type %u (expected 0)\n", - bgrt->image_type); - goto out; - } - if (!bgrt->image_address) { - pr_notice("Ignoring BGRT: null image address\n"); - goto out; - } - - image = early_memremap(bgrt->image_address, sizeof(bmp_header)); - if (!image) { - pr_notice("Ignoring BGRT: failed to map image header memory\n"); - goto out; - } - - memcpy(&bmp_header, image, sizeof(bmp_header)); - early_memunmap(image, sizeof(bmp_header)); - if (bmp_header.id != 0x4d42) { - pr_notice("Ignoring BGRT: Incorrect BMP magic number 0x%x (expected 0x4d42)\n", - bmp_header.id); - goto out; - } - bgrt_image_size = bmp_header.size; - efi_mem_reserve(bgrt->image_address, bgrt_image_size); - - return; -out: - memset(bgrt, 0, sizeof(bgrt_tab)); -} diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 2ee7694362a5..642a8698ad61 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -47,7 +47,7 @@ #include <asm/pgalloc.h> /* - * We allocate runtime services regions bottom-up, starting from -4G, i.e. + * We allocate runtime services regions top-down, starting from -4G, i.e. * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G. */ static u64 efi_va = EFI_VA_START; diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 3c8d8e511fd4..26615991d69c 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -203,6 +203,10 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) return; } + /* No need to reserve regions that will never be freed. */ + if (md.attribute & EFI_MEMORY_RUNTIME) + return; + size += addr % EFI_PAGE_SIZE; size = round_up(size, EFI_PAGE_SIZE); addr = round_down(addr, EFI_PAGE_SIZE); diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index a7dbec4dce27..3dbde04febdc 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -26,5 +26,6 @@ obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_pcal9555a.o obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o # MISC Devices obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o +obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_mrfld_power_btn.o obj-$(subst m,y,$(CONFIG_RTC_DRV_CMOS)) += platform_mrfld_rtc.o obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_mrfld_wdt.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c new file mode 100644 index 000000000000..a6c3705a28ad --- /dev/null +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c @@ -0,0 +1,82 @@ +/* + * Intel Merrifield power button support + * + * (C) Copyright 2017 Intel Corporation + * + * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/init.h> +#include <linux/ioport.h> +#include <linux/platform_device.h> +#include <linux/sfi.h> + +#include <asm/intel-mid.h> +#include <asm/intel_scu_ipc.h> + +static struct resource mrfld_power_btn_resources[] = { + { + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mrfld_power_btn_dev = { + .name = "msic_power_btn", + .id = PLATFORM_DEVID_NONE, + .num_resources = ARRAY_SIZE(mrfld_power_btn_resources), + .resource = mrfld_power_btn_resources, +}; + +static int mrfld_power_btn_scu_status_change(struct notifier_block *nb, + unsigned long code, void *data) +{ + if (code == SCU_DOWN) { + platform_device_unregister(&mrfld_power_btn_dev); + return 0; + } + + return platform_device_register(&mrfld_power_btn_dev); +} + +static struct notifier_block mrfld_power_btn_scu_notifier = { + .notifier_call = mrfld_power_btn_scu_status_change, +}; + +static int __init register_mrfld_power_btn(void) +{ + if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER) + return -ENODEV; + + /* + * We need to be sure that the SCU IPC is ready before + * PMIC power button device can be registered: + */ + intel_scu_notifier_add(&mrfld_power_btn_scu_notifier); + + return 0; +} +arch_initcall(register_mrfld_power_btn); + +static void __init *mrfld_power_btn_platform_data(void *info) +{ + struct resource *res = mrfld_power_btn_resources; + struct sfi_device_table_entry *pentry = info; + + res->start = res->end = pentry->irq; + return NULL; +} + +static const struct devs_id mrfld_power_btn_dev_id __initconst = { + .name = "bcove_power_btn", + .type = SFI_DEV_TYPE_IPC, + .delay = 1, + .msic = 1, + .get_platform_data = &mrfld_power_btn_platform_data, +}; + +sfi_device(mrfld_power_btn_dev_id); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c index 86edd1e941eb..9e304e2ea4f5 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c @@ -19,7 +19,7 @@ #include <asm/intel_scu_ipc.h> #include <asm/io_apic.h> -#define TANGIER_EXT_TIMER0_MSI 15 +#define TANGIER_EXT_TIMER0_MSI 12 static struct platform_device wdt_dev = { .name = "intel_mid_wdt", diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c index e793fe509971..e42978d4deaf 100644 --- a/arch/x86/platform/intel-mid/mfld.c +++ b/arch/x86/platform/intel-mid/mfld.c @@ -17,16 +17,6 @@ #include "intel_mid_weak_decls.h" -static void penwell_arch_setup(void); -/* penwell arch ops */ -static struct intel_mid_ops penwell_ops = { - .arch_setup = penwell_arch_setup, -}; - -static void mfld_power_off(void) -{ -} - static unsigned long __init mfld_calibrate_tsc(void) { unsigned long fast_calibrate; @@ -63,9 +53,12 @@ static unsigned long __init mfld_calibrate_tsc(void) static void __init penwell_arch_setup(void) { x86_platform.calibrate_tsc = mfld_calibrate_tsc; - pm_power_off = mfld_power_off; } +static struct intel_mid_ops penwell_ops = { + .arch_setup = penwell_arch_setup, +}; + void *get_penwell_ops(void) { return &penwell_ops; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 766d4d3529a1..f25982cdff90 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1847,7 +1847,6 @@ static void pq_init(int node, int pnode) ops.write_payload_first(pnode, first); ops.write_payload_last(pnode, last); - ops.write_g_sw_ack(pnode, 0xffffUL); /* in effect, all msg_type's are set to MSG_NOOP */ memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE); diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 2ee7632d4916..b082d71b08ee 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -390,9 +390,11 @@ static __init int uv_rtc_setup_clock(void) clock_event_device_uv.min_delta_ns = NSEC_PER_SEC / sn_rtc_cycles_per_second; + clock_event_device_uv.min_delta_ticks = 1; clock_event_device_uv.max_delta_ns = clocksource_uv.mask * (NSEC_PER_SEC / sn_rtc_cycles_per_second); + clock_event_device_uv.max_delta_ticks = clocksource_uv.mask; rc = schedule_on_each_cpu(uv_rtc_register_clockevents); if (rc) { diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 555b9fa0ad43..7dbdb780264d 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -8,6 +8,7 @@ PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib targets += purgatory.ro +KASAN_SANITIZE := n KCOV_INSTRUMENT := n # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index 25e068ba3382..470edad96bb9 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -10,21 +10,19 @@ * Version 2. See the file COPYING for more details. */ +#include <linux/bug.h> +#include <asm/purgatory.h> + #include "sha256.h" #include "../boot/string.h" -struct sha_region { - unsigned long start; - unsigned long len; -}; - -unsigned long backup_dest = 0; -unsigned long backup_src = 0; -unsigned long backup_sz = 0; +unsigned long purgatory_backup_dest __section(.kexec-purgatory); +unsigned long purgatory_backup_src __section(.kexec-purgatory); +unsigned long purgatory_backup_sz __section(.kexec-purgatory); -u8 sha256_digest[SHA256_DIGEST_SIZE] = { 0 }; +u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE] __section(.kexec-purgatory); -struct sha_region sha_regions[16] = {}; +struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX] __section(.kexec-purgatory); /* * On x86, second kernel requries first 640K of memory to boot. Copy @@ -33,26 +31,28 @@ struct sha_region sha_regions[16] = {}; */ static int copy_backup_region(void) { - if (backup_dest) - memcpy((void *)backup_dest, (void *)backup_src, backup_sz); - + if (purgatory_backup_dest) { + memcpy((void *)purgatory_backup_dest, + (void *)purgatory_backup_src, purgatory_backup_sz); + } return 0; } -int verify_sha256_digest(void) +static int verify_sha256_digest(void) { - struct sha_region *ptr, *end; + struct kexec_sha_region *ptr, *end; u8 digest[SHA256_DIGEST_SIZE]; struct sha256_state sctx; sha256_init(&sctx); - end = &sha_regions[sizeof(sha_regions)/sizeof(sha_regions[0])]; - for (ptr = sha_regions; ptr < end; ptr++) + end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions); + + for (ptr = purgatory_sha_regions; ptr < end; ptr++) sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len); sha256_final(&sctx, digest); - if (memcmp(digest, sha256_digest, sizeof(digest))) + if (memcmp(digest, purgatory_sha256_digest, sizeof(digest))) return 1; return 0; diff --git a/arch/x86/purgatory/setup-x86_64.S b/arch/x86/purgatory/setup-x86_64.S index fe3c91ba1bd0..dfae9b9e60b5 100644 --- a/arch/x86/purgatory/setup-x86_64.S +++ b/arch/x86/purgatory/setup-x86_64.S @@ -9,6 +9,7 @@ * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. */ +#include <asm/purgatory.h> .text .globl purgatory_start diff --git a/arch/x86/purgatory/sha256.h b/arch/x86/purgatory/sha256.h index bd15a4127735..2867d9825a57 100644 --- a/arch/x86/purgatory/sha256.h +++ b/arch/x86/purgatory/sha256.h @@ -10,7 +10,6 @@ #ifndef SHA256_H #define SHA256_H - #include <linux/types.h> #include <crypto/sha.h> diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index 0bc60a308730..2a2d89d39af6 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -7,3 +7,17 @@ config MCE_AMD_INJ aspects of the MCE handling code. WARNING: Do not even assume this interface is staying stable! + +config RAS_CEC + bool "Correctable Errors Collector" + depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS + ---help--- + This is a small cache which collects correctable memory errors per 4K + page PFN and counts their repeated occurrence. Once the counter for a + PFN overflows, we try to soft-offline that page as we take it to mean + that it has reached a relatively high error count and would probably + be best if we don't use it anymore. + + Bear in mind that this is absolutely useless if your platform doesn't + have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS. + diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index e7e7055a8658..69f0827d5f53 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -16,7 +16,7 @@ obj-y = bug.o bugs_$(BITS).o delay.o fault.o ldt.o \ ifeq ($(CONFIG_X86_32),y) -obj-y += checksum_32.o +obj-y += checksum_32.o syscalls_32.o obj-$(CONFIG_ELF_CORE) += elfcore.o subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h index e59eef20647b..b291ca5cf66b 100644 --- a/arch/x86/um/asm/ptrace.h +++ b/arch/x86/um/asm/ptrace.h @@ -78,7 +78,7 @@ static inline int ptrace_set_thread_area(struct task_struct *child, int idx, return -ENOSYS; } -extern long arch_prctl(struct task_struct *task, int code, +extern long arch_prctl(struct task_struct *task, int option, unsigned long __user *addr); #endif diff --git a/arch/x86/um/os-Linux/prctl.c b/arch/x86/um/os-Linux/prctl.c index 96eb2bd28832..8431e87ac333 100644 --- a/arch/x86/um/os-Linux/prctl.c +++ b/arch/x86/um/os-Linux/prctl.c @@ -6,7 +6,7 @@ #include <sys/ptrace.h> #include <asm/ptrace.h> -int os_arch_prctl(int pid, int code, unsigned long *addr) +int os_arch_prctl(int pid, int option, unsigned long *arg2) { - return ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long) addr, code); + return ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long) arg2, option); } diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c new file mode 100644 index 000000000000..627d68836b16 --- /dev/null +++ b/arch/x86/um/syscalls_32.c @@ -0,0 +1,7 @@ +#include <linux/syscalls.h> +#include <os.h> + +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return -EINVAL; +} diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c index 10d907098c26..58f51667e2e4 100644 --- a/arch/x86/um/syscalls_64.c +++ b/arch/x86/um/syscalls_64.c @@ -7,13 +7,15 @@ #include <linux/sched.h> #include <linux/sched/mm.h> +#include <linux/syscalls.h> #include <linux/uaccess.h> #include <asm/prctl.h> /* XXX This should get the constants from libc */ #include <os.h> -long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) +long arch_prctl(struct task_struct *task, int option, + unsigned long __user *arg2) { - unsigned long *ptr = addr, tmp; + unsigned long *ptr = arg2, tmp; long ret; int pid = task->mm->context.id.u.pid; @@ -30,7 +32,7 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) * arch_prctl is run on the host, then the registers are read * back. */ - switch (code) { + switch (option) { case ARCH_SET_FS: case ARCH_SET_GS: ret = restore_registers(pid, ¤t->thread.regs.regs); @@ -50,11 +52,11 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) ptr = &tmp; } - ret = os_arch_prctl(pid, code, ptr); + ret = os_arch_prctl(pid, option, ptr); if (ret) return ret; - switch (code) { + switch (option) { case ARCH_SET_FS: current->thread.arch.fs = (unsigned long) ptr; ret = save_registers(pid, ¤t->thread.regs.regs); @@ -63,19 +65,19 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) ret = save_registers(pid, ¤t->thread.regs.regs); break; case ARCH_GET_FS: - ret = put_user(tmp, addr); + ret = put_user(tmp, arg2); break; case ARCH_GET_GS: - ret = put_user(tmp, addr); + ret = put_user(tmp, arg2); break; } return ret; } -long sys_arch_prctl(int code, unsigned long addr) +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return arch_prctl(current, code, (unsigned long __user *) addr); + return arch_prctl(current, option, (unsigned long __user *) arg2); } void arch_switch_to(struct task_struct *to) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 1e69956d7852..7a3089285c59 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -209,7 +209,9 @@ static const struct clock_event_device xen_timerop_clockevent = { .features = CLOCK_EVT_FEAT_ONESHOT, .max_delta_ns = 0xffffffff, + .max_delta_ticks = 0xffffffff, .min_delta_ns = TIMER_SLOP, + .min_delta_ticks = TIMER_SLOP, .mult = 1, .shift = 0, @@ -268,7 +270,9 @@ static const struct clock_event_device xen_vcpuop_clockevent = { .features = CLOCK_EVT_FEAT_ONESHOT, .max_delta_ns = 0xffffffff, + .max_delta_ticks = 0xffffffff, .min_delta_ns = TIMER_SLOP, + .min_delta_ticks = TIMER_SLOP, .mult = 1, .shift = 0, |