summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig10
-rw-r--r--arch/x86/boot/Makefile7
-rw-r--r--arch/x86/boot/compressed/Makefile3
-rw-r--r--arch/x86/entry/vdso/Makefile3
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c2
-rw-r--r--arch/x86/events/Makefile1
-rw-r--r--arch/x86/events/amd/ibs.c37
-rw-r--r--arch/x86/events/amd/iommu.c5
-rw-r--r--arch/x86/events/amd/power.c353
-rw-r--r--arch/x86/events/core.c4
-rw-r--r--arch/x86/events/intel/cqm.c454
-rw-r--r--arch/x86/events/intel/ds.c5
-rw-r--r--arch/x86/events/intel/lbr.c2
-rw-r--r--arch/x86/events/intel/rapl.c2
-rw-r--r--arch/x86/events/intel/uncore_snbep.c7
-rw-r--r--arch/x86/events/perf_event.h2
-rw-r--r--arch/x86/include/asm/apic.h2
-rw-r--r--arch/x86/include/asm/compat.h3
-rw-r--r--arch/x86/include/asm/cpufeatures.h4
-rw-r--r--arch/x86/include/asm/ftrace.h4
-rw-r--r--arch/x86/include/asm/hw_irq.h15
-rw-r--r--arch/x86/include/asm/kvm_host.h14
-rw-r--r--arch/x86/include/asm/msr.h8
-rw-r--r--arch/x86/include/asm/pgtable.h6
-rw-r--r--arch/x86/include/asm/rwsem.h2
-rw-r--r--arch/x86/include/asm/special_insns.h16
-rw-r--r--arch/x86/include/asm/string_64.h4
-rw-r--r--arch/x86/include/asm/uaccess.h5
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h2
-rw-r--r--arch/x86/kernel/Makefile6
-rw-r--r--arch/x86/kernel/acpi/boot.c4
-rw-r--r--arch/x86/kernel/apb_timer.c2
-rw-r--r--arch/x86/kernel/apic/Makefile4
-rw-r--r--arch/x86/kernel/apic/apic.c2
-rw-r--r--arch/x86/kernel/apic/vector.c88
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c5
-rw-r--r--arch/x86/kernel/apm_32.c2
-rw-r--r--arch/x86/kernel/cpu/Makefile4
-rw-r--r--arch/x86/kernel/cpu/amd.c20
-rw-r--r--arch/x86/kernel/cpu/common.c8
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/early_printk.c2
-rw-r--r--arch/x86/kernel/fpu/regset.c2
-rw-r--r--arch/x86/kernel/ftrace.c2
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/ioport.c12
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c2
-rw-r--r--arch/x86/kernel/kgdb.c4
-rw-r--r--arch/x86/kernel/kvm.c37
-rw-r--r--arch/x86/kernel/kvmclock.c2
-rw-r--r--arch/x86/kernel/process_64.c14
-rw-r--r--arch/x86/kernel/smpboot.c27
-rw-r--r--arch/x86/kernel/tsc.c8
-rw-r--r--arch/x86/kvm/cpuid.c63
-rw-r--r--arch/x86/kvm/cpuid.h8
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h5
-rw-r--r--arch/x86/kvm/mmu.c106
-rw-r--r--arch/x86/kvm/mmu.h42
-rw-r--r--arch/x86/kvm/page_track.c9
-rw-r--r--arch/x86/kvm/paging_tmpl.h30
-rw-r--r--arch/x86/kvm/svm.c8
-rw-r--r--arch/x86/kvm/vmx.c67
-rw-r--r--arch/x86/kvm/x86.c26
-rw-r--r--arch/x86/kvm/x86.h3
-rw-r--r--arch/x86/lib/Makefile3
-rw-r--r--arch/x86/lib/memcpy_64.S7
-rw-r--r--arch/x86/lib/memset_64.S2
-rw-r--r--arch/x86/mm/Makefile3
-rw-r--r--arch/x86/mm/extable.c108
-rw-r--r--arch/x86/mm/mpx.c4
-rw-r--r--arch/x86/mm/pat.c2
-rw-r--r--arch/x86/oprofile/nmi_int.c3
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_bma023.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_emc1403.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_lis331.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_max7315.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_tca6416.c2
-rw-r--r--arch/x86/purgatory/stack.S2
-rw-r--r--arch/x86/realmode/rm/Makefile3
-rw-r--r--arch/x86/video/fbdev.c18
-rw-r--r--arch/x86/xen/enlighten.c2
-rw-r--r--arch/x86/xen/mmu.c8
-rw-r--r--arch/x86/xen/xen-head.S2
94 files changed, 1451 insertions, 358 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8b680a5cb25b..2dc18605831f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,6 +28,7 @@ config X86
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
+ select ARCH_HAS_KCOV if X86_64
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_MMIO_FLUSH
select ARCH_HAS_SG_CHAIN
@@ -1209,6 +1210,15 @@ config MICROCODE_OLD_INTERFACE
def_bool y
depends on MICROCODE
+config PERF_EVENTS_AMD_POWER
+ depends on PERF_EVENTS && CPU_SUP_AMD
+ tristate "AMD Processor Power Reporting Mechanism"
+ ---help---
+ Provide power reporting mechanism support for AMD processors.
+ Currently, it leverages X86_FEATURE_ACC_POWER
+ (CPUID Fn8000_0007_EDX[12]) interface to calculate the
+ average power consumption on Family 15h processors.
+
config X86_MSR
tristate "/dev/cpu/*/msr - Model-specific register support"
---help---
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 0bf6749522d9..b1ef9e489084 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -12,6 +12,13 @@
KASAN_SANITIZE := n
OBJECT_FILES_NON_STANDARD := y
+# Kernel does not boot with kcov instrumentation here.
+# One of the problems observed was insertion of __sanitizer_cov_trace_pc()
+# callback into middle of per-cpu data enabling code. Thus the callback observed
+# inconsistent state and crashed. We are interested mostly in syscall coverage,
+# so boot code is not interesting anyway.
+KCOV_INSTRUMENT := n
+
# If you want to preset the SVGA mode, uncomment the next line and
# set SVGA_MODE to whatever number you want.
# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 5e1d26e09407..6915ff2bd996 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -19,6 +19,9 @@
KASAN_SANITIZE := n
OBJECT_FILES_NON_STANDARD := y
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
+
targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index f9fb859c98b9..6874da5f67fc 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -7,6 +7,9 @@ KASAN_SANITIZE := n
UBSAN_SANITIZE := n
OBJECT_FILES_NON_STANDARD := y
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
+
VDSO64-$(CONFIG_X86_64) := y
VDSOX32-$(CONFIG_X86_X32_ABI) := y
VDSO32-$(CONFIG_X86_32) := y
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 1a50e09c945b..03c3eb77bfce 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -178,7 +178,7 @@ notrace static cycle_t vread_tsc(void)
/*
* GCC likes to generate cmov here, but this branch is extremely
- * predictable (it's just a funciton of time and the likely is
+ * predictable (it's just a function of time and the likely is
* very likely) and there's a data dependence, so force GCC
* to generate a branch instead. I don't barrier() because
* we don't actually need a barrier, and if this function
diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
index fdfea1511cc0..f59618a39990 100644
--- a/arch/x86/events/Makefile
+++ b/arch/x86/events/Makefile
@@ -1,6 +1,7 @@
obj-y += core.o
obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o amd/uncore.o
+obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += amd/power.o
obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o
ifdef CONFIG_AMD_IOMMU
obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 51087c29b2c2..3ea25c3917c0 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -376,7 +376,13 @@ static void perf_ibs_start(struct perf_event *event, int flags)
hwc->state = 0;
perf_ibs_set_period(perf_ibs, hwc, &period);
+ /*
+ * Set STARTED before enabling the hardware, such that
+ * a subsequent NMI must observe it. Then clear STOPPING
+ * such that we don't consume NMIs by accident.
+ */
set_bit(IBS_STARTED, pcpu->state);
+ clear_bit(IBS_STOPPING, pcpu->state);
perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
perf_event_update_userpage(event);
@@ -390,7 +396,7 @@ static void perf_ibs_stop(struct perf_event *event, int flags)
u64 config;
int stopping;
- stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
+ stopping = test_bit(IBS_STARTED, pcpu->state);
if (!stopping && (hwc->state & PERF_HES_UPTODATE))
return;
@@ -398,8 +404,24 @@ static void perf_ibs_stop(struct perf_event *event, int flags)
rdmsrl(hwc->config_base, config);
if (stopping) {
+ /*
+ * Set STOPPING before disabling the hardware, such that it
+ * must be visible to NMIs the moment we clear the EN bit,
+ * at which point we can generate an !VALID sample which
+ * we need to consume.
+ */
set_bit(IBS_STOPPING, pcpu->state);
perf_ibs_disable_event(perf_ibs, hwc, config);
+ /*
+ * Clear STARTED after disabling the hardware; if it were
+ * cleared before an NMI hitting after the clear but before
+ * clearing the EN bit might think it a spurious NMI and not
+ * handle it.
+ *
+ * Clearing it after, however, creates the problem of the NMI
+ * handler seeing STARTED but not having a valid sample.
+ */
+ clear_bit(IBS_STARTED, pcpu->state);
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
}
@@ -527,20 +549,24 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
u64 *buf, *config, period;
if (!test_bit(IBS_STARTED, pcpu->state)) {
+fail:
/*
* Catch spurious interrupts after stopping IBS: After
* disabling IBS there could be still incoming NMIs
* with samples that even have the valid bit cleared.
* Mark all this NMIs as handled.
*/
- return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
+ if (test_and_clear_bit(IBS_STOPPING, pcpu->state))
+ return 1;
+
+ return 0;
}
msr = hwc->config_base;
buf = ibs_data.regs;
rdmsrl(msr, *buf);
if (!(*buf++ & perf_ibs->valid_mask))
- return 0;
+ goto fail;
config = &ibs_data.regs[0];
perf_ibs_event_update(perf_ibs, event, config);
@@ -599,7 +625,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
throttle = perf_event_overflow(event, &data, &regs);
out:
if (throttle)
- perf_ibs_disable_event(perf_ibs, hwc, *config);
+ perf_ibs_stop(event, 0);
else
perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
@@ -611,6 +637,7 @@ out:
static int
perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
+ u64 stamp = sched_clock();
int handled = 0;
handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
@@ -619,6 +646,8 @@ perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
if (handled)
inc_irq_stat(apic_perf_irqs);
+ perf_sample_event_took(sched_clock() - stamp);
+
return handled;
}
NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
index 635e5eba0caf..40625ca7a190 100644
--- a/arch/x86/events/amd/iommu.c
+++ b/arch/x86/events/amd/iommu.c
@@ -118,6 +118,11 @@ static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"),
AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"),
AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"),
+ AMD_IOMMU_EVENT_DESC(ign_rd_wr_mmio_1ff8h, "csource=0x14"),
+ AMD_IOMMU_EVENT_DESC(vapic_int_non_guest, "csource=0x15"),
+ AMD_IOMMU_EVENT_DESC(vapic_int_guest, "csource=0x16"),
+ AMD_IOMMU_EVENT_DESC(smi_recv, "csource=0x17"),
+ AMD_IOMMU_EVENT_DESC(smi_blk, "csource=0x18"),
{ /* end: all zeroes */ },
};
diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c
new file mode 100644
index 000000000000..55a3529dbf12
--- /dev/null
+++ b/arch/x86/events/amd/power.c
@@ -0,0 +1,353 @@
+/*
+ * Performance events - AMD Processor Power Reporting Mechanism
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Huang Rui <ray.huang@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "../perf_event.h"
+
+#define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a
+#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b
+#define MSR_F15H_PTSC 0xc0010280
+
+/* Event code: LSB 8 bits, passed in attr->config any other bit is reserved. */
+#define AMD_POWER_EVENT_MASK 0xFFULL
+
+/*
+ * Accumulated power status counters.
+ */
+#define AMD_POWER_EVENTSEL_PKG 1
+
+/*
+ * The ratio of compute unit power accumulator sample period to the
+ * PTSC period.
+ */
+static unsigned int cpu_pwr_sample_ratio;
+
+/* Maximum accumulated power of a compute unit. */
+static u64 max_cu_acc_power;
+
+static struct pmu pmu_class;
+
+/*
+ * Accumulated power represents the sum of each compute unit's (CU) power
+ * consumption. On any core of each CU we read the total accumulated power from
+ * MSR_F15H_CU_PWR_ACCUMULATOR. cpu_mask represents CPU bit map of all cores
+ * which are picked to measure the power for the CUs they belong to.
+ */
+static cpumask_t cpu_mask;
+
+static void event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev_pwr_acc, new_pwr_acc, prev_ptsc, new_ptsc;
+ u64 delta, tdelta;
+
+ prev_pwr_acc = hwc->pwr_acc;
+ prev_ptsc = hwc->ptsc;
+ rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, new_pwr_acc);
+ rdmsrl(MSR_F15H_PTSC, new_ptsc);
+
+ /*
+ * Calculate the CU power consumption over a time period, the unit of
+ * final value (delta) is micro-Watts. Then add it to the event count.
+ */
+ if (new_pwr_acc < prev_pwr_acc) {
+ delta = max_cu_acc_power + new_pwr_acc;
+ delta -= prev_pwr_acc;
+ } else
+ delta = new_pwr_acc - prev_pwr_acc;
+
+ delta *= cpu_pwr_sample_ratio * 1000;
+ tdelta = new_ptsc - prev_ptsc;
+
+ do_div(delta, tdelta);
+ local64_add(delta, &event->count);
+}
+
+static void __pmu_event_start(struct perf_event *event)
+{
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+
+ rdmsrl(MSR_F15H_PTSC, event->hw.ptsc);
+ rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, event->hw.pwr_acc);
+}
+
+static void pmu_event_start(struct perf_event *event, int mode)
+{
+ __pmu_event_start(event);
+}
+
+static void pmu_event_stop(struct perf_event *event, int mode)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ /* Mark event as deactivated and stopped. */
+ if (!(hwc->state & PERF_HES_STOPPED))
+ hwc->state |= PERF_HES_STOPPED;
+
+ /* Check if software counter update is necessary. */
+ if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of an event
+ * that we are disabling:
+ */
+ event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+static int pmu_event_add(struct perf_event *event, int mode)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ if (mode & PERF_EF_START)
+ __pmu_event_start(event);
+
+ return 0;
+}
+
+static void pmu_event_del(struct perf_event *event, int flags)
+{
+ pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int pmu_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config & AMD_POWER_EVENT_MASK;
+
+ /* Only look at AMD power events. */
+ if (event->attr.type != pmu_class.type)
+ return -ENOENT;
+
+ /* Unsupported modes and filters. */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ /* no sampling */
+ event->attr.sample_period)
+ return -EINVAL;
+
+ if (cfg != AMD_POWER_EVENTSEL_PKG)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void pmu_event_read(struct perf_event *event)
+{
+ event_update(event);
+}
+
+static ssize_t
+get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpumap_print_to_pagebuf(true, buf, &cpu_mask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, get_attr_cpumask, NULL);
+
+static struct attribute *pmu_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group pmu_attr_group = {
+ .attrs = pmu_attrs,
+};
+
+/*
+ * Currently it only supports to report the power of each
+ * processor/package.
+ */
+EVENT_ATTR_STR(power-pkg, power_pkg, "event=0x01");
+
+EVENT_ATTR_STR(power-pkg.unit, power_pkg_unit, "mWatts");
+
+/* Convert the count from micro-Watts to milli-Watts. */
+EVENT_ATTR_STR(power-pkg.scale, power_pkg_scale, "1.000000e-3");
+
+static struct attribute *events_attr[] = {
+ EVENT_PTR(power_pkg),
+ EVENT_PTR(power_pkg_unit),
+ EVENT_PTR(power_pkg_scale),
+ NULL,
+};
+
+static struct attribute_group pmu_events_group = {
+ .name = "events",
+ .attrs = events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+
+static struct attribute *formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group pmu_format_group = {
+ .name = "format",
+ .attrs = formats_attr,
+};
+
+static const struct attribute_group *attr_groups[] = {
+ &pmu_attr_group,
+ &pmu_format_group,
+ &pmu_events_group,
+ NULL,
+};
+
+static struct pmu pmu_class = {
+ .attr_groups = attr_groups,
+ /* system-wide only */
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = pmu_event_init,
+ .add = pmu_event_add,
+ .del = pmu_event_del,
+ .start = pmu_event_start,
+ .stop = pmu_event_stop,
+ .read = pmu_event_read,
+};
+
+static void power_cpu_exit(int cpu)
+{
+ int target;
+
+ if (!cpumask_test_and_clear_cpu(cpu, &cpu_mask))
+ return;
+
+ /*
+ * Find a new CPU on the same compute unit, if was set in cpumask
+ * and still some CPUs on compute unit. Then migrate event and
+ * context to new CPU.
+ */
+ target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
+ if (target < nr_cpumask_bits) {
+ cpumask_set_cpu(target, &cpu_mask);
+ perf_pmu_migrate_context(&pmu_class, cpu, target);
+ }
+}
+
+static void power_cpu_init(int cpu)
+{
+ int target;
+
+ /*
+ * 1) If any CPU is set at cpu_mask in the same compute unit, do
+ * nothing.
+ * 2) If no CPU is set at cpu_mask in the same compute unit,
+ * set current STARTING CPU.
+ *
+ * Note: if there is a CPU aside of the new one already in the
+ * sibling mask, then it is also in cpu_mask.
+ */
+ target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
+ if (target >= nr_cpumask_bits)
+ cpumask_set_cpu(cpu, &cpu_mask);
+}
+
+static int
+power_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_FAILED:
+ case CPU_STARTING:
+ power_cpu_init(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ power_cpu_exit(cpu);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block power_cpu_notifier_nb = {
+ .notifier_call = power_cpu_notifier,
+ .priority = CPU_PRI_PERF,
+};
+
+static const struct x86_cpu_id cpu_match[] = {
+ { .vendor = X86_VENDOR_AMD, .family = 0x15 },
+ {},
+};
+
+static int __init amd_power_pmu_init(void)
+{
+ int cpu, target, ret;
+
+ if (!x86_match_cpu(cpu_match))
+ return 0;
+
+ if (!boot_cpu_has(X86_FEATURE_ACC_POWER))
+ return -ENODEV;
+
+ cpu_pwr_sample_ratio = cpuid_ecx(0x80000007);
+
+ if (rdmsrl_safe(MSR_F15H_CU_MAX_PWR_ACCUMULATOR, &max_cu_acc_power)) {
+ pr_err("Failed to read max compute unit power accumulator MSR\n");
+ return -ENODEV;
+ }
+
+ cpu_notifier_register_begin();
+
+ /* Choose one online core of each compute unit. */
+ for_each_online_cpu(cpu) {
+ target = cpumask_first(topology_sibling_cpumask(cpu));
+ if (!cpumask_test_cpu(target, &cpu_mask))
+ cpumask_set_cpu(target, &cpu_mask);
+ }
+
+ ret = perf_pmu_register(&pmu_class, "power", -1);
+ if (WARN_ON(ret)) {
+ pr_warn("AMD Power PMU registration failed\n");
+ goto out;
+ }
+
+ __register_cpu_notifier(&power_cpu_notifier_nb);
+
+ pr_info("AMD Power PMU detected\n");
+
+out:
+ cpu_notifier_register_done();
+
+ return ret;
+}
+module_init(amd_power_pmu_init);
+
+static void __exit amd_power_pmu_exit(void)
+{
+ cpu_notifier_register_begin();
+ __unregister_cpu_notifier(&power_cpu_notifier_nb);
+ cpu_notifier_register_done();
+
+ perf_pmu_unregister(&pmu_class);
+}
+module_exit(amd_power_pmu_exit);
+
+MODULE_AUTHOR("Huang Rui <ray.huang@amd.com>");
+MODULE_DESCRIPTION("AMD Processor Power Reporting Mechanism");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 9b6ad08aa51a..041e442a3e28 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1602,8 +1602,7 @@ __init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
return new;
}
-ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
- char *page)
+ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page)
{
struct perf_pmu_events_attr *pmu_attr = \
container_of(attr, struct perf_pmu_events_attr, attr);
@@ -1615,6 +1614,7 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
return x86_pmu.events_sysfs_show(page, config);
}
+EXPORT_SYMBOL_GPL(events_sysfs_show);
EVENT_ATTR(cpu-cycles, CPU_CYCLES );
EVENT_ATTR(instructions, INSTRUCTIONS );
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 93cb412a5579..7b5fd811ef45 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -13,8 +13,16 @@
#define MSR_IA32_QM_CTR 0x0c8e
#define MSR_IA32_QM_EVTSEL 0x0c8d
+#define MBM_CNTR_WIDTH 24
+/*
+ * Guaranteed time in ms as per SDM where MBM counters will not overflow.
+ */
+#define MBM_CTR_OVERFLOW_TIME 1000
+
static u32 cqm_max_rmid = -1;
static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+static bool cqm_enabled, mbm_enabled;
+unsigned int mbm_socket_max;
/**
* struct intel_pqr_state - State cache for the PQR MSR
@@ -42,8 +50,37 @@ struct intel_pqr_state {
* interrupts disabled, which is sufficient for the protection.
*/
static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+static struct hrtimer *mbm_timers;
+/**
+ * struct sample - mbm event's (local or total) data
+ * @total_bytes #bytes since we began monitoring
+ * @prev_msr previous value of MSR
+ */
+struct sample {
+ u64 total_bytes;
+ u64 prev_msr;
+};
/*
+ * samples profiled for total memory bandwidth type events
+ */
+static struct sample *mbm_total;
+/*
+ * samples profiled for local memory bandwidth type events
+ */
+static struct sample *mbm_local;
+
+#define pkg_id topology_physical_package_id(smp_processor_id())
+/*
+ * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array.
+ * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of
+ * rmids per socket, an example is given below
+ * RMID1 of Socket0: vrmid = 1
+ * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1
+ * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1
+ */
+#define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid)
+/*
* Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
* Also protects event->hw.cqm_rmid
*
@@ -65,9 +102,13 @@ static cpumask_t cqm_cpumask;
#define RMID_VAL_ERROR (1ULL << 63)
#define RMID_VAL_UNAVAIL (1ULL << 62)
-#define QOS_L3_OCCUP_EVENT_ID (1 << 0)
-
-#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
+/*
+ * Event IDs are used to program IA32_QM_EVTSEL before reading event
+ * counter from IA32_QM_CTR
+ */
+#define QOS_L3_OCCUP_EVENT_ID 0x01
+#define QOS_MBM_TOTAL_EVENT_ID 0x02
+#define QOS_MBM_LOCAL_EVENT_ID 0x03
/*
* This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
@@ -211,6 +252,21 @@ static void __put_rmid(u32 rmid)
list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
}
+static void cqm_cleanup(void)
+{
+ int i;
+
+ if (!cqm_rmid_ptrs)
+ return;
+
+ for (i = 0; i < cqm_max_rmid; i++)
+ kfree(cqm_rmid_ptrs[i]);
+
+ kfree(cqm_rmid_ptrs);
+ cqm_rmid_ptrs = NULL;
+ cqm_enabled = false;
+}
+
static int intel_cqm_setup_rmid_cache(void)
{
struct cqm_rmid_entry *entry;
@@ -218,7 +274,7 @@ static int intel_cqm_setup_rmid_cache(void)
int r = 0;
nr_rmids = cqm_max_rmid + 1;
- cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
+ cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) *
nr_rmids, GFP_KERNEL);
if (!cqm_rmid_ptrs)
return -ENOMEM;
@@ -249,11 +305,9 @@ static int intel_cqm_setup_rmid_cache(void)
mutex_unlock(&cache_mutex);
return 0;
-fail:
- while (r--)
- kfree(cqm_rmid_ptrs[r]);
- kfree(cqm_rmid_ptrs);
+fail:
+ cqm_cleanup();
return -ENOMEM;
}
@@ -281,9 +335,13 @@ static bool __match_event(struct perf_event *a, struct perf_event *b)
/*
* Events that target same task are placed into the same cache group.
+ * Mark it as a multi event group, so that we update ->count
+ * for every event rather than just the group leader later.
*/
- if (a->hw.target == b->hw.target)
+ if (a->hw.target == b->hw.target) {
+ b->hw.is_group_event = true;
return true;
+ }
/*
* Are we an inherited event?
@@ -392,10 +450,26 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b)
struct rmid_read {
u32 rmid;
+ u32 evt_type;
atomic64_t value;
};
static void __intel_cqm_event_count(void *info);
+static void init_mbm_sample(u32 rmid, u32 evt_type);
+static void __intel_mbm_event_count(void *info);
+
+static bool is_mbm_event(int e)
+{
+ return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID);
+}
+
+static void cqm_mask_call(struct rmid_read *rr)
+{
+ if (is_mbm_event(rr->evt_type))
+ on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1);
+ else
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1);
+}
/*
* Exchange the RMID of a group of events.
@@ -413,12 +487,12 @@ static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
*/
if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
struct rmid_read rr = {
- .value = ATOMIC64_INIT(0),
.rmid = old_rmid,
+ .evt_type = group->attr.config,
+ .value = ATOMIC64_INIT(0),
};
- on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
- &rr, 1);
+ cqm_mask_call(&rr);
local64_set(&group->count, atomic64_read(&rr.value));
}
@@ -430,6 +504,22 @@ static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
raw_spin_unlock_irq(&cache_lock);
+ /*
+ * If the allocation is for mbm, init the mbm stats.
+ * Need to check if each event in the group is mbm event
+ * because there could be multiple type of events in the same group.
+ */
+ if (__rmid_valid(rmid)) {
+ event = group;
+ if (is_mbm_event(event->attr.config))
+ init_mbm_sample(rmid, event->attr.config);
+
+ list_for_each_entry(event, head, hw.cqm_group_entry) {
+ if (is_mbm_event(event->attr.config))
+ init_mbm_sample(rmid, event->attr.config);
+ }
+ }
+
return old_rmid;
}
@@ -837,6 +927,72 @@ static void intel_cqm_rmid_rotate(struct work_struct *work)
schedule_delayed_work(&intel_cqm_rmid_work, delay);
}
+static u64 update_sample(unsigned int rmid, u32 evt_type, int first)
+{
+ struct sample *mbm_current;
+ u32 vrmid = rmid_2_index(rmid);
+ u64 val, bytes, shift;
+ u32 eventid;
+
+ if (evt_type == QOS_MBM_LOCAL_EVENT_ID) {
+ mbm_current = &mbm_local[vrmid];
+ eventid = QOS_MBM_LOCAL_EVENT_ID;
+ } else {
+ mbm_current = &mbm_total[vrmid];
+ eventid = QOS_MBM_TOTAL_EVENT_ID;
+ }
+
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+ rdmsrl(MSR_IA32_QM_CTR, val);
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return mbm_current->total_bytes;
+
+ if (first) {
+ mbm_current->prev_msr = val;
+ mbm_current->total_bytes = 0;
+ return mbm_current->total_bytes;
+ }
+
+ /*
+ * The h/w guarantees that counters will not overflow
+ * so long as we poll them at least once per second.
+ */
+ shift = 64 - MBM_CNTR_WIDTH;
+ bytes = (val << shift) - (mbm_current->prev_msr << shift);
+ bytes >>= shift;
+
+ bytes *= cqm_l3_scale;
+
+ mbm_current->total_bytes += bytes;
+ mbm_current->prev_msr = val;
+
+ return mbm_current->total_bytes;
+}
+
+static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type)
+{
+ return update_sample(rmid, evt_type, 0);
+}
+
+static void __intel_mbm_event_init(void *info)
+{
+ struct rmid_read *rr = info;
+
+ update_sample(rr->rmid, rr->evt_type, 1);
+}
+
+static void init_mbm_sample(u32 rmid, u32 evt_type)
+{
+ struct rmid_read rr = {
+ .rmid = rmid,
+ .evt_type = evt_type,
+ .value = ATOMIC64_INIT(0),
+ };
+
+ /* on each socket, init sample */
+ on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1);
+}
+
/*
* Find a group and setup RMID.
*
@@ -849,6 +1005,7 @@ static void intel_cqm_setup_event(struct perf_event *event,
bool conflict = false;
u32 rmid;
+ event->hw.is_group_event = false;
list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
rmid = iter->hw.cqm_rmid;
@@ -856,6 +1013,8 @@ static void intel_cqm_setup_event(struct perf_event *event,
/* All tasks in a group share an RMID */
event->hw.cqm_rmid = rmid;
*group = iter;
+ if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
+ init_mbm_sample(rmid, event->attr.config);
return;
}
@@ -872,6 +1031,9 @@ static void intel_cqm_setup_event(struct perf_event *event,
else
rmid = __get_rmid();
+ if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
+ init_mbm_sample(rmid, event->attr.config);
+
event->hw.cqm_rmid = rmid;
}
@@ -893,7 +1055,10 @@ static void intel_cqm_event_read(struct perf_event *event)
if (!__rmid_valid(rmid))
goto out;
- val = __rmid_read(rmid);
+ if (is_mbm_event(event->attr.config))
+ val = rmid_read_mbm(rmid, event->attr.config);
+ else
+ val = __rmid_read(rmid);
/*
* Ignore this reading on error states and do not update the value.
@@ -924,10 +1089,100 @@ static inline bool cqm_group_leader(struct perf_event *event)
return !list_empty(&event->hw.cqm_groups_entry);
}
+static void __intel_mbm_event_count(void *info)
+{
+ struct rmid_read *rr = info;
+ u64 val;
+
+ val = rmid_read_mbm(rr->rmid, rr->evt_type);
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+ atomic64_add(val, &rr->value);
+}
+
+static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer)
+{
+ struct perf_event *iter, *iter1;
+ int ret = HRTIMER_RESTART;
+ struct list_head *head;
+ unsigned long flags;
+ u32 grp_rmid;
+
+ /*
+ * Need to cache_lock as the timer Event Select MSR reads
+ * can race with the mbm/cqm count() and mbm_init() reads.
+ */
+ raw_spin_lock_irqsave(&cache_lock, flags);
+
+ if (list_empty(&cache_groups)) {
+ ret = HRTIMER_NORESTART;
+ goto out;
+ }
+
+ list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
+ grp_rmid = iter->hw.cqm_rmid;
+ if (!__rmid_valid(grp_rmid))
+ continue;
+ if (is_mbm_event(iter->attr.config))
+ update_sample(grp_rmid, iter->attr.config, 0);
+
+ head = &iter->hw.cqm_group_entry;
+ if (list_empty(head))
+ continue;
+ list_for_each_entry(iter1, head, hw.cqm_group_entry) {
+ if (!iter1->hw.is_group_event)
+ break;
+ if (is_mbm_event(iter1->attr.config))
+ update_sample(iter1->hw.cqm_rmid,
+ iter1->attr.config, 0);
+ }
+ }
+
+ hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME));
+out:
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
+
+ return ret;
+}
+
+static void __mbm_start_timer(void *info)
+{
+ hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME),
+ HRTIMER_MODE_REL_PINNED);
+}
+
+static void __mbm_stop_timer(void *info)
+{
+ hrtimer_cancel(&mbm_timers[pkg_id]);
+}
+
+static void mbm_start_timers(void)
+{
+ on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1);
+}
+
+static void mbm_stop_timers(void)
+{
+ on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1);
+}
+
+static void mbm_hrtimer_init(void)
+{
+ struct hrtimer *hr;
+ int i;
+
+ for (i = 0; i < mbm_socket_max; i++) {
+ hr = &mbm_timers[i];
+ hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hr->function = mbm_hrtimer_handle;
+ }
+}
+
static u64 intel_cqm_event_count(struct perf_event *event)
{
unsigned long flags;
struct rmid_read rr = {
+ .evt_type = event->attr.config,
.value = ATOMIC64_INIT(0),
};
@@ -940,7 +1195,9 @@ static u64 intel_cqm_event_count(struct perf_event *event)
return __perf_event_count(event);
/*
- * Only the group leader gets to report values. This stops us
+ * Only the group leader gets to report values except in case of
+ * multiple events in the same group, we still need to read the
+ * other events.This stops us
* reporting duplicate values to userspace, and gives us a clear
* rule for which task gets to report the values.
*
@@ -948,7 +1205,7 @@ static u64 intel_cqm_event_count(struct perf_event *event)
* specific packages - we forfeit that ability when we create
* task events.
*/
- if (!cqm_group_leader(event))
+ if (!cqm_group_leader(event) && !event->hw.is_group_event)
return 0;
/*
@@ -975,7 +1232,7 @@ static u64 intel_cqm_event_count(struct perf_event *event)
if (!__rmid_valid(rr.rmid))
goto out;
- on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+ cqm_mask_call(&rr);
raw_spin_lock_irqsave(&cache_lock, flags);
if (event->hw.cqm_rmid == rr.rmid)
@@ -1046,8 +1303,14 @@ static int intel_cqm_event_add(struct perf_event *event, int mode)
static void intel_cqm_event_destroy(struct perf_event *event)
{
struct perf_event *group_other = NULL;
+ unsigned long flags;
mutex_lock(&cache_mutex);
+ /*
+ * Hold the cache_lock as mbm timer handlers could be
+ * scanning the list of events.
+ */
+ raw_spin_lock_irqsave(&cache_lock, flags);
/*
* If there's another event in this group...
@@ -1079,6 +1342,14 @@ static void intel_cqm_event_destroy(struct perf_event *event)
}
}
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
+
+ /*
+ * Stop the mbm overflow timers when the last event is destroyed.
+ */
+ if (mbm_enabled && list_empty(&cache_groups))
+ mbm_stop_timers();
+
mutex_unlock(&cache_mutex);
}
@@ -1086,11 +1357,13 @@ static int intel_cqm_event_init(struct perf_event *event)
{
struct perf_event *group = NULL;
bool rotate = false;
+ unsigned long flags;
if (event->attr.type != intel_cqm_pmu.type)
return -ENOENT;
- if (event->attr.config & ~QOS_EVENT_MASK)
+ if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) ||
+ (event->attr.config > QOS_MBM_LOCAL_EVENT_ID))
return -EINVAL;
/* unsupported modes and filters */
@@ -1110,9 +1383,21 @@ static int intel_cqm_event_init(struct perf_event *event)
mutex_lock(&cache_mutex);
+ /*
+ * Start the mbm overflow timers when the first event is created.
+ */
+ if (mbm_enabled && list_empty(&cache_groups))
+ mbm_start_timers();
+
/* Will also set rmid */
intel_cqm_setup_event(event, &group);
+ /*
+ * Hold the cache_lock as mbm timer handlers be
+ * scanning the list of events.
+ */
+ raw_spin_lock_irqsave(&cache_lock, flags);
+
if (group) {
list_add_tail(&event->hw.cqm_group_entry,
&group->hw.cqm_group_entry);
@@ -1131,6 +1416,7 @@ static int intel_cqm_event_init(struct perf_event *event)
rotate = true;
}
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
mutex_unlock(&cache_mutex);
if (rotate)
@@ -1145,6 +1431,16 @@ EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
+EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02");
+EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1");
+EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB");
+EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6");
+
+EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03");
+EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1");
+EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB");
+EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6");
+
static struct attribute *intel_cqm_events_attr[] = {
EVENT_PTR(intel_cqm_llc),
EVENT_PTR(intel_cqm_llc_pkg),
@@ -1154,9 +1450,38 @@ static struct attribute *intel_cqm_events_attr[] = {
NULL,
};
+static struct attribute *intel_mbm_events_attr[] = {
+ EVENT_PTR(intel_cqm_total_bytes),
+ EVENT_PTR(intel_cqm_local_bytes),
+ EVENT_PTR(intel_cqm_total_bytes_pkg),
+ EVENT_PTR(intel_cqm_local_bytes_pkg),
+ EVENT_PTR(intel_cqm_total_bytes_unit),
+ EVENT_PTR(intel_cqm_local_bytes_unit),
+ EVENT_PTR(intel_cqm_total_bytes_scale),
+ EVENT_PTR(intel_cqm_local_bytes_scale),
+ NULL,
+};
+
+static struct attribute *intel_cmt_mbm_events_attr[] = {
+ EVENT_PTR(intel_cqm_llc),
+ EVENT_PTR(intel_cqm_total_bytes),
+ EVENT_PTR(intel_cqm_local_bytes),
+ EVENT_PTR(intel_cqm_llc_pkg),
+ EVENT_PTR(intel_cqm_total_bytes_pkg),
+ EVENT_PTR(intel_cqm_local_bytes_pkg),
+ EVENT_PTR(intel_cqm_llc_unit),
+ EVENT_PTR(intel_cqm_total_bytes_unit),
+ EVENT_PTR(intel_cqm_local_bytes_unit),
+ EVENT_PTR(intel_cqm_llc_scale),
+ EVENT_PTR(intel_cqm_total_bytes_scale),
+ EVENT_PTR(intel_cqm_local_bytes_scale),
+ EVENT_PTR(intel_cqm_llc_snapshot),
+ NULL,
+};
+
static struct attribute_group intel_cqm_events_group = {
.name = "events",
- .attrs = intel_cqm_events_attr,
+ .attrs = NULL,
};
PMU_FORMAT_ATTR(event, "config:0-7");
@@ -1303,12 +1628,70 @@ static const struct x86_cpu_id intel_cqm_match[] = {
{}
};
+static void mbm_cleanup(void)
+{
+ if (!mbm_enabled)
+ return;
+
+ kfree(mbm_local);
+ kfree(mbm_total);
+ mbm_enabled = false;
+}
+
+static const struct x86_cpu_id intel_mbm_local_match[] = {
+ { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL },
+ {}
+};
+
+static const struct x86_cpu_id intel_mbm_total_match[] = {
+ { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL },
+ {}
+};
+
+static int intel_mbm_init(void)
+{
+ int ret = 0, array_size, maxid = cqm_max_rmid + 1;
+
+ mbm_socket_max = topology_max_packages();
+ array_size = sizeof(struct sample) * maxid * mbm_socket_max;
+ mbm_local = kmalloc(array_size, GFP_KERNEL);
+ if (!mbm_local)
+ return -ENOMEM;
+
+ mbm_total = kmalloc(array_size, GFP_KERNEL);
+ if (!mbm_total) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ array_size = sizeof(struct hrtimer) * mbm_socket_max;
+ mbm_timers = kmalloc(array_size, GFP_KERNEL);
+ if (!mbm_timers) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ mbm_hrtimer_init();
+
+out:
+ if (ret)
+ mbm_cleanup();
+
+ return ret;
+}
+
static int __init intel_cqm_init(void)
{
- char *str, scale[20];
+ char *str = NULL, scale[20];
int i, cpu, ret;
- if (!x86_match_cpu(intel_cqm_match))
+ if (x86_match_cpu(intel_cqm_match))
+ cqm_enabled = true;
+
+ if (x86_match_cpu(intel_mbm_local_match) &&
+ x86_match_cpu(intel_mbm_total_match))
+ mbm_enabled = true;
+
+ if (!cqm_enabled && !mbm_enabled)
return -ENODEV;
cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
@@ -1365,16 +1748,41 @@ static int __init intel_cqm_init(void)
cqm_pick_event_reader(i);
}
- __perf_cpu_notifier(intel_cqm_cpu_notifier);
+ if (mbm_enabled)
+ ret = intel_mbm_init();
+ if (ret && !cqm_enabled)
+ goto out;
+
+ if (cqm_enabled && mbm_enabled)
+ intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr;
+ else if (!cqm_enabled && mbm_enabled)
+ intel_cqm_events_group.attrs = intel_mbm_events_attr;
+ else if (cqm_enabled && !mbm_enabled)
+ intel_cqm_events_group.attrs = intel_cqm_events_attr;
ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
- if (ret)
+ if (ret) {
pr_err("Intel CQM perf registration failed: %d\n", ret);
- else
+ goto out;
+ }
+
+ if (cqm_enabled)
pr_info("Intel CQM monitoring enabled\n");
+ if (mbm_enabled)
+ pr_info("Intel MBM enabled\n");
+ /*
+ * Register the hot cpu notifier once we are sure cqm
+ * is enabled to avoid notifier leak.
+ */
+ __perf_cpu_notifier(intel_cqm_cpu_notifier);
out:
cpu_notifier_register_done();
+ if (ret) {
+ kfree(str);
+ cqm_cleanup();
+ mbm_cleanup();
+ }
return ret;
}
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index ce7211a07c0b..8584b90d8e0b 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -570,11 +570,12 @@ int intel_pmu_drain_bts_buffer(void)
* We will overwrite the from and to address before we output
* the sample.
*/
+ rcu_read_lock();
perf_prepare_sample(&header, &data, event, &regs);
if (perf_output_begin(&handle, event, header.size *
(top - base - skip)))
- return 1;
+ goto unlock;
for (at = base; at < top; at++) {
/* Filter out any records that contain kernel addresses. */
@@ -593,6 +594,8 @@ int intel_pmu_drain_bts_buffer(void)
/* There's new data available. */
event->hw.interrupts++;
event->pending_kill = POLL_IN;
+unlock:
+ rcu_read_unlock();
return 1;
}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 69dd11887dd1..6c3b7c1780c9 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -649,7 +649,7 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
/*
* return the type of control flow change at address "from"
- * intruction is not necessarily a branch (in case of interrupt).
+ * instruction is not necessarily a branch (in case of interrupt).
*
* The branch type returned also includes the priv level of the
* target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index b834a3f55a01..70c93f9b03ac 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -711,6 +711,7 @@ static int __init rapl_pmu_init(void)
rapl_pmu_events_group.attrs = rapl_events_cln_attr;
break;
case 63: /* Haswell-Server */
+ case 79: /* Broadwell-Server */
apply_quirk = true;
rapl_cntr_mask = RAPL_IDX_SRV;
rapl_pmu_events_group.attrs = rapl_events_srv_attr;
@@ -718,6 +719,7 @@ static int __init rapl_pmu_init(void)
case 60: /* Haswell */
case 69: /* Haswell-Celeron */
case 61: /* Broadwell */
+ case 71: /* Broadwell-H */
rapl_cntr_mask = RAPL_IDX_HSW;
rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
break;
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 93f6bd9bf761..ab2bcaaebe38 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -46,7 +46,6 @@
(SNBEP_PMON_CTL_EV_SEL_MASK | \
SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_PMON_CTL_EV_SEL_EXT | \
SNBEP_PMON_CTL_INVERT | \
SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
@@ -148,7 +147,6 @@
/* IVBEP PCU */
#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
(SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PMON_CTL_EV_SEL_EXT | \
SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
SNBEP_PMON_CTL_EDGE_DET | \
SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
@@ -258,7 +256,6 @@
SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
SNBEP_PMON_CTL_EDGE_DET | \
SNBEP_CBO_PMON_CTL_TID_EN | \
- SNBEP_PMON_CTL_EV_SEL_EXT | \
SNBEP_PMON_CTL_INVERT | \
KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \
SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
@@ -472,7 +469,7 @@ static struct attribute *snbep_uncore_cbox_formats_attr[] = {
};
static struct attribute *snbep_uncore_pcu_formats_attr[] = {
- &format_attr_event_ext.attr,
+ &format_attr_event.attr,
&format_attr_occ_sel.attr,
&format_attr_edge.attr,
&format_attr_inv.attr,
@@ -1313,7 +1310,7 @@ static struct attribute *ivbep_uncore_cbox_formats_attr[] = {
};
static struct attribute *ivbep_uncore_pcu_formats_attr[] = {
- &format_attr_event_ext.attr,
+ &format_attr_event.attr,
&format_attr_occ_sel.attr,
&format_attr_edge.attr,
&format_attr_thresh5.attr,
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 68155cafa8a1..ba6ef18528c9 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -272,7 +272,7 @@ struct cpu_hw_events {
* events to select for counter rescheduling.
*
* Care must be taken as the rescheduling algorithm is O(n!) which
- * will increase scheduling cycles for an over-commited system
+ * will increase scheduling cycles for an over-committed system
* dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros
* and its counter masks must be kept at a minimum.
*/
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 0899cfc8dfe8..98f25bbafac4 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -643,8 +643,8 @@ static inline void entering_irq(void)
static inline void entering_ack_irq(void)
{
- ack_APIC_irq();
entering_irq();
+ ack_APIC_irq();
}
static inline void ipi_entering_ack_irq(void)
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index acdee09228b3..ebb102e1bbc7 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -316,9 +316,10 @@ static inline bool is_x32_task(void)
return false;
}
-static inline bool is_compat_task(void)
+static inline bool in_compat_syscall(void)
{
return is_ia32_task() || is_x32_task();
}
+#define in_compat_syscall in_compat_syscall /* override the generic impl */
#endif /* _ASM_X86_COMPAT_H */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 3d1a84383162..8f9afefd2dc5 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -94,7 +94,7 @@
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */
#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
-/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */
+#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */
@@ -245,6 +245,8 @@
/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */
+#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */
/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 24938852db30..a4820d4df617 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -52,13 +52,13 @@ int ftrace_int3_handler(struct pt_regs *regs);
* this screws up the trace output when tracing a ia32 task.
* Instead of reporting bogus syscalls, just do not trace them.
*
- * If the user realy wants these, then they should use the
+ * If the user really wants these, then they should use the
* raw syscall tracepoints with filtering.
*/
#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1
static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
{
- if (is_compat_task())
+ if (in_compat_syscall())
return true;
return false;
}
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 1815b736269d..b90e1053049b 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -141,6 +141,7 @@ struct irq_alloc_info {
struct irq_cfg {
unsigned int dest_apicid;
u8 vector;
+ u8 old_vector;
};
extern struct irq_cfg *irq_cfg(unsigned int irq);
@@ -168,20 +169,6 @@ extern atomic_t irq_mis_count;
extern void elcr_set_level_irq(unsigned int irq);
-/* SMP */
-extern __visible void smp_apic_timer_interrupt(struct pt_regs *);
-extern __visible void smp_spurious_interrupt(struct pt_regs *);
-extern __visible void smp_x86_platform_ipi(struct pt_regs *);
-extern __visible void smp_error_interrupt(struct pt_regs *);
-#ifdef CONFIG_X86_IO_APIC
-extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
-#endif
-#ifdef CONFIG_SMP
-extern __visible void smp_reschedule_interrupt(struct pt_regs *);
-extern __visible void smp_call_function_interrupt(struct pt_regs *);
-extern __visible void smp_call_function_single_interrupt(struct pt_regs *);
-#endif
-
extern char irq_entries_start[];
#ifdef CONFIG_TRACING
#define trace_irq_entries_start irq_entries_start
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 01c8b501cb6d..f62a9f37f79f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -84,7 +84,8 @@
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
| X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
| X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
- | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP))
+ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \
+ | X86_CR4_PKE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -187,12 +188,14 @@ enum {
#define PFERR_USER_BIT 2
#define PFERR_RSVD_BIT 3
#define PFERR_FETCH_BIT 4
+#define PFERR_PK_BIT 5
#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
+#define PFERR_PK_MASK (1U << PFERR_PK_BIT)
/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
@@ -335,6 +338,14 @@ struct kvm_mmu {
*/
u8 permissions[16];
+ /*
+ * The pkru_mask indicates if protection key checks are needed. It
+ * consists of 16 domains indexed by page fault error code bits [4:1],
+ * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables.
+ * Each domain has 2 bits which are ANDed with AD and WD from PKRU.
+ */
+ u32 pkru_mask;
+
u64 *pae_root;
u64 *lm_root;
@@ -874,6 +885,7 @@ struct kvm_x86_ops {
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+ u32 (*get_pkru)(struct kvm_vcpu *vcpu);
void (*fpu_activate)(struct kvm_vcpu *vcpu);
void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 93fb7c1cffda..7a79ee2778b3 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -42,14 +42,6 @@ struct saved_msrs {
struct saved_msr *array;
};
-static inline unsigned long long native_read_tscp(unsigned int *aux)
-{
- unsigned long low, high;
- asm volatile(".byte 0x0f,0x01,0xf9"
- : "=a" (low), "=d" (high), "=c" (*aux));
- return low | ((u64)high << 32);
-}
-
/*
* both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A"
* constraint has different meanings. For i386, "A" means exactly
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1ff49ec29ece..97f3242e133c 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -107,6 +107,12 @@ static inline u32 read_pkru(void)
return 0;
}
+static inline void write_pkru(u32 pkru)
+{
+ if (boot_cpu_has(X86_FEATURE_OSPKE))
+ __write_pkru(pkru);
+}
+
static inline int pte_young(pte_t pte)
{
return pte_flags(pte) & _PAGE_ACCESSED;
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index cad82c9c2fde..ceec86eb68e9 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -25,7 +25,7 @@
* This should be totally fair - if anything is waiting, a process that wants a
* lock will go to the back of the queue. When the currently active lock is
* released, if there's a writer at the front of the queue, then that and only
- * that will be woken up; if there's a bunch of consequtive readers at the
+ * that will be woken up; if there's a bunch of consecutive readers at the
* front, then they'll all be woken up, but no other readers will be.
*/
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index aee6e76e561e..d96d04377765 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -113,11 +113,27 @@ static inline u32 __read_pkru(void)
: "c" (ecx));
return pkru;
}
+
+static inline void __write_pkru(u32 pkru)
+{
+ u32 ecx = 0, edx = 0;
+
+ /*
+ * "wrpkru" instruction. Loads contents in EAX to PKRU,
+ * requires that ecx = edx = 0.
+ */
+ asm volatile(".byte 0x0f,0x01,0xef\n\t"
+ : : "a" (pkru), "c"(ecx), "d"(edx));
+}
#else
static inline u32 __read_pkru(void)
{
return 0;
}
+
+static inline void __write_pkru(u32 pkru)
+{
+}
#endif
static inline void native_wbinvd(void)
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index ca6ba3607705..90dbbd9666d4 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -87,9 +87,9 @@ int strcmp(const char *cs, const char *ct);
*
* Low level memory copy function that catches machine checks
*
- * Return true for success, false for fail
+ * Return 0 for success, -EFAULT for fail
*/
-bool memcpy_mcsafe(void *dst, const void *src, size_t cnt);
+int memcpy_mcsafe(void *dst, const void *src, size_t cnt);
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 88bff6dd23ad..a969ae607be8 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -105,9 +105,8 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
struct exception_table_entry {
int insn, fixup, handler;
};
-/* This is not the generic standard exception_table_entry format */
-#define ARCH_HAS_SORT_EXTABLE
-#define ARCH_HAS_SEARCH_EXTABLE
+
+#define ARCH_HAS_RELATIVE_EXTABLE
extern int fixup_exception(struct pt_regs *regs, int trapnr);
extern bool ex_has_fault_handler(unsigned long ip);
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 8b2d4bea9962..39171b3646bb 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -62,4 +62,6 @@ void xen_arch_register_cpu(int num);
void xen_arch_unregister_cpu(int num);
#endif
+extern void xen_set_iopl_mask(unsigned mask);
+
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d5fb0871aba3..adaae2c781c1 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,6 +25,12 @@ OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_test_nx.o := y
+# If instrumentation of this dir is enabled, boot hangs during first second.
+# Probably could be more selective here, but note that files related to irqs,
+# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to
+# non-deterministic coverage.
+KCOV_INSTRUMENT := n
+
CFLAGS_irq.o := -I$(src)/../include/asm/trace
obj-y := process_$(BITS).o signal.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index e75907601a41..8c2f1ef6ca23 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -956,7 +956,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
/*
* Note that the LAPIC address is obtained from the MADT (32-bit value)
- * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value).
+ * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
*/
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
@@ -984,7 +984,7 @@ static int __init acpi_parse_madt_lapic_entries(void)
/*
* Note that the LAPIC address is obtained from the MADT (32-bit value)
- * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value).
+ * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
*/
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 222a57076039..cefacbad1531 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -221,7 +221,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
unsigned long cpu = (unsigned long)hcpu;
struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
- switch (action & 0xf) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DEAD:
dw_apb_clockevent_pause(adev->timer);
if (system_state == SYSTEM_RUNNING) {
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 8bb12ddc5db8..8e63ebdcbd0b 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,6 +2,10 @@
# Makefile for local APIC drivers and for the IO-APIC code
#
+# Leads to non-deterministic coverage that is not a function of syscall inputs.
+# In particualr, smp_apic_timer_interrupt() is called in random places.
+KCOV_INSTRUMENT := n
+
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o vector.o
obj-y += hw_nmi.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 531b9611c51d..d356987a04e9 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1611,7 +1611,7 @@ void __init enable_IR_x2apic(void)
legacy_pic->mask_all();
mask_ioapic_entries();
- /* If irq_remapping_prepare() succeded, try to enable it */
+ /* If irq_remapping_prepare() succeeded, try to enable it */
if (ir_stat >= 0)
ir_stat = try_to_enable_IR();
/* ir_stat contains the remap mode or an error code */
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 3b670df4ba7b..ad59d70bcb1a 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -213,6 +213,7 @@ update:
*/
cpumask_and(d->old_domain, d->old_domain, cpu_online_mask);
d->move_in_progress = !cpumask_empty(d->old_domain);
+ d->cfg.old_vector = d->move_in_progress ? d->cfg.vector : 0;
d->cfg.vector = vector;
cpumask_copy(d->domain, vector_cpumask);
success:
@@ -655,46 +656,97 @@ void irq_complete_move(struct irq_cfg *cfg)
}
/*
- * Called with @desc->lock held and interrupts disabled.
+ * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
*/
void irq_force_complete_move(struct irq_desc *desc)
{
struct irq_data *irqdata = irq_desc_get_irq_data(desc);
struct apic_chip_data *data = apic_chip_data(irqdata);
struct irq_cfg *cfg = data ? &data->cfg : NULL;
+ unsigned int cpu;
if (!cfg)
return;
- __irq_complete_move(cfg, cfg->vector);
-
/*
* This is tricky. If the cleanup of @data->old_domain has not been
* done yet, then the following setaffinity call will fail with
* -EBUSY. This can leave the interrupt in a stale state.
*
- * The cleanup cannot make progress because we hold @desc->lock. So in
- * case @data->old_domain is not yet cleaned up, we need to drop the
- * lock and acquire it again. @desc cannot go away, because the
- * hotplug code holds the sparse irq lock.
+ * All CPUs are stuck in stop machine with interrupts disabled so
+ * calling __irq_complete_move() would be completely pointless.
*/
raw_spin_lock(&vector_lock);
- /* Clean out all offline cpus (including ourself) first. */
+ /*
+ * Clean out all offline cpus (including the outgoing one) from the
+ * old_domain mask.
+ */
cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
- while (!cpumask_empty(data->old_domain)) {
+
+ /*
+ * If move_in_progress is cleared and the old_domain mask is empty,
+ * then there is nothing to cleanup. fixup_irqs() will take care of
+ * the stale vectors on the outgoing cpu.
+ */
+ if (!data->move_in_progress && cpumask_empty(data->old_domain)) {
raw_spin_unlock(&vector_lock);
- raw_spin_unlock(&desc->lock);
- cpu_relax();
- raw_spin_lock(&desc->lock);
+ return;
+ }
+
+ /*
+ * 1) The interrupt is in move_in_progress state. That means that we
+ * have not seen an interrupt since the io_apic was reprogrammed to
+ * the new vector.
+ *
+ * 2) The interrupt has fired on the new vector, but the cleanup IPIs
+ * have not been processed yet.
+ */
+ if (data->move_in_progress) {
/*
- * Reevaluate apic_chip_data. It might have been cleared after
- * we dropped @desc->lock.
+ * In theory there is a race:
+ *
+ * set_ioapic(new_vector) <-- Interrupt is raised before update
+ * is effective, i.e. it's raised on
+ * the old vector.
+ *
+ * So if the target cpu cannot handle that interrupt before
+ * the old vector is cleaned up, we get a spurious interrupt
+ * and in the worst case the ioapic irq line becomes stale.
+ *
+ * But in case of cpu hotplug this should be a non issue
+ * because if the affinity update happens right before all
+ * cpus rendevouz in stop machine, there is no way that the
+ * interrupt can be blocked on the target cpu because all cpus
+ * loops first with interrupts enabled in stop machine, so the
+ * old vector is not yet cleaned up when the interrupt fires.
+ *
+ * So the only way to run into this issue is if the delivery
+ * of the interrupt on the apic/system bus would be delayed
+ * beyond the point where the target cpu disables interrupts
+ * in stop machine. I doubt that it can happen, but at least
+ * there is a theroretical chance. Virtualization might be
+ * able to expose this, but AFAICT the IOAPIC emulation is not
+ * as stupid as the real hardware.
+ *
+ * Anyway, there is nothing we can do about that at this point
+ * w/o refactoring the whole fixup_irq() business completely.
+ * We print at least the irq number and the old vector number,
+ * so we have the necessary information when a problem in that
+ * area arises.
*/
- data = apic_chip_data(irqdata);
- if (!data)
- return;
- raw_spin_lock(&vector_lock);
+ pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
+ irqdata->irq, cfg->old_vector);
}
+ /*
+ * If old_domain is not empty, then other cpus still have the irq
+ * descriptor set in their vector array. Clean it up.
+ */
+ for_each_cpu(cpu, data->old_domain)
+ per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED;
+
+ /* Cleanup the left overs of the (half finished) move */
+ cpumask_clear(data->old_domain);
+ data->move_in_progress = 0;
raw_spin_unlock(&vector_lock);
}
#endif
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 624db00583f4..8f4942e2bcbb 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -792,7 +792,8 @@ static int uv_scir_cpu_notify(struct notifier_block *self, unsigned long action,
{
long cpu = (long)hcpu;
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_FAILED:
case CPU_ONLINE:
uv_heartbeat_enable(cpu);
break;
@@ -860,7 +861,7 @@ int uv_set_vga_state(struct pci_dev *pdev, bool decode,
*/
void uv_cpu_init(void)
{
- /* CPU 0 initilization will be done via uv_system_init. */
+ /* CPU 0 initialization will be done via uv_system_init. */
if (!uv_blade_info)
return;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 052c9c3026cc..9307f182fe30 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1088,7 +1088,7 @@ static int apm_get_battery_status(u_short which, u_short *status,
* @device: identity of device
* @enable: on/off
*
- * Activate or deactive power management on either a specific device
+ * Activate or deactivate power management on either a specific device
* or the entire system (%APM_DEVICE_ALL).
*/
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 0d373d7affc8..4a8697f7d4ef 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -8,6 +8,10 @@ CFLAGS_REMOVE_common.o = -pg
CFLAGS_REMOVE_perf_event.o = -pg
endif
+# If these files are instrumented, boot hangs during the first second.
+KCOV_INSTRUMENT_common.o := n
+KCOV_INSTRUMENT_perf_event.o := n
+
# Make sure load_percpu_segment has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_common.o := $(nostackp)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 5026a13356c4..6e47e3a916f1 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -85,7 +85,7 @@ static void init_amd_k5(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_32
/*
* General Systems BIOSen alias the cpu frequency registers
- * of the Elan at 0x000df000. Unfortuantly, one of the Linux
+ * of the Elan at 0x000df000. Unfortunately, one of the Linux
* drivers subsequently pokes it, and changes the CPU speed.
* Workaround : Remove the unneeded alias.
*/
@@ -309,7 +309,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
u32 eax, ebx, ecx, edx;
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
- nodes_per_socket = ((ecx >> 8) & 7) + 1;
node_id = ecx & 7;
/* get compute unit information */
@@ -320,7 +319,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
- nodes_per_socket = ((value >> 3) & 7) + 1;
node_id = value & 7;
} else
return;
@@ -522,6 +520,18 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_MWAITX))
use_mwaitx_delay();
+
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ u32 ecx;
+
+ ecx = cpuid_ecx(0x8000001e);
+ nodes_per_socket = ((ecx >> 8) & 7) + 1;
+ } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
+ u64 value;
+
+ rdmsrl(MSR_FAM10H_NODE_ID, value);
+ nodes_per_socket = ((value >> 3) & 7) + 1;
+ }
}
static void early_init_amd(struct cpuinfo_x86 *c)
@@ -539,6 +549,10 @@ static void early_init_amd(struct cpuinfo_x86 *c)
set_sched_clock_stable();
}
+ /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */
+ if (c->x86_power & BIT(12))
+ set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSCALL32);
#else
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 06ad72383b4e..8394b3d1f94f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -692,7 +692,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
c->x86_capability[CPUID_F_1_EDX] = edx;
- if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
+ if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) ||
+ ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) ||
+ (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) {
c->x86_cache_max_rmid = ecx;
c->x86_cache_occ_scale = ebx;
}
@@ -968,7 +970,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_identify)
this_cpu->c_identify(c);
- /* Clear/Set all flags overriden by options, after probe */
+ /* Clear/Set all flags overridden by options, after probe */
for (i = 0; i < NCAPINTS; i++) {
c->x86_capability[i] &= ~cpu_caps_cleared[i];
c->x86_capability[i] |= cpu_caps_set[i];
@@ -1028,7 +1030,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
setup_pku(c);
/*
- * Clear/Set all flags overriden by options, need do it
+ * Clear/Set all flags overridden by options, need do it
* before following smp all cpus cap AND.
*/
for (i = 0; i < NCAPINTS; i++) {
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index fcbcb2f678ca..19f57360dfd2 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -42,7 +42,7 @@ EXPORT_SYMBOL_GPL(mtrr_state);
* "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
* Opteron Processors" (26094 Rev. 3.30 February 2006), section
* "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
- * to 1 during BIOS initalization of the fixed MTRRs, then cleared to
+ * to 1 during BIOS initialization of the fixed MTRRs, then cleared to
* 0 for operation."
*/
static inline void k8_check_syscfg_dram_mod_en(void)
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 21bf92490a7b..8a121991e5ba 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -287,7 +287,7 @@ static __init void early_pci_serial_init(char *s)
}
/*
- * Lastly, initalize the hardware
+ * Lastly, initialize the hardware
*/
if (*s) {
if (strcmp(s, "nocfg") == 0)
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 0bc3490420c5..8bd1c003942a 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -8,7 +8,7 @@
/*
* The xstateregs_active() routine is the same as the regset_fpregs_active() routine,
* as the "regset->n" for the xstate regset will be updated based on the feature
- * capabilites supported by the xsave.
+ * capabilities supported by the xsave.
*/
int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
{
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 702547ce33c9..d036cfb4495d 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -1,5 +1,5 @@
/*
- * Code for replacing ftrace calls with jumps.
+ * Dynamic function tracing support.
*
* Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
*
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index be0ebbb6d1d1..a1f0e4a5c47e 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -717,7 +717,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
struct hpet_work_struct work;
struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu);
- switch (action & 0xf) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work);
init_completion(&work.complete);
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 37dae792dbbe..589b3193f102 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -96,9 +96,14 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
SYSCALL_DEFINE1(iopl, unsigned int, level)
{
struct pt_regs *regs = current_pt_regs();
- unsigned int old = (regs->flags >> 12) & 3;
struct thread_struct *t = &current->thread;
+ /*
+ * Careful: the IOPL bits in regs->flags are undefined under Xen PV
+ * and changing them has no effect.
+ */
+ unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT;
+
if (level > 3)
return -EINVAL;
/* Trying to gain more privileges? */
@@ -106,8 +111,9 @@ SYSCALL_DEFINE1(iopl, unsigned int, level)
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
}
- regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
- t->iopl = level << 12;
+ regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
+ (level << X86_EFLAGS_IOPL_BIT);
+ t->iopl = level << X86_EFLAGS_IOPL_BIT;
set_iopl_mask(t->iopl);
return 0;
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 0f8a6bbaaa44..2af478e3fd4e 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -271,7 +271,7 @@ static int bzImage64_probe(const char *buf, unsigned long len)
int ret = -ENOEXEC;
struct setup_header *header;
- /* kernel should be atleast two sectors long */
+ /* kernel should be at least two sectors long */
if (len < 2 * 512) {
pr_err("File is too short to be a bzImage\n");
return ret;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index ed15cd486d06..2da6ee9ae69b 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -609,9 +609,9 @@ static struct notifier_block kgdb_notifier = {
};
/**
- * kgdb_arch_init - Perform any architecture specific initalization.
+ * kgdb_arch_init - Perform any architecture specific initialization.
*
- * This function will handle the initalization of any architecture
+ * This function will handle the initialization of any architecture
* specific callbacks.
*/
int kgdb_arch_init(void)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 47190bd399e7..807950860fb7 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -36,6 +36,7 @@
#include <linux/kprobes.h>
#include <linux/debugfs.h>
#include <linux/nmi.h>
+#include <linux/swait.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
@@ -91,14 +92,14 @@ static void kvm_io_delay(void)
struct kvm_task_sleep_node {
struct hlist_node link;
- wait_queue_head_t wq;
+ struct swait_queue_head wq;
u32 token;
int cpu;
bool halted;
};
static struct kvm_task_sleep_head {
- spinlock_t lock;
+ raw_spinlock_t lock;
struct hlist_head list;
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
@@ -122,17 +123,17 @@ void kvm_async_pf_task_wait(u32 token)
u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
struct kvm_task_sleep_node n, *e;
- DEFINE_WAIT(wait);
+ DECLARE_SWAITQUEUE(wait);
rcu_irq_enter();
- spin_lock(&b->lock);
+ raw_spin_lock(&b->lock);
e = _find_apf_task(b, token);
if (e) {
/* dummy entry exist -> wake up was delivered ahead of PF */
hlist_del(&e->link);
kfree(e);
- spin_unlock(&b->lock);
+ raw_spin_unlock(&b->lock);
rcu_irq_exit();
return;
@@ -141,13 +142,13 @@ void kvm_async_pf_task_wait(u32 token)
n.token = token;
n.cpu = smp_processor_id();
n.halted = is_idle_task(current) || preempt_count() > 1;
- init_waitqueue_head(&n.wq);
+ init_swait_queue_head(&n.wq);
hlist_add_head(&n.link, &b->list);
- spin_unlock(&b->lock);
+ raw_spin_unlock(&b->lock);
for (;;) {
if (!n.halted)
- prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
if (hlist_unhashed(&n.link))
break;
@@ -166,7 +167,7 @@ void kvm_async_pf_task_wait(u32 token)
}
}
if (!n.halted)
- finish_wait(&n.wq, &wait);
+ finish_swait(&n.wq, &wait);
rcu_irq_exit();
return;
@@ -178,8 +179,8 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
hlist_del_init(&n->link);
if (n->halted)
smp_send_reschedule(n->cpu);
- else if (waitqueue_active(&n->wq))
- wake_up(&n->wq);
+ else if (swait_active(&n->wq))
+ swake_up(&n->wq);
}
static void apf_task_wake_all(void)
@@ -189,14 +190,14 @@ static void apf_task_wake_all(void)
for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
struct hlist_node *p, *next;
struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
- spin_lock(&b->lock);
+ raw_spin_lock(&b->lock);
hlist_for_each_safe(p, next, &b->list) {
struct kvm_task_sleep_node *n =
hlist_entry(p, typeof(*n), link);
if (n->cpu == smp_processor_id())
apf_task_wake_one(n);
}
- spin_unlock(&b->lock);
+ raw_spin_unlock(&b->lock);
}
}
@@ -212,7 +213,7 @@ void kvm_async_pf_task_wake(u32 token)
}
again:
- spin_lock(&b->lock);
+ raw_spin_lock(&b->lock);
n = _find_apf_task(b, token);
if (!n) {
/*
@@ -225,17 +226,17 @@ again:
* Allocation failed! Busy wait while other cpu
* handles async PF.
*/
- spin_unlock(&b->lock);
+ raw_spin_unlock(&b->lock);
cpu_relax();
goto again;
}
n->token = token;
n->cpu = smp_processor_id();
- init_waitqueue_head(&n->wq);
+ init_swait_queue_head(&n->wq);
hlist_add_head(&n->link, &b->list);
} else
apf_task_wake_one(n);
- spin_unlock(&b->lock);
+ raw_spin_unlock(&b->lock);
return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
@@ -486,7 +487,7 @@ void __init kvm_guest_init(void)
paravirt_ops_setup();
register_reboot_notifier(&kvm_pv_reboot_nb);
for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
- spin_lock_init(&async_pf_sleepers[i].lock);
+ raw_spin_lock_init(&async_pf_sleepers[i].lock);
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
x86_init.irqs.trap_init = kvm_apf_trap_init;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 72cef58693c7..1d39bfbd26bb 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -226,7 +226,7 @@ static void kvm_setup_secondary_clock(void)
* registered memory location. If the guest happens to shutdown, this memory
* won't be valid. In cases like kexec, in which you install a new kernel, this
* means a random memory location will be kept being written. So before any
- * kind of shutdown from our side, we unregister the clock by writting anything
+ * kind of shutdown from our side, we unregister the clock by writing anything
* that does not have the 'enable' bit set in the msr
*/
#ifdef CONFIG_KEXEC_CORE
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 776229e98202..6cbab31ac23a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -48,6 +48,7 @@
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
+#include <asm/xen/hypervisor.h>
asmlinkage extern void ret_from_fork(void);
@@ -413,6 +414,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev_p, next_p, tss);
+#ifdef CONFIG_XEN
+ /*
+ * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
+ * current_pt_regs()->flags may not match the current task's
+ * intended IOPL. We need to switch it manually.
+ */
+ if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
+ prev->iopl != next->iopl))
+ xen_set_iopl_mask(next->iopl);
+#endif
+
if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
/*
* AMD CPUs have a misfeature: SYSRET sets the SS selector but
@@ -478,7 +490,7 @@ void set_personality_ia32(bool x32)
if (current->mm)
current->mm->context.ia32_compat = TIF_X32;
current->personality &= ~READ_IMPLIES_EXEC;
- /* is_compat_task() uses the presence of the x32
+ /* in_compat_syscall() uses the presence of the x32
syscall bit flag to determine compat status */
current_thread_info()->status &= ~TS_COMPAT;
} else {
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 643dbdccf4bc..b2c99f811c3f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -274,11 +274,6 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu)
if (test_and_set_bit(pkg, physical_package_map))
goto found;
- if (pkg < __max_logical_packages) {
- set_bit(pkg, logical_package_map);
- physical_to_logical_pkg[pkg] = pkg;
- goto found;
- }
new = find_first_zero_bit(logical_package_map, __max_logical_packages);
if (new >= __max_logical_packages) {
physical_to_logical_pkg[pkg] = -1;
@@ -317,9 +312,27 @@ static void __init smp_init_package_map(void)
/*
* Today neither Intel nor AMD support heterogenous systems. That
* might change in the future....
+ *
+ * While ideally we'd want '* smp_num_siblings' in the below @ncpus
+ * computation, this won't actually work since some Intel BIOSes
+ * report inconsistent HT data when they disable HT.
+ *
+ * In particular, they reduce the APIC-IDs to only include the cores,
+ * but leave the CPUID topology to say there are (2) siblings.
+ * This means we don't know how many threads there will be until
+ * after the APIC enumeration.
+ *
+ * By not including this we'll sometimes over-estimate the number of
+ * logical packages by the amount of !present siblings, but this is
+ * still better than MAX_LOCAL_APIC.
+ *
+ * We use total_cpus not nr_cpu_ids because nr_cpu_ids can be limited
+ * on the command line leading to a similar issue as the HT disable
+ * problem because the hyperthreads are usually enumerated after the
+ * primary cores.
*/
- ncpus = boot_cpu_data.x86_max_cores * smp_num_siblings;
- __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus);
+ ncpus = boot_cpu_data.x86_max_cores;
+ __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus);
/*
* Possibly larger than what we need as the number of apic ids per
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 56380440d862..c9c4c7ce3eb2 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -881,7 +881,7 @@ void tsc_restore_sched_clock_state(void)
local_irq_save(flags);
/*
- * We're comming out of suspend, there's no concurrency yet; don't
+ * We're coming out of suspend, there's no concurrency yet; don't
* bother being nice about the RCU stuff, just write to both
* data fields.
*/
@@ -1306,11 +1306,15 @@ void __init tsc_init(void)
unsigned long calibrate_delay_is_known(void)
{
int sibling, cpu = smp_processor_id();
+ struct cpumask *mask = topology_core_cpumask(cpu);
if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC))
return 0;
- sibling = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+ if (!mask)
+ return 0;
+
+ sibling = cpumask_any_but(mask, cpu);
if (sibling < nr_cpu_ids)
return cpu_data(sibling).loops_per_jiffy;
return 0;
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0029644bf09c..8efb839948e5 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -88,6 +88,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
apic->lapic_timer.timer_mode_mask = 1 << 17;
}
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ if (best) {
+ /* Update OSPKE bit */
+ if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) {
+ best->ecx &= ~F(OSPKE);
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE))
+ best->ecx |= F(OSPKE);
+ }
+ }
+
best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
if (!best) {
vcpu->arch.guest_supported_xcr0 = 0;
@@ -305,7 +315,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
/* cpuid 1.edx */
- const u32 kvm_supported_word0_x86_features =
+ const u32 kvm_cpuid_1_edx_x86_features =
F(FPU) | F(VME) | F(DE) | F(PSE) |
F(TSC) | F(MSR) | F(PAE) | F(MCE) |
F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
@@ -315,7 +325,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
0 /* HTT, TM, Reserved, PBE */;
/* cpuid 0x80000001.edx */
- const u32 kvm_supported_word1_x86_features =
+ const u32 kvm_cpuid_8000_0001_edx_x86_features =
F(FPU) | F(VME) | F(DE) | F(PSE) |
F(TSC) | F(MSR) | F(PAE) | F(MCE) |
F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
@@ -325,7 +335,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
/* cpuid 1.ecx */
- const u32 kvm_supported_word4_x86_features =
+ const u32 kvm_cpuid_1_ecx_x86_features =
/* NOTE: MONITOR (and MWAIT) are emulated as NOP,
* but *not* advertised to guests via CPUID ! */
F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
@@ -337,29 +347,32 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
F(F16C) | F(RDRAND);
/* cpuid 0x80000001.ecx */
- const u32 kvm_supported_word6_x86_features =
+ const u32 kvm_cpuid_8000_0001_ecx_x86_features =
F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
/* cpuid 0xC0000001.edx */
- const u32 kvm_supported_word5_x86_features =
+ const u32 kvm_cpuid_C000_0001_edx_x86_features =
F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
F(PMM) | F(PMM_EN);
/* cpuid 7.0.ebx */
- const u32 kvm_supported_word9_x86_features =
+ const u32 kvm_cpuid_7_0_ebx_x86_features =
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT);
/* cpuid 0xD.1.eax */
- const u32 kvm_supported_word10_x86_features =
+ const u32 kvm_cpuid_D_1_eax_x86_features =
F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;
+ /* cpuid 7.0.ecx*/
+ const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/;
+
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -376,10 +389,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->eax = min(entry->eax, (u32)0xd);
break;
case 1:
- entry->edx &= kvm_supported_word0_x86_features;
- cpuid_mask(&entry->edx, 0);
- entry->ecx &= kvm_supported_word4_x86_features;
- cpuid_mask(&entry->ecx, 4);
+ entry->edx &= kvm_cpuid_1_edx_x86_features;
+ cpuid_mask(&entry->edx, CPUID_1_EDX);
+ entry->ecx &= kvm_cpuid_1_ecx_x86_features;
+ cpuid_mask(&entry->ecx, CPUID_1_ECX);
/* we support x2apic emulation even if host does not support
* it since we emulate x2apic in software */
entry->ecx |= F(X2APIC);
@@ -433,14 +446,20 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
/* Mask ebx against host capability word 9 */
if (index == 0) {
- entry->ebx &= kvm_supported_word9_x86_features;
- cpuid_mask(&entry->ebx, 9);
+ entry->ebx &= kvm_cpuid_7_0_ebx_x86_features;
+ cpuid_mask(&entry->ebx, CPUID_7_0_EBX);
// TSC_ADJUST is emulated
entry->ebx |= F(TSC_ADJUST);
- } else
+ entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
+ cpuid_mask(&entry->ecx, CPUID_7_ECX);
+ /* PKU is not yet implemented for shadow paging. */
+ if (!tdp_enabled)
+ entry->ecx &= ~F(PKU);
+ } else {
entry->ebx = 0;
+ entry->ecx = 0;
+ }
entry->eax = 0;
- entry->ecx = 0;
entry->edx = 0;
break;
}
@@ -514,7 +533,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
do_cpuid_1_ent(&entry[i], function, idx);
if (idx == 1) {
- entry[i].eax &= kvm_supported_word10_x86_features;
+ entry[i].eax &= kvm_cpuid_D_1_eax_x86_features;
entry[i].ebx = 0;
if (entry[i].eax & (F(XSAVES)|F(XSAVEC)))
entry[i].ebx =
@@ -564,10 +583,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->eax = min(entry->eax, 0x8000001a);
break;
case 0x80000001:
- entry->edx &= kvm_supported_word1_x86_features;
- cpuid_mask(&entry->edx, 1);
- entry->ecx &= kvm_supported_word6_x86_features;
- cpuid_mask(&entry->ecx, 6);
+ entry->edx &= kvm_cpuid_8000_0001_edx_x86_features;
+ cpuid_mask(&entry->edx, CPUID_8000_0001_EDX);
+ entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features;
+ cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX);
break;
case 0x80000007: /* Advanced power management */
/* invariant TSC is CPUID.80000007H:EDX[8] */
@@ -600,8 +619,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->eax = min(entry->eax, 0xC0000004);
break;
case 0xC0000001:
- entry->edx &= kvm_supported_word5_x86_features;
- cpuid_mask(&entry->edx, 5);
+ entry->edx &= kvm_cpuid_C000_0001_edx_x86_features;
+ cpuid_mask(&entry->edx, CPUID_C000_0001_EDX);
break;
case 3: /* Processor serial number */
case 5: /* MONITOR/MWAIT */
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 66a6581724ad..e17a74b1d852 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -80,6 +80,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
}
+static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ecx & bit(X86_FEATURE_PKU));
+}
+
static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index e1e89ee4af75..762cdf2595f9 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -84,6 +84,11 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
}
+static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu)
+{
+ return kvm_x86_ops->get_pkru(vcpu);
+}
+
static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
{
vcpu->arch.hflags |= HF_GUEST_MASK;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c512f095cdac..70e95d097ef1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -479,7 +479,7 @@ static bool spte_is_locklessly_modifiable(u64 spte)
static bool spte_has_volatile_bits(u64 spte)
{
/*
- * Always atomicly update spte if it can be updated
+ * Always atomically update spte if it can be updated
* out of mmu-lock, it can ensure dirty bit is not lost,
* also, it can help us to get a stable is_writable_pte()
* to ensure tlb flush is not missed.
@@ -550,7 +550,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
/*
* For the spte updated out of mmu-lock is safe, since
- * we always atomicly update it, see the comments in
+ * we always atomically update it, see the comments in
* spte_has_volatile_bits().
*/
if (spte_is_locklessly_modifiable(old_spte) &&
@@ -632,12 +632,12 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
* kvm_flush_remote_tlbs() IPI to all active vcpus.
*/
local_irq_disable();
- vcpu->mode = READING_SHADOW_PAGE_TABLES;
+
/*
* Make sure a following spte read is not reordered ahead of the write
* to vcpu->mode.
*/
- smp_mb();
+ smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
}
static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
@@ -647,8 +647,7 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
* reads to sptes. If it does, kvm_commit_zap_page() can see us
* OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
*/
- smp_mb();
- vcpu->mode = OUTSIDE_GUEST_MODE;
+ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
local_irq_enable();
}
@@ -2390,14 +2389,13 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
return;
/*
- * wmb: make sure everyone sees our modifications to the page tables
- * rmb: make sure we see changes to vcpu->mode
- */
- smp_mb();
-
- /*
- * Wait for all vcpus to exit guest mode and/or lockless shadow
- * page table walks.
+ * We need to make sure everyone sees our modifications to
+ * the page tables and see changes to vcpu->mode here. The barrier
+ * in the kvm_flush_remote_tlbs() achieves this. This pairs
+ * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
+ *
+ * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
+ * guest mode and/or lockless shadow page table walks.
*/
kvm_flush_remote_tlbs(kvm);
@@ -3923,6 +3921,81 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
}
}
+/*
+* PKU is an additional mechanism by which the paging controls access to
+* user-mode addresses based on the value in the PKRU register. Protection
+* key violations are reported through a bit in the page fault error code.
+* Unlike other bits of the error code, the PK bit is not known at the
+* call site of e.g. gva_to_gpa; it must be computed directly in
+* permission_fault based on two bits of PKRU, on some machine state (CR4,
+* CR0, EFER, CPL), and on other bits of the error code and the page tables.
+*
+* In particular the following conditions come from the error code, the
+* page tables and the machine state:
+* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
+* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
+* - PK is always zero if U=0 in the page tables
+* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
+*
+* The PKRU bitmask caches the result of these four conditions. The error
+* code (minus the P bit) and the page table's U bit form an index into the
+* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
+* with the two bits of the PKRU register corresponding to the protection key.
+* For the first three conditions above the bits will be 00, thus masking
+* away both AD and WD. For all reads or if the last condition holds, WD
+* only will be masked away.
+*/
+static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ bool ept)
+{
+ unsigned bit;
+ bool wp;
+
+ if (ept) {
+ mmu->pkru_mask = 0;
+ return;
+ }
+
+ /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
+ if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
+ mmu->pkru_mask = 0;
+ return;
+ }
+
+ wp = is_write_protection(vcpu);
+
+ for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
+ unsigned pfec, pkey_bits;
+ bool check_pkey, check_write, ff, uf, wf, pte_user;
+
+ pfec = bit << 1;
+ ff = pfec & PFERR_FETCH_MASK;
+ uf = pfec & PFERR_USER_MASK;
+ wf = pfec & PFERR_WRITE_MASK;
+
+ /* PFEC.RSVD is replaced by ACC_USER_MASK. */
+ pte_user = pfec & PFERR_RSVD_MASK;
+
+ /*
+ * Only need to check the access which is not an
+ * instruction fetch and is to a user page.
+ */
+ check_pkey = (!ff && pte_user);
+ /*
+ * write access is controlled by PKRU if it is a
+ * user access or CR0.WP = 1.
+ */
+ check_write = check_pkey && wf && (uf || wp);
+
+ /* PKRU.AD stops both read and write access. */
+ pkey_bits = !!check_pkey;
+ /* PKRU.WD stops write access. */
+ pkey_bits |= (!!check_write) << 1;
+
+ mmu->pkru_mask |= (pkey_bits & 3) << pfec;
+ }
+}
+
static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
{
unsigned root_level = mmu->root_level;
@@ -3941,6 +4014,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
reset_rsvds_bits_mask(vcpu, context);
update_permission_bitmask(vcpu, context, false);
+ update_pkru_bitmask(vcpu, context, false);
update_last_nonleaf_level(vcpu, context);
MMU_WARN_ON(!is_pae(vcpu));
@@ -3968,6 +4042,7 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
reset_rsvds_bits_mask(vcpu, context);
update_permission_bitmask(vcpu, context, false);
+ update_pkru_bitmask(vcpu, context, false);
update_last_nonleaf_level(vcpu, context);
context->page_fault = paging32_page_fault;
@@ -4026,6 +4101,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
}
update_permission_bitmask(vcpu, context, false);
+ update_pkru_bitmask(vcpu, context, false);
update_last_nonleaf_level(vcpu, context);
reset_tdp_shadow_zero_bits_mask(vcpu, context);
}
@@ -4078,6 +4154,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
context->direct_map = false;
update_permission_bitmask(vcpu, context, true);
+ update_pkru_bitmask(vcpu, context, true);
reset_rsvds_bits_mask_ept(vcpu, context, execonly);
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
@@ -4132,6 +4209,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
}
update_permission_bitmask(vcpu, g_context, false);
+ update_pkru_bitmask(vcpu, g_context, false);
update_last_nonleaf_level(vcpu, g_context);
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 58fe98a0a526..b70df72e2b33 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -10,10 +10,11 @@
#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
#define PT_WRITABLE_SHIFT 1
+#define PT_USER_SHIFT 2
#define PT_PRESENT_MASK (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
-#define PT_USER_MASK (1ULL << 2)
+#define PT_USER_MASK (1ULL << PT_USER_SHIFT)
#define PT_PWT_MASK (1ULL << 3)
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_SHIFT 5
@@ -141,11 +142,16 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
}
/*
- * Will a fault with a given page-fault error code (pfec) cause a permission
- * fault with the given access (in ACC_* format)?
+ * Check if a given access (described through the I/D, W/R and U/S bits of a
+ * page fault error code pfec) causes a permission fault with the given PTE
+ * access rights (in ACC_* format).
+ *
+ * Return zero if the access does not fault; return the page fault error code
+ * if the access faults.
*/
-static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- unsigned pte_access, unsigned pfec)
+static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ unsigned pte_access, unsigned pte_pkey,
+ unsigned pfec)
{
int cpl = kvm_x86_ops->get_cpl(vcpu);
unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
@@ -166,10 +172,32 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC);
int index = (pfec >> 1) +
(smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1));
+ bool fault = (mmu->permissions[index] >> pte_access) & 1;
+
+ WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
+ pfec |= PFERR_PRESENT_MASK;
+
+ if (unlikely(mmu->pkru_mask)) {
+ u32 pkru_bits, offset;
+
+ /*
+ * PKRU defines 32 bits, there are 16 domains and 2
+ * attribute bits per domain in pkru. pte_pkey is the
+ * index of the protection domain, so pte_pkey * 2 is
+ * is the index of the first bit for the domain.
+ */
+ pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3;
+
+ /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
+ offset = pfec - 1 +
+ ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
- WARN_ON(pfec & PFERR_RSVD_MASK);
+ pkru_bits &= mmu->pkru_mask >> offset;
+ pfec |= -pkru_bits & PFERR_PK_MASK;
+ fault |= (pkru_bits != 0);
+ }
- return (mmu->permissions[index] >> pte_access) & 1;
+ return -(uint32_t)fault & pfec;
}
void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
index 11f76436f74f..b431539c3714 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/page_track.c
@@ -142,12 +142,17 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm,
bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
enum kvm_page_track_mode mode)
{
- struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- int index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+ struct kvm_memory_slot *slot;
+ int index;
if (WARN_ON(!page_track_mode_is_valid(mode)))
return false;
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ if (!slot)
+ return false;
+
+ index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]);
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index e159a8185ad9..1d971c7553c3 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -257,6 +257,17 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
return 0;
}
+static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+ unsigned pkeys = 0;
+#if PTTYPE == 64
+ pte_t pte = {.pte = gpte};
+
+ pkeys = pte_flags_pkey(pte_flags(pte));
+#endif
+ return pkeys;
+}
+
/*
* Fetch a guest pte for a guest virtual address
*/
@@ -268,7 +279,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
pt_element_t pte;
pt_element_t __user *uninitialized_var(ptep_user);
gfn_t table_gfn;
- unsigned index, pt_access, pte_access, accessed_dirty;
+ unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey;
gpa_t pte_gpa;
int offset;
const int write_fault = access & PFERR_WRITE_MASK;
@@ -359,10 +370,10 @@ retry_walk:
walker->ptes[walker->level - 1] = pte;
} while (!is_last_gpte(mmu, walker->level, pte));
- if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) {
- errcode |= PFERR_PRESENT_MASK;
+ pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
+ errcode = permission_fault(vcpu, mmu, pte_access, pte_pkey, access);
+ if (unlikely(errcode))
goto error;
- }
gfn = gpte_to_gfn_lvl(pte, walker->level);
gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
@@ -949,6 +960,12 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return 0;
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+ /*
+ * Update spte before increasing tlbs_dirty to make
+ * sure no tlb flush is lost after spte is zapped; see
+ * the comments in kvm_flush_remote_tlbs().
+ */
+ smp_wmb();
vcpu->kvm->tlbs_dirty++;
continue;
}
@@ -964,6 +981,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
if (gfn != sp->gfns[i]) {
drop_spte(vcpu->kvm, &sp->spt[i]);
+ /*
+ * The same as above where we are doing
+ * prefetch_invalid_gpte().
+ */
+ smp_wmb();
vcpu->kvm->tlbs_dirty++;
continue;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 95070386d599..31346a3f20a5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1280,6 +1280,11 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_svm(vcpu)->vmcb->save.rflags = rflags;
}
+static u32 svm_get_pkru(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
switch (reg) {
@@ -4347,6 +4352,9 @@ static struct kvm_x86_ops svm_x86_ops = {
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
+
+ .get_pkru = svm_get_pkru,
+
.fpu_activate = svm_fpu_activate,
.fpu_deactivate = svm_fpu_deactivate,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 75173efccac5..ee1c8a93871c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -598,6 +598,10 @@ struct vcpu_vmx {
struct page *pml_pg;
u64 current_tsc_ratio;
+
+ bool guest_pkru_valid;
+ u32 guest_pkru;
+ u32 host_pkru;
};
enum segment_cache_field {
@@ -2107,6 +2111,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
} while (cmpxchg(&pi_desc->control, old.control,
new.control) != old.control);
}
+
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
@@ -2167,6 +2172,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}
vmx_vcpu_pi_load(vcpu, cpu);
+ vmx->host_pkru = read_pkru();
}
static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
@@ -2286,6 +2292,11 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmcs_writel(GUEST_RFLAGS, rflags);
}
+static u32 vmx_get_pkru(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->guest_pkru;
+}
+
static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@ -2712,8 +2723,15 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
} else
vmx->nested.nested_vmx_ept_caps = 0;
+ /*
+ * Old versions of KVM use the single-context version without
+ * checking for support, so declare that it is supported even
+ * though it is treated as global context. The alternative is
+ * not failing the single-context invvpid, and it is worse.
+ */
if (enable_vpid)
vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
+ VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |
VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
else
vmx->nested.nested_vmx_vpid_caps = 0;
@@ -3886,13 +3904,17 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!enable_unrestricted_guest && !is_paging(vcpu))
/*
- * SMEP/SMAP is disabled if CPU is in non-paging mode in
- * hardware. However KVM always uses paging mode without
- * unrestricted guest.
- * To emulate this behavior, SMEP/SMAP needs to be manually
- * disabled when guest switches to non-paging mode.
+ * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
+ * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
+ * to be manually disabled when guest switches to non-paging
+ * mode.
+ *
+ * If !enable_unrestricted_guest, the CPU is always running
+ * with CR0.PG=1 and CR4 needs to be modified.
+ * If enable_unrestricted_guest, the CPU automatically
+ * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
*/
- hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
+ hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
@@ -5506,7 +5528,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
return kvm_set_cr4(vcpu, val);
}
-/* called to set cr0 as approriate for clts instruction exit. */
+/* called to set cr0 as appropriate for clts instruction exit. */
static void handle_clts(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu)) {
@@ -7245,7 +7267,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
/* The value to write might be 32 or 64 bits, depending on L1's long
* mode, and eventually we need to write that into a field of several
* possible lengths. The code below first zero-extends the value to 64
- * bit (field_value), and then copies only the approriate number of
+ * bit (field_value), and then copies only the appropriate number of
* bits into the vmcs12 field.
*/
u64 field_value = 0;
@@ -7399,6 +7421,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
if (!(types & (1UL << type))) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ skip_emulated_instruction(vcpu);
return 1;
}
@@ -7457,6 +7480,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
if (!(types & (1UL << type))) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ skip_emulated_instruction(vcpu);
return 1;
}
@@ -7473,12 +7497,17 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
}
switch (type) {
+ case VMX_VPID_EXTENT_SINGLE_CONTEXT:
+ /*
+ * Old versions of KVM use the single-context version so we
+ * have to support it; just treat it the same as all-context.
+ */
case VMX_VPID_EXTENT_ALL_CONTEXT:
__vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
nested_vmx_succeed(vcpu);
break;
default:
- /* Trap single context invalidation invvpid calls */
+ /* Trap individual address invalidation invvpid calls */
BUG_ON(1);
break;
}
@@ -8621,6 +8650,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
+ if (vmx->guest_pkru_valid)
+ __write_pkru(vmx->guest_pkru);
+
atomic_switch_perf_msrs(vmx);
debugctlmsr = get_debugctlmsr();
@@ -8761,6 +8793,20 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
/*
+ * eager fpu is enabled if PKEY is supported and CR4 is switched
+ * back on host, so it is safe to read guest PKRU from current
+ * XSAVE.
+ */
+ if (boot_cpu_has(X86_FEATURE_OSPKE)) {
+ vmx->guest_pkru = __read_pkru();
+ if (vmx->guest_pkru != vmx->host_pkru) {
+ vmx->guest_pkru_valid = true;
+ __write_pkru(vmx->host_pkru);
+ } else
+ vmx->guest_pkru_valid = false;
+ }
+
+ /*
* the KVM_REQ_EVENT optimization bit is only on for one entry, and if
* we did not inject a still-pending event to L1 now because of
* nested_run_pending, we need to re-enable this bit.
@@ -10884,6 +10930,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
+
+ .get_pkru = vmx_get_pkru,
+
.fpu_activate = vmx_fpu_activate,
.fpu_deactivate = vmx_fpu_deactivate,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7236bd3a4c3d..742d0f7d3556 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -723,7 +723,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long old_cr4 = kvm_read_cr4(vcpu);
unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
- X86_CR4_SMEP | X86_CR4_SMAP;
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
if (cr4 & CR4_RESERVED_BITS)
return 1;
@@ -740,6 +740,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
return 1;
+ if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE))
+ return 1;
+
if (is_long_mode(vcpu)) {
if (!(cr4 & X86_CR4_PAE))
return 1;
@@ -765,7 +768,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_mmu_reset_context(vcpu);
- if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
+ if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
kvm_update_cpuid(vcpu);
return 0;
@@ -1559,7 +1562,7 @@ static cycle_t read_tsc(void)
/*
* GCC likes to generate cmov here, but this branch is extremely
- * predictable (it's just a funciton of time and the likely is
+ * predictable (it's just a function of time and the likely is
* very likely) and there's a data dependence, so force GCC
* to generate a branch instead. I don't barrier() because
* we don't actually need a barrier, and if this function
@@ -4326,9 +4329,14 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
| (write ? PFERR_WRITE_MASK : 0);
+ /*
+ * currently PKRU is only applied to ept enabled guest so
+ * there is no pkey in EPT page table for L1 guest or EPT
+ * shadow page table for L2 guest.
+ */
if (vcpu_match_mmio_gva(vcpu, gva)
&& !permission_fault(vcpu, vcpu->arch.walk_mmu,
- vcpu->arch.access, access)) {
+ vcpu->arch.access, 0, access)) {
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
(gva & (PAGE_SIZE - 1));
trace_vcpu_match_mmio(gva, *gpa, write, false);
@@ -6588,8 +6596,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- /* We should set ->mode before check ->requests,
- * see the comment in make_all_cpus_request.
+ /*
+ * We should set ->mode before check ->requests,
+ * Please see the comment in kvm_make_all_cpus_request.
+ * This also orders the write to mode from any reads
+ * to the page tables done while the VCPU is running.
+ * Please see the comment in kvm_flush_remote_tlbs.
*/
smp_mb__after_srcu_read_unlock();
@@ -7123,7 +7135,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
- if (sregs->cr4 & X86_CR4_OSXSAVE)
+ if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE))
kvm_update_cpuid(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 007940faa5c6..7ce3634ab5fe 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -183,7 +183,8 @@ bool kvm_vector_hashing_enabled(void);
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
- | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512)
+ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
+ | XFEATURE_MASK_PKRU)
extern u64 host_xcr0;
extern u64 kvm_supported_xcr0(void);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index a501fa25da41..72a576752a7e 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,6 +2,9 @@
# Makefile for x86 specific library files.
#
+# Produces uninteresting flaky coverage.
+KCOV_INSTRUMENT_delay.o := n
+
inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
quiet_cmd_inat_tables = GEN $@
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index cbb8ee5830ff..2ec0b0abbfaa 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,6 +1,7 @@
/* Copyright 2002 Andi Kleen */
#include <linux/linkage.h>
+#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
@@ -268,16 +269,16 @@ ENTRY(memcpy_mcsafe)
decl %ecx
jnz .L_copy_trailing_bytes
- /* Copy successful. Return true */
+ /* Copy successful. Return zero */
.L_done_memcpy_trap:
xorq %rax, %rax
ret
ENDPROC(memcpy_mcsafe)
.section .fixup, "ax"
- /* Return false for any failure */
+ /* Return -EFAULT for any failure */
.L_memcpy_mcsafe_fail:
- mov $1, %rax
+ mov $-EFAULT, %rax
ret
.previous
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index c9c81227ea37..e1229ecd2a82 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -9,7 +9,7 @@
/*
* ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is
- * simpler and shorter than the orignal function as well.
+ * simpler and shorter than the original function as well.
*
* rdi destination
* rsi value (char)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 67cf2e1e557b..f98913258c63 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,3 +1,6 @@
+# Kernel does not boot with instrumentation of tlb.c.
+KCOV_INSTRUMENT_tlb.o := n
+
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o physaddr.o gup.o setup_nx.o
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 9dd7e4b7fcde..82447b3fba38 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -1,17 +1,10 @@
#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/sort.h>
#include <asm/uaccess.h>
typedef bool (*ex_handler_t)(const struct exception_table_entry *,
struct pt_regs *, int);
static inline unsigned long
-ex_insn_addr(const struct exception_table_entry *x)
-{
- return (unsigned long)&x->insn + x->insn;
-}
-static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
return (unsigned long)&x->fixup + x->fixup;
@@ -110,104 +103,3 @@ int __init early_fixup_exception(unsigned long *ip)
*ip = new_ip;
return 1;
}
-
-/*
- * Search one exception table for an entry corresponding to the
- * given instruction address, and return the address of the entry,
- * or NULL if none is found.
- * We use a binary search, and thus we assume that the table is
- * already sorted.
- */
-const struct exception_table_entry *
-search_extable(const struct exception_table_entry *first,
- const struct exception_table_entry *last,
- unsigned long value)
-{
- while (first <= last) {
- const struct exception_table_entry *mid;
- unsigned long addr;
-
- mid = ((last - first) >> 1) + first;
- addr = ex_insn_addr(mid);
- if (addr < value)
- first = mid + 1;
- else if (addr > value)
- last = mid - 1;
- else
- return mid;
- }
- return NULL;
-}
-
-/*
- * The exception table needs to be sorted so that the binary
- * search that we use to find entries in it works properly.
- * This is used both for the kernel exception table and for
- * the exception tables of modules that get loaded.
- *
- */
-static int cmp_ex(const void *a, const void *b)
-{
- const struct exception_table_entry *x = a, *y = b;
-
- /*
- * This value will always end up fittin in an int, because on
- * both i386 and x86-64 the kernel symbol-reachable address
- * space is < 2 GiB.
- *
- * This compare is only valid after normalization.
- */
- return x->insn - y->insn;
-}
-
-void sort_extable(struct exception_table_entry *start,
- struct exception_table_entry *finish)
-{
- struct exception_table_entry *p;
- int i;
-
- /* Convert all entries to being relative to the start of the section */
- i = 0;
- for (p = start; p < finish; p++) {
- p->insn += i;
- i += 4;
- p->fixup += i;
- i += 4;
- p->handler += i;
- i += 4;
- }
-
- sort(start, finish - start, sizeof(struct exception_table_entry),
- cmp_ex, NULL);
-
- /* Denormalize all entries */
- i = 0;
- for (p = start; p < finish; p++) {
- p->insn -= i;
- i += 4;
- p->fixup -= i;
- i += 4;
- p->handler -= i;
- i += 4;
- }
-}
-
-#ifdef CONFIG_MODULES
-/*
- * If the exception table is sorted, any referring to the module init
- * will be at the beginning or the end.
- */
-void trim_init_extable(struct module *m)
-{
- /*trim the beginning*/
- while (m->num_exentries &&
- within_module_init(ex_insn_addr(&m->extable[0]), m)) {
- m->extable++;
- m->num_exentries--;
- }
- /*trim the end*/
- while (m->num_exentries &&
- within_module_init(ex_insn_addr(&m->extable[m->num_exentries-1]), m))
- m->num_exentries--;
-}
-#endif /* CONFIG_MODULES */
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index a0a0b9861902..80476878eb4c 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -728,14 +728,14 @@ static inline unsigned long bd_entry_virt_space(struct mm_struct *mm)
/*
* This covers 32-bit emulation as well as 32-bit kernels
- * running on 64-bit harware.
+ * running on 64-bit hardware.
*/
if (!is_64bit_mm(mm))
return (4ULL * GB) / MPX_BD_NR_ENTRIES_32;
/*
* 'x86_virt_bits' returns what the hardware is capable
- * of, and returns the full >32-bit adddress space when
+ * of, and returns the full >32-bit address space when
* running 32-bit kernels on 64-bit hardware.
*/
virt_space = (1ULL << boot_cpu_data.x86_virt_bits);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 04e2e7144bee..faec01e7a17d 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -149,7 +149,7 @@ enum {
PAT_WT = 4, /* Write Through */
PAT_WP = 5, /* Write Protected */
PAT_WB = 6, /* Write Back (default) */
- PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
+ PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */
};
#define CM(c) (_PAGE_CACHE_MODE_ ## c)
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 1d2e6392f5fa..0e07e0968c3a 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -437,7 +437,8 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
void *data)
{
int cpu = (unsigned long)data;
- switch (action) {
+
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_FAILED:
case CPU_ONLINE:
smp_call_function_single(cpu, nmi_cpu_up, NULL, 0);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c b/arch/x86/platform/intel-mid/device_libs/platform_bma023.c
index 0ae7f2ae2296..c26cf393d35a 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_bma023.c
@@ -1,5 +1,5 @@
/*
- * platform_bma023.c: bma023 platform data initilization file
+ * platform_bma023.c: bma023 platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
*
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
index 69a783689d21..c259fb6c8f4f 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
@@ -1,5 +1,5 @@
/*
- * platform_emc1403.c: emc1403 platform data initilization file
+ * platform_emc1403.c: emc1403 platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
index dccae6b0413f..52534ec29765 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
@@ -1,5 +1,5 @@
/*
- * platform_gpio_keys.c: gpio_keys platform data initilization file
+ * platform_gpio_keys.c: gpio_keys platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
index 54226de7541a..a35cf912de43 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
@@ -1,5 +1,5 @@
/*
- * platform_lis331.c: lis331 platform data initilization file
+ * platform_lis331.c: lis331 platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
index 2c8acbc1e9ad..6e075afa7877 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
@@ -1,5 +1,5 @@
/*
- * platform_max7315.c: max7315 platform data initilization file
+ * platform_max7315.c: max7315 platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
index cfe9a47a1e87..ee22864bbc2f 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
@@ -1,5 +1,5 @@
/*
- * platform_mpu3050.c: mpu3050 platform data initilization file
+ * platform_mpu3050.c: mpu3050 platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic.c b/arch/x86/platform/intel-mid/device_libs/platform_msic.c
index 9f4a775a69d6..e421106c11cf 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic.c
@@ -1,5 +1,5 @@
/*
- * platform_msic.c: MSIC platform data initilization file
+ * platform_msic.c: MSIC platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c
index 29629397d2b3..cb3490ecb341 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c
@@ -1,5 +1,5 @@
/*
- * platform_msic_audio.c: MSIC audio platform data initilization file
+ * platform_msic_audio.c: MSIC audio platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c
index f446c33df1a8..4f72193939a6 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c
@@ -1,5 +1,5 @@
/*
- * platform_msic_battery.c: MSIC battery platform data initilization file
+ * platform_msic_battery.c: MSIC battery platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c
index 2a4f7b1dd917..70de5b531ba0 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c
@@ -1,5 +1,5 @@
/*
- * platform_msic_gpio.c: MSIC GPIO platform data initilization file
+ * platform_msic_gpio.c: MSIC GPIO platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c
index 6497111ddb54..3d7c2011b6cf 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c
@@ -1,5 +1,5 @@
/*
- * platform_msic_ocd.c: MSIC OCD platform data initilization file
+ * platform_msic_ocd.c: MSIC OCD platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c
index 83a3459bc337..038f618fbc52 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c
@@ -1,5 +1,5 @@
/*
- * platform_msic_power_btn.c: MSIC power btn platform data initilization file
+ * platform_msic_power_btn.c: MSIC power btn platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c
index a351878b96bc..114a5755b1e4 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c
@@ -1,5 +1,5 @@
/*
- * platform_msic_thermal.c: msic_thermal platform data initilization file
+ * platform_msic_thermal.c: msic_thermal platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c
index 65c2a9a19db4..e30cb62e3300 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c
@@ -1,5 +1,5 @@
/*
- * platform_pmic_gpio.c: PMIC GPIO platform data initilization file
+ * platform_pmic_gpio.c: PMIC GPIO platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
index 740fc757050c..b1526b95fd43 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
@@ -1,5 +1,5 @@
/*
- * platform_tc35876x.c: tc35876x platform data initilization file
+ * platform_tc35876x.c: tc35876x platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
index 33be0b3be6e1..4f41372ce400 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
@@ -1,5 +1,5 @@
/*
- * platform_tca6416.c: tca6416 platform data initilization file
+ * platform_tca6416.c: tca6416 platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/purgatory/stack.S b/arch/x86/purgatory/stack.S
index 3cefba1fefc8..50a4147f91fb 100644
--- a/arch/x86/purgatory/stack.S
+++ b/arch/x86/purgatory/stack.S
@@ -8,7 +8,7 @@
*/
/* A stack for the loaded kernel.
- * Seperate and in the data section so it can be prepopulated.
+ * Separate and in the data section so it can be prepopulated.
*/
.data
.balign 4096
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 053abe7b0ef7..b95964610ea7 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -9,6 +9,9 @@
KASAN_SANITIZE := n
OBJECT_FILES_NON_STANDARD := y
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
+
always := realmode.bin realmode.relocs
wakeup-objs := wakeup_asm.o wakemain.o video-mode.o
diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c
index d5644bbe8cba..9fd24846d094 100644
--- a/arch/x86/video/fbdev.c
+++ b/arch/x86/video/fbdev.c
@@ -14,26 +14,24 @@
int fb_is_primary_device(struct fb_info *info)
{
struct device *device = info->device;
- struct pci_dev *pci_dev = NULL;
struct pci_dev *default_device = vga_default_device();
- struct resource *res = NULL;
+ struct pci_dev *pci_dev;
+ struct resource *res;
- if (device)
- pci_dev = to_pci_dev(device);
-
- if (!pci_dev)
+ if (!device || !dev_is_pci(device))
return 0;
+ pci_dev = to_pci_dev(device);
+
if (default_device) {
if (pci_dev == default_device)
return 1;
- else
- return 0;
+ return 0;
}
- res = &pci_dev->resource[PCI_ROM_RESOURCE];
+ res = pci_dev->resource + PCI_ROM_RESOURCE;
- if (res && res->flags & IORESOURCE_ROM_SHADOW)
+ if (res->flags & IORESOURCE_ROM_SHADOW)
return 1;
return 0;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 2379a5a88504..880862c7d9dd 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -962,7 +962,7 @@ static void xen_load_sp0(struct tss_struct *tss,
tss->x86_tss.sp0 = thread->sp0;
}
-static void xen_set_iopl_mask(unsigned mask)
+void xen_set_iopl_mask(unsigned mask)
{
struct physdev_set_iopl set_iopl;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index c913ca4f6958..478a2de543a5 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1256,7 +1256,7 @@ static void __init xen_pagetable_cleanhighmap(void)
xen_cleanhighmap(addr, addr + size);
xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
#ifdef DEBUG
- /* This is superflous and is not neccessary, but you know what
+ /* This is superfluous and is not necessary, but you know what
* lets do it. The MODULES_VADDR -> MODULES_END should be clear of
* anything at this stage. */
xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
@@ -1474,7 +1474,7 @@ static void xen_write_cr3(unsigned long cr3)
/*
* At the start of the day - when Xen launches a guest, it has already
* built pagetables for the guest. We diligently look over them
- * in xen_setup_kernel_pagetable and graft as appropiate them in the
+ * in xen_setup_kernel_pagetable and graft as appropriate them in the
* init_level4_pgt and its friends. Then when we are happy we load
* the new init_level4_pgt - and continue on.
*
@@ -2792,7 +2792,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
struct remap_data *rmd = data;
pte_t pte = pte_mkspecial(mfn_pte(*rmd->mfn, rmd->prot));
- /* If we have a contigious range, just update the mfn itself,
+ /* If we have a contiguous range, just update the mfn itself,
else update pointer to be "next mfn". */
if (rmd->contiguous)
(*rmd->mfn)++;
@@ -2833,7 +2833,7 @@ static int do_remap_gfn(struct vm_area_struct *vma,
rmd.mfn = gfn;
rmd.prot = prot;
- /* We use the err_ptr to indicate if there we are doing a contigious
+ /* We use the err_ptr to indicate if there we are doing a contiguous
* mapping or a discontigious mapping. */
rmd.contiguous = !err_ptr;
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index de93b20fa0d2..7f8d8abf4c1a 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -26,7 +26,7 @@
(1 << XENFEAT_auto_translated_physmap) | \
(1 << XENFEAT_supervisor_mode_kernel) | \
(1 << XENFEAT_hvm_callback_vector))
-/* The XENFEAT_writable_page_tables is not stricly neccessary as we set that
+/* The XENFEAT_writable_page_tables is not stricly necessary as we set that
* up regardless whether this CONFIG option is enabled or not, but it
* clarifies what the right flags need to be.
*/
OpenPOWER on IntegriCloud