diff options
Diffstat (limited to 'arch/x86')
94 files changed, 1451 insertions, 358 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8b680a5cb25b..2dc18605831f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,6 +28,7 @@ config X86 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_SG_CHAIN @@ -1209,6 +1210,15 @@ config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE +config PERF_EVENTS_AMD_POWER + depends on PERF_EVENTS && CPU_SUP_AMD + tristate "AMD Processor Power Reporting Mechanism" + ---help--- + Provide power reporting mechanism support for AMD processors. + Currently, it leverages X86_FEATURE_ACC_POWER + (CPUID Fn8000_0007_EDX[12]) interface to calculate the + average power consumption on Family 15h processors. + config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" ---help--- diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 0bf6749522d9..b1ef9e489084 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -12,6 +12,13 @@ KASAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Kernel does not boot with kcov instrumentation here. +# One of the problems observed was insertion of __sanitizer_cov_trace_pc() +# callback into middle of per-cpu data enabling code. Thus the callback observed +# inconsistent state and crashed. We are interested mostly in syscall coverage, +# so boot code is not interesting anyway. +KCOV_INSTRUMENT := n + # If you want to preset the SVGA mode, uncomment the next line and # set SVGA_MODE to whatever number you want. # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 5e1d26e09407..6915ff2bd996 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -19,6 +19,9 @@ KASAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index f9fb859c98b9..6874da5f67fc 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -7,6 +7,9 @@ KASAN_SANITIZE := n UBSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + VDSO64-$(CONFIG_X86_64) := y VDSOX32-$(CONFIG_X86_X32_ABI) := y VDSO32-$(CONFIG_X86_32) := y diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 1a50e09c945b..03c3eb77bfce 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -178,7 +178,7 @@ notrace static cycle_t vread_tsc(void) /* * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a funciton of time and the likely is + * predictable (it's just a function of time and the likely is * very likely) and there's a data dependence, so force GCC * to generate a branch instead. I don't barrier() because * we don't actually need a barrier, and if this function diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index fdfea1511cc0..f59618a39990 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,6 +1,7 @@ obj-y += core.o obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o amd/uncore.o +obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += amd/power.o obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 51087c29b2c2..3ea25c3917c0 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -376,7 +376,13 @@ static void perf_ibs_start(struct perf_event *event, int flags) hwc->state = 0; perf_ibs_set_period(perf_ibs, hwc, &period); + /* + * Set STARTED before enabling the hardware, such that + * a subsequent NMI must observe it. Then clear STOPPING + * such that we don't consume NMIs by accident. + */ set_bit(IBS_STARTED, pcpu->state); + clear_bit(IBS_STOPPING, pcpu->state); perf_ibs_enable_event(perf_ibs, hwc, period >> 4); perf_event_update_userpage(event); @@ -390,7 +396,7 @@ static void perf_ibs_stop(struct perf_event *event, int flags) u64 config; int stopping; - stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); + stopping = test_bit(IBS_STARTED, pcpu->state); if (!stopping && (hwc->state & PERF_HES_UPTODATE)) return; @@ -398,8 +404,24 @@ static void perf_ibs_stop(struct perf_event *event, int flags) rdmsrl(hwc->config_base, config); if (stopping) { + /* + * Set STOPPING before disabling the hardware, such that it + * must be visible to NMIs the moment we clear the EN bit, + * at which point we can generate an !VALID sample which + * we need to consume. + */ set_bit(IBS_STOPPING, pcpu->state); perf_ibs_disable_event(perf_ibs, hwc, config); + /* + * Clear STARTED after disabling the hardware; if it were + * cleared before an NMI hitting after the clear but before + * clearing the EN bit might think it a spurious NMI and not + * handle it. + * + * Clearing it after, however, creates the problem of the NMI + * handler seeing STARTED but not having a valid sample. + */ + clear_bit(IBS_STARTED, pcpu->state); WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; } @@ -527,20 +549,24 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) u64 *buf, *config, period; if (!test_bit(IBS_STARTED, pcpu->state)) { +fail: /* * Catch spurious interrupts after stopping IBS: After * disabling IBS there could be still incoming NMIs * with samples that even have the valid bit cleared. * Mark all this NMIs as handled. */ - return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0; + if (test_and_clear_bit(IBS_STOPPING, pcpu->state)) + return 1; + + return 0; } msr = hwc->config_base; buf = ibs_data.regs; rdmsrl(msr, *buf); if (!(*buf++ & perf_ibs->valid_mask)) - return 0; + goto fail; config = &ibs_data.regs[0]; perf_ibs_event_update(perf_ibs, event, config); @@ -599,7 +625,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) throttle = perf_event_overflow(event, &data, ®s); out: if (throttle) - perf_ibs_disable_event(perf_ibs, hwc, *config); + perf_ibs_stop(event, 0); else perf_ibs_enable_event(perf_ibs, hwc, period >> 4); @@ -611,6 +637,7 @@ out: static int perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) { + u64 stamp = sched_clock(); int handled = 0; handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); @@ -619,6 +646,8 @@ perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) if (handled) inc_irq_stat(apic_perf_irqs); + perf_sample_event_took(sched_clock() - stamp); + return handled; } NOKPROBE_SYMBOL(perf_ibs_nmi_handler); diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index 635e5eba0caf..40625ca7a190 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -118,6 +118,11 @@ static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = { AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"), AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"), AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"), + AMD_IOMMU_EVENT_DESC(ign_rd_wr_mmio_1ff8h, "csource=0x14"), + AMD_IOMMU_EVENT_DESC(vapic_int_non_guest, "csource=0x15"), + AMD_IOMMU_EVENT_DESC(vapic_int_guest, "csource=0x16"), + AMD_IOMMU_EVENT_DESC(smi_recv, "csource=0x17"), + AMD_IOMMU_EVENT_DESC(smi_blk, "csource=0x18"), { /* end: all zeroes */ }, }; diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c new file mode 100644 index 000000000000..55a3529dbf12 --- /dev/null +++ b/arch/x86/events/amd/power.c @@ -0,0 +1,353 @@ +/* + * Performance events - AMD Processor Power Reporting Mechanism + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Huang Rui <ray.huang@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/perf_event.h> +#include <asm/cpu_device_id.h> +#include "../perf_event.h" + +#define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a +#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b +#define MSR_F15H_PTSC 0xc0010280 + +/* Event code: LSB 8 bits, passed in attr->config any other bit is reserved. */ +#define AMD_POWER_EVENT_MASK 0xFFULL + +/* + * Accumulated power status counters. + */ +#define AMD_POWER_EVENTSEL_PKG 1 + +/* + * The ratio of compute unit power accumulator sample period to the + * PTSC period. + */ +static unsigned int cpu_pwr_sample_ratio; + +/* Maximum accumulated power of a compute unit. */ +static u64 max_cu_acc_power; + +static struct pmu pmu_class; + +/* + * Accumulated power represents the sum of each compute unit's (CU) power + * consumption. On any core of each CU we read the total accumulated power from + * MSR_F15H_CU_PWR_ACCUMULATOR. cpu_mask represents CPU bit map of all cores + * which are picked to measure the power for the CUs they belong to. + */ +static cpumask_t cpu_mask; + +static void event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_pwr_acc, new_pwr_acc, prev_ptsc, new_ptsc; + u64 delta, tdelta; + + prev_pwr_acc = hwc->pwr_acc; + prev_ptsc = hwc->ptsc; + rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, new_pwr_acc); + rdmsrl(MSR_F15H_PTSC, new_ptsc); + + /* + * Calculate the CU power consumption over a time period, the unit of + * final value (delta) is micro-Watts. Then add it to the event count. + */ + if (new_pwr_acc < prev_pwr_acc) { + delta = max_cu_acc_power + new_pwr_acc; + delta -= prev_pwr_acc; + } else + delta = new_pwr_acc - prev_pwr_acc; + + delta *= cpu_pwr_sample_ratio * 1000; + tdelta = new_ptsc - prev_ptsc; + + do_div(delta, tdelta); + local64_add(delta, &event->count); +} + +static void __pmu_event_start(struct perf_event *event) +{ + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + + rdmsrl(MSR_F15H_PTSC, event->hw.ptsc); + rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, event->hw.pwr_acc); +} + +static void pmu_event_start(struct perf_event *event, int mode) +{ + __pmu_event_start(event); +} + +static void pmu_event_stop(struct perf_event *event, int mode) +{ + struct hw_perf_event *hwc = &event->hw; + + /* Mark event as deactivated and stopped. */ + if (!(hwc->state & PERF_HES_STOPPED)) + hwc->state |= PERF_HES_STOPPED; + + /* Check if software counter update is necessary. */ + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of an event + * that we are disabling: + */ + event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int pmu_event_add(struct perf_event *event, int mode) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) + __pmu_event_start(event); + + return 0; +} + +static void pmu_event_del(struct perf_event *event, int flags) +{ + pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config & AMD_POWER_EVENT_MASK; + + /* Only look at AMD power events. */ + if (event->attr.type != pmu_class.type) + return -ENOENT; + + /* Unsupported modes and filters. */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + /* no sampling */ + event->attr.sample_period) + return -EINVAL; + + if (cfg != AMD_POWER_EVENTSEL_PKG) + return -EINVAL; + + return 0; +} + +static void pmu_event_read(struct perf_event *event) +{ + event_update(event); +} + +static ssize_t +get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &cpu_mask); +} + +static DEVICE_ATTR(cpumask, S_IRUGO, get_attr_cpumask, NULL); + +static struct attribute *pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group pmu_attr_group = { + .attrs = pmu_attrs, +}; + +/* + * Currently it only supports to report the power of each + * processor/package. + */ +EVENT_ATTR_STR(power-pkg, power_pkg, "event=0x01"); + +EVENT_ATTR_STR(power-pkg.unit, power_pkg_unit, "mWatts"); + +/* Convert the count from micro-Watts to milli-Watts. */ +EVENT_ATTR_STR(power-pkg.scale, power_pkg_scale, "1.000000e-3"); + +static struct attribute *events_attr[] = { + EVENT_PTR(power_pkg), + EVENT_PTR(power_pkg_unit), + EVENT_PTR(power_pkg_scale), + NULL, +}; + +static struct attribute_group pmu_events_group = { + .name = "events", + .attrs = events_attr, +}; + +PMU_FORMAT_ATTR(event, "config:0-7"); + +static struct attribute *formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group pmu_format_group = { + .name = "format", + .attrs = formats_attr, +}; + +static const struct attribute_group *attr_groups[] = { + &pmu_attr_group, + &pmu_format_group, + &pmu_events_group, + NULL, +}; + +static struct pmu pmu_class = { + .attr_groups = attr_groups, + /* system-wide only */ + .task_ctx_nr = perf_invalid_context, + .event_init = pmu_event_init, + .add = pmu_event_add, + .del = pmu_event_del, + .start = pmu_event_start, + .stop = pmu_event_stop, + .read = pmu_event_read, +}; + +static void power_cpu_exit(int cpu) +{ + int target; + + if (!cpumask_test_and_clear_cpu(cpu, &cpu_mask)) + return; + + /* + * Find a new CPU on the same compute unit, if was set in cpumask + * and still some CPUs on compute unit. Then migrate event and + * context to new CPU. + */ + target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu); + if (target < nr_cpumask_bits) { + cpumask_set_cpu(target, &cpu_mask); + perf_pmu_migrate_context(&pmu_class, cpu, target); + } +} + +static void power_cpu_init(int cpu) +{ + int target; + + /* + * 1) If any CPU is set at cpu_mask in the same compute unit, do + * nothing. + * 2) If no CPU is set at cpu_mask in the same compute unit, + * set current STARTING CPU. + * + * Note: if there is a CPU aside of the new one already in the + * sibling mask, then it is also in cpu_mask. + */ + target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu); + if (target >= nr_cpumask_bits) + cpumask_set_cpu(cpu, &cpu_mask); +} + +static int +power_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: + case CPU_STARTING: + power_cpu_init(cpu); + break; + case CPU_DOWN_PREPARE: + power_cpu_exit(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block power_cpu_notifier_nb = { + .notifier_call = power_cpu_notifier, + .priority = CPU_PRI_PERF, +}; + +static const struct x86_cpu_id cpu_match[] = { + { .vendor = X86_VENDOR_AMD, .family = 0x15 }, + {}, +}; + +static int __init amd_power_pmu_init(void) +{ + int cpu, target, ret; + + if (!x86_match_cpu(cpu_match)) + return 0; + + if (!boot_cpu_has(X86_FEATURE_ACC_POWER)) + return -ENODEV; + + cpu_pwr_sample_ratio = cpuid_ecx(0x80000007); + + if (rdmsrl_safe(MSR_F15H_CU_MAX_PWR_ACCUMULATOR, &max_cu_acc_power)) { + pr_err("Failed to read max compute unit power accumulator MSR\n"); + return -ENODEV; + } + + cpu_notifier_register_begin(); + + /* Choose one online core of each compute unit. */ + for_each_online_cpu(cpu) { + target = cpumask_first(topology_sibling_cpumask(cpu)); + if (!cpumask_test_cpu(target, &cpu_mask)) + cpumask_set_cpu(target, &cpu_mask); + } + + ret = perf_pmu_register(&pmu_class, "power", -1); + if (WARN_ON(ret)) { + pr_warn("AMD Power PMU registration failed\n"); + goto out; + } + + __register_cpu_notifier(&power_cpu_notifier_nb); + + pr_info("AMD Power PMU detected\n"); + +out: + cpu_notifier_register_done(); + + return ret; +} +module_init(amd_power_pmu_init); + +static void __exit amd_power_pmu_exit(void) +{ + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&power_cpu_notifier_nb); + cpu_notifier_register_done(); + + perf_pmu_unregister(&pmu_class); +} +module_exit(amd_power_pmu_exit); + +MODULE_AUTHOR("Huang Rui <ray.huang@amd.com>"); +MODULE_DESCRIPTION("AMD Processor Power Reporting Mechanism"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 9b6ad08aa51a..041e442a3e28 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1602,8 +1602,7 @@ __init struct attribute **merge_attr(struct attribute **a, struct attribute **b) return new; } -ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, - char *page) +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = \ container_of(attr, struct perf_pmu_events_attr, attr); @@ -1615,6 +1614,7 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, return x86_pmu.events_sysfs_show(page, config); } +EXPORT_SYMBOL_GPL(events_sysfs_show); EVENT_ATTR(cpu-cycles, CPU_CYCLES ); EVENT_ATTR(instructions, INSTRUCTIONS ); diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 93cb412a5579..7b5fd811ef45 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -13,8 +13,16 @@ #define MSR_IA32_QM_CTR 0x0c8e #define MSR_IA32_QM_EVTSEL 0x0c8d +#define MBM_CNTR_WIDTH 24 +/* + * Guaranteed time in ms as per SDM where MBM counters will not overflow. + */ +#define MBM_CTR_OVERFLOW_TIME 1000 + static u32 cqm_max_rmid = -1; static unsigned int cqm_l3_scale; /* supposedly cacheline size */ +static bool cqm_enabled, mbm_enabled; +unsigned int mbm_socket_max; /** * struct intel_pqr_state - State cache for the PQR MSR @@ -42,8 +50,37 @@ struct intel_pqr_state { * interrupts disabled, which is sufficient for the protection. */ static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); +static struct hrtimer *mbm_timers; +/** + * struct sample - mbm event's (local or total) data + * @total_bytes #bytes since we began monitoring + * @prev_msr previous value of MSR + */ +struct sample { + u64 total_bytes; + u64 prev_msr; +}; /* + * samples profiled for total memory bandwidth type events + */ +static struct sample *mbm_total; +/* + * samples profiled for local memory bandwidth type events + */ +static struct sample *mbm_local; + +#define pkg_id topology_physical_package_id(smp_processor_id()) +/* + * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array. + * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of + * rmids per socket, an example is given below + * RMID1 of Socket0: vrmid = 1 + * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1 + * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1 + */ +#define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid) +/* * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. * Also protects event->hw.cqm_rmid * @@ -65,9 +102,13 @@ static cpumask_t cqm_cpumask; #define RMID_VAL_ERROR (1ULL << 63) #define RMID_VAL_UNAVAIL (1ULL << 62) -#define QOS_L3_OCCUP_EVENT_ID (1 << 0) - -#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID +/* + * Event IDs are used to program IA32_QM_EVTSEL before reading event + * counter from IA32_QM_CTR + */ +#define QOS_L3_OCCUP_EVENT_ID 0x01 +#define QOS_MBM_TOTAL_EVENT_ID 0x02 +#define QOS_MBM_LOCAL_EVENT_ID 0x03 /* * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). @@ -211,6 +252,21 @@ static void __put_rmid(u32 rmid) list_add_tail(&entry->list, &cqm_rmid_limbo_lru); } +static void cqm_cleanup(void) +{ + int i; + + if (!cqm_rmid_ptrs) + return; + + for (i = 0; i < cqm_max_rmid; i++) + kfree(cqm_rmid_ptrs[i]); + + kfree(cqm_rmid_ptrs); + cqm_rmid_ptrs = NULL; + cqm_enabled = false; +} + static int intel_cqm_setup_rmid_cache(void) { struct cqm_rmid_entry *entry; @@ -218,7 +274,7 @@ static int intel_cqm_setup_rmid_cache(void) int r = 0; nr_rmids = cqm_max_rmid + 1; - cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) * + cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) * nr_rmids, GFP_KERNEL); if (!cqm_rmid_ptrs) return -ENOMEM; @@ -249,11 +305,9 @@ static int intel_cqm_setup_rmid_cache(void) mutex_unlock(&cache_mutex); return 0; -fail: - while (r--) - kfree(cqm_rmid_ptrs[r]); - kfree(cqm_rmid_ptrs); +fail: + cqm_cleanup(); return -ENOMEM; } @@ -281,9 +335,13 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) /* * Events that target same task are placed into the same cache group. + * Mark it as a multi event group, so that we update ->count + * for every event rather than just the group leader later. */ - if (a->hw.target == b->hw.target) + if (a->hw.target == b->hw.target) { + b->hw.is_group_event = true; return true; + } /* * Are we an inherited event? @@ -392,10 +450,26 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b) struct rmid_read { u32 rmid; + u32 evt_type; atomic64_t value; }; static void __intel_cqm_event_count(void *info); +static void init_mbm_sample(u32 rmid, u32 evt_type); +static void __intel_mbm_event_count(void *info); + +static bool is_mbm_event(int e) +{ + return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); +} + +static void cqm_mask_call(struct rmid_read *rr) +{ + if (is_mbm_event(rr->evt_type)) + on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1); + else + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1); +} /* * Exchange the RMID of a group of events. @@ -413,12 +487,12 @@ static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) */ if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { struct rmid_read rr = { - .value = ATOMIC64_INIT(0), .rmid = old_rmid, + .evt_type = group->attr.config, + .value = ATOMIC64_INIT(0), }; - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, - &rr, 1); + cqm_mask_call(&rr); local64_set(&group->count, atomic64_read(&rr.value)); } @@ -430,6 +504,22 @@ static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) raw_spin_unlock_irq(&cache_lock); + /* + * If the allocation is for mbm, init the mbm stats. + * Need to check if each event in the group is mbm event + * because there could be multiple type of events in the same group. + */ + if (__rmid_valid(rmid)) { + event = group; + if (is_mbm_event(event->attr.config)) + init_mbm_sample(rmid, event->attr.config); + + list_for_each_entry(event, head, hw.cqm_group_entry) { + if (is_mbm_event(event->attr.config)) + init_mbm_sample(rmid, event->attr.config); + } + } + return old_rmid; } @@ -837,6 +927,72 @@ static void intel_cqm_rmid_rotate(struct work_struct *work) schedule_delayed_work(&intel_cqm_rmid_work, delay); } +static u64 update_sample(unsigned int rmid, u32 evt_type, int first) +{ + struct sample *mbm_current; + u32 vrmid = rmid_2_index(rmid); + u64 val, bytes, shift; + u32 eventid; + + if (evt_type == QOS_MBM_LOCAL_EVENT_ID) { + mbm_current = &mbm_local[vrmid]; + eventid = QOS_MBM_LOCAL_EVENT_ID; + } else { + mbm_current = &mbm_total[vrmid]; + eventid = QOS_MBM_TOTAL_EVENT_ID; + } + + wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); + rdmsrl(MSR_IA32_QM_CTR, val); + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + return mbm_current->total_bytes; + + if (first) { + mbm_current->prev_msr = val; + mbm_current->total_bytes = 0; + return mbm_current->total_bytes; + } + + /* + * The h/w guarantees that counters will not overflow + * so long as we poll them at least once per second. + */ + shift = 64 - MBM_CNTR_WIDTH; + bytes = (val << shift) - (mbm_current->prev_msr << shift); + bytes >>= shift; + + bytes *= cqm_l3_scale; + + mbm_current->total_bytes += bytes; + mbm_current->prev_msr = val; + + return mbm_current->total_bytes; +} + +static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type) +{ + return update_sample(rmid, evt_type, 0); +} + +static void __intel_mbm_event_init(void *info) +{ + struct rmid_read *rr = info; + + update_sample(rr->rmid, rr->evt_type, 1); +} + +static void init_mbm_sample(u32 rmid, u32 evt_type) +{ + struct rmid_read rr = { + .rmid = rmid, + .evt_type = evt_type, + .value = ATOMIC64_INIT(0), + }; + + /* on each socket, init sample */ + on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1); +} + /* * Find a group and setup RMID. * @@ -849,6 +1005,7 @@ static void intel_cqm_setup_event(struct perf_event *event, bool conflict = false; u32 rmid; + event->hw.is_group_event = false; list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { rmid = iter->hw.cqm_rmid; @@ -856,6 +1013,8 @@ static void intel_cqm_setup_event(struct perf_event *event, /* All tasks in a group share an RMID */ event->hw.cqm_rmid = rmid; *group = iter; + if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) + init_mbm_sample(rmid, event->attr.config); return; } @@ -872,6 +1031,9 @@ static void intel_cqm_setup_event(struct perf_event *event, else rmid = __get_rmid(); + if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) + init_mbm_sample(rmid, event->attr.config); + event->hw.cqm_rmid = rmid; } @@ -893,7 +1055,10 @@ static void intel_cqm_event_read(struct perf_event *event) if (!__rmid_valid(rmid)) goto out; - val = __rmid_read(rmid); + if (is_mbm_event(event->attr.config)) + val = rmid_read_mbm(rmid, event->attr.config); + else + val = __rmid_read(rmid); /* * Ignore this reading on error states and do not update the value. @@ -924,10 +1089,100 @@ static inline bool cqm_group_leader(struct perf_event *event) return !list_empty(&event->hw.cqm_groups_entry); } +static void __intel_mbm_event_count(void *info) +{ + struct rmid_read *rr = info; + u64 val; + + val = rmid_read_mbm(rr->rmid, rr->evt_type); + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + return; + atomic64_add(val, &rr->value); +} + +static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer) +{ + struct perf_event *iter, *iter1; + int ret = HRTIMER_RESTART; + struct list_head *head; + unsigned long flags; + u32 grp_rmid; + + /* + * Need to cache_lock as the timer Event Select MSR reads + * can race with the mbm/cqm count() and mbm_init() reads. + */ + raw_spin_lock_irqsave(&cache_lock, flags); + + if (list_empty(&cache_groups)) { + ret = HRTIMER_NORESTART; + goto out; + } + + list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { + grp_rmid = iter->hw.cqm_rmid; + if (!__rmid_valid(grp_rmid)) + continue; + if (is_mbm_event(iter->attr.config)) + update_sample(grp_rmid, iter->attr.config, 0); + + head = &iter->hw.cqm_group_entry; + if (list_empty(head)) + continue; + list_for_each_entry(iter1, head, hw.cqm_group_entry) { + if (!iter1->hw.is_group_event) + break; + if (is_mbm_event(iter1->attr.config)) + update_sample(iter1->hw.cqm_rmid, + iter1->attr.config, 0); + } + } + + hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME)); +out: + raw_spin_unlock_irqrestore(&cache_lock, flags); + + return ret; +} + +static void __mbm_start_timer(void *info) +{ + hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME), + HRTIMER_MODE_REL_PINNED); +} + +static void __mbm_stop_timer(void *info) +{ + hrtimer_cancel(&mbm_timers[pkg_id]); +} + +static void mbm_start_timers(void) +{ + on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1); +} + +static void mbm_stop_timers(void) +{ + on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1); +} + +static void mbm_hrtimer_init(void) +{ + struct hrtimer *hr; + int i; + + for (i = 0; i < mbm_socket_max; i++) { + hr = &mbm_timers[i]; + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = mbm_hrtimer_handle; + } +} + static u64 intel_cqm_event_count(struct perf_event *event) { unsigned long flags; struct rmid_read rr = { + .evt_type = event->attr.config, .value = ATOMIC64_INIT(0), }; @@ -940,7 +1195,9 @@ static u64 intel_cqm_event_count(struct perf_event *event) return __perf_event_count(event); /* - * Only the group leader gets to report values. This stops us + * Only the group leader gets to report values except in case of + * multiple events in the same group, we still need to read the + * other events.This stops us * reporting duplicate values to userspace, and gives us a clear * rule for which task gets to report the values. * @@ -948,7 +1205,7 @@ static u64 intel_cqm_event_count(struct perf_event *event) * specific packages - we forfeit that ability when we create * task events. */ - if (!cqm_group_leader(event)) + if (!cqm_group_leader(event) && !event->hw.is_group_event) return 0; /* @@ -975,7 +1232,7 @@ static u64 intel_cqm_event_count(struct perf_event *event) if (!__rmid_valid(rr.rmid)) goto out; - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); + cqm_mask_call(&rr); raw_spin_lock_irqsave(&cache_lock, flags); if (event->hw.cqm_rmid == rr.rmid) @@ -1046,8 +1303,14 @@ static int intel_cqm_event_add(struct perf_event *event, int mode) static void intel_cqm_event_destroy(struct perf_event *event) { struct perf_event *group_other = NULL; + unsigned long flags; mutex_lock(&cache_mutex); + /* + * Hold the cache_lock as mbm timer handlers could be + * scanning the list of events. + */ + raw_spin_lock_irqsave(&cache_lock, flags); /* * If there's another event in this group... @@ -1079,6 +1342,14 @@ static void intel_cqm_event_destroy(struct perf_event *event) } } + raw_spin_unlock_irqrestore(&cache_lock, flags); + + /* + * Stop the mbm overflow timers when the last event is destroyed. + */ + if (mbm_enabled && list_empty(&cache_groups)) + mbm_stop_timers(); + mutex_unlock(&cache_mutex); } @@ -1086,11 +1357,13 @@ static int intel_cqm_event_init(struct perf_event *event) { struct perf_event *group = NULL; bool rotate = false; + unsigned long flags; if (event->attr.type != intel_cqm_pmu.type) return -ENOENT; - if (event->attr.config & ~QOS_EVENT_MASK) + if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) || + (event->attr.config > QOS_MBM_LOCAL_EVENT_ID)) return -EINVAL; /* unsupported modes and filters */ @@ -1110,9 +1383,21 @@ static int intel_cqm_event_init(struct perf_event *event) mutex_lock(&cache_mutex); + /* + * Start the mbm overflow timers when the first event is created. + */ + if (mbm_enabled && list_empty(&cache_groups)) + mbm_start_timers(); + /* Will also set rmid */ intel_cqm_setup_event(event, &group); + /* + * Hold the cache_lock as mbm timer handlers be + * scanning the list of events. + */ + raw_spin_lock_irqsave(&cache_lock, flags); + if (group) { list_add_tail(&event->hw.cqm_group_entry, &group->hw.cqm_group_entry); @@ -1131,6 +1416,7 @@ static int intel_cqm_event_init(struct perf_event *event) rotate = true; } + raw_spin_unlock_irqrestore(&cache_lock, flags); mutex_unlock(&cache_mutex); if (rotate) @@ -1145,6 +1431,16 @@ EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); +EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02"); +EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1"); +EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB"); +EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6"); + +EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03"); +EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1"); +EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB"); +EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6"); + static struct attribute *intel_cqm_events_attr[] = { EVENT_PTR(intel_cqm_llc), EVENT_PTR(intel_cqm_llc_pkg), @@ -1154,9 +1450,38 @@ static struct attribute *intel_cqm_events_attr[] = { NULL, }; +static struct attribute *intel_mbm_events_attr[] = { + EVENT_PTR(intel_cqm_total_bytes), + EVENT_PTR(intel_cqm_local_bytes), + EVENT_PTR(intel_cqm_total_bytes_pkg), + EVENT_PTR(intel_cqm_local_bytes_pkg), + EVENT_PTR(intel_cqm_total_bytes_unit), + EVENT_PTR(intel_cqm_local_bytes_unit), + EVENT_PTR(intel_cqm_total_bytes_scale), + EVENT_PTR(intel_cqm_local_bytes_scale), + NULL, +}; + +static struct attribute *intel_cmt_mbm_events_attr[] = { + EVENT_PTR(intel_cqm_llc), + EVENT_PTR(intel_cqm_total_bytes), + EVENT_PTR(intel_cqm_local_bytes), + EVENT_PTR(intel_cqm_llc_pkg), + EVENT_PTR(intel_cqm_total_bytes_pkg), + EVENT_PTR(intel_cqm_local_bytes_pkg), + EVENT_PTR(intel_cqm_llc_unit), + EVENT_PTR(intel_cqm_total_bytes_unit), + EVENT_PTR(intel_cqm_local_bytes_unit), + EVENT_PTR(intel_cqm_llc_scale), + EVENT_PTR(intel_cqm_total_bytes_scale), + EVENT_PTR(intel_cqm_local_bytes_scale), + EVENT_PTR(intel_cqm_llc_snapshot), + NULL, +}; + static struct attribute_group intel_cqm_events_group = { .name = "events", - .attrs = intel_cqm_events_attr, + .attrs = NULL, }; PMU_FORMAT_ATTR(event, "config:0-7"); @@ -1303,12 +1628,70 @@ static const struct x86_cpu_id intel_cqm_match[] = { {} }; +static void mbm_cleanup(void) +{ + if (!mbm_enabled) + return; + + kfree(mbm_local); + kfree(mbm_total); + mbm_enabled = false; +} + +static const struct x86_cpu_id intel_mbm_local_match[] = { + { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL }, + {} +}; + +static const struct x86_cpu_id intel_mbm_total_match[] = { + { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL }, + {} +}; + +static int intel_mbm_init(void) +{ + int ret = 0, array_size, maxid = cqm_max_rmid + 1; + + mbm_socket_max = topology_max_packages(); + array_size = sizeof(struct sample) * maxid * mbm_socket_max; + mbm_local = kmalloc(array_size, GFP_KERNEL); + if (!mbm_local) + return -ENOMEM; + + mbm_total = kmalloc(array_size, GFP_KERNEL); + if (!mbm_total) { + ret = -ENOMEM; + goto out; + } + + array_size = sizeof(struct hrtimer) * mbm_socket_max; + mbm_timers = kmalloc(array_size, GFP_KERNEL); + if (!mbm_timers) { + ret = -ENOMEM; + goto out; + } + mbm_hrtimer_init(); + +out: + if (ret) + mbm_cleanup(); + + return ret; +} + static int __init intel_cqm_init(void) { - char *str, scale[20]; + char *str = NULL, scale[20]; int i, cpu, ret; - if (!x86_match_cpu(intel_cqm_match)) + if (x86_match_cpu(intel_cqm_match)) + cqm_enabled = true; + + if (x86_match_cpu(intel_mbm_local_match) && + x86_match_cpu(intel_mbm_total_match)) + mbm_enabled = true; + + if (!cqm_enabled && !mbm_enabled) return -ENODEV; cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; @@ -1365,16 +1748,41 @@ static int __init intel_cqm_init(void) cqm_pick_event_reader(i); } - __perf_cpu_notifier(intel_cqm_cpu_notifier); + if (mbm_enabled) + ret = intel_mbm_init(); + if (ret && !cqm_enabled) + goto out; + + if (cqm_enabled && mbm_enabled) + intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr; + else if (!cqm_enabled && mbm_enabled) + intel_cqm_events_group.attrs = intel_mbm_events_attr; + else if (cqm_enabled && !mbm_enabled) + intel_cqm_events_group.attrs = intel_cqm_events_attr; ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - if (ret) + if (ret) { pr_err("Intel CQM perf registration failed: %d\n", ret); - else + goto out; + } + + if (cqm_enabled) pr_info("Intel CQM monitoring enabled\n"); + if (mbm_enabled) + pr_info("Intel MBM enabled\n"); + /* + * Register the hot cpu notifier once we are sure cqm + * is enabled to avoid notifier leak. + */ + __perf_cpu_notifier(intel_cqm_cpu_notifier); out: cpu_notifier_register_done(); + if (ret) { + kfree(str); + cqm_cleanup(); + mbm_cleanup(); + } return ret; } diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index ce7211a07c0b..8584b90d8e0b 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -570,11 +570,12 @@ int intel_pmu_drain_bts_buffer(void) * We will overwrite the from and to address before we output * the sample. */ + rcu_read_lock(); perf_prepare_sample(&header, &data, event, ®s); if (perf_output_begin(&handle, event, header.size * (top - base - skip))) - return 1; + goto unlock; for (at = base; at < top; at++) { /* Filter out any records that contain kernel addresses. */ @@ -593,6 +594,8 @@ int intel_pmu_drain_bts_buffer(void) /* There's new data available. */ event->hw.interrupts++; event->pending_kill = POLL_IN; +unlock: + rcu_read_unlock(); return 1; } diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 69dd11887dd1..6c3b7c1780c9 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -649,7 +649,7 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event) /* * return the type of control flow change at address "from" - * intruction is not necessarily a branch (in case of interrupt). + * instruction is not necessarily a branch (in case of interrupt). * * The branch type returned also includes the priv level of the * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index b834a3f55a01..70c93f9b03ac 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -711,6 +711,7 @@ static int __init rapl_pmu_init(void) rapl_pmu_events_group.attrs = rapl_events_cln_attr; break; case 63: /* Haswell-Server */ + case 79: /* Broadwell-Server */ apply_quirk = true; rapl_cntr_mask = RAPL_IDX_SRV; rapl_pmu_events_group.attrs = rapl_events_srv_attr; @@ -718,6 +719,7 @@ static int __init rapl_pmu_init(void) case 60: /* Haswell */ case 69: /* Haswell-Celeron */ case 61: /* Broadwell */ + case 71: /* Broadwell-H */ rapl_cntr_mask = RAPL_IDX_HSW; rapl_pmu_events_group.attrs = rapl_events_hsw_attr; break; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 93f6bd9bf761..ab2bcaaebe38 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -46,7 +46,6 @@ (SNBEP_PMON_CTL_EV_SEL_MASK | \ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ SNBEP_PMON_CTL_INVERT | \ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ @@ -148,7 +147,6 @@ /* IVBEP PCU */ #define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ SNBEP_PMON_CTL_EDGE_DET | \ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ @@ -258,7 +256,6 @@ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ SNBEP_PMON_CTL_EDGE_DET | \ SNBEP_CBO_PMON_CTL_TID_EN | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ SNBEP_PMON_CTL_INVERT | \ KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ @@ -472,7 +469,7 @@ static struct attribute *snbep_uncore_cbox_formats_attr[] = { }; static struct attribute *snbep_uncore_pcu_formats_attr[] = { - &format_attr_event_ext.attr, + &format_attr_event.attr, &format_attr_occ_sel.attr, &format_attr_edge.attr, &format_attr_inv.attr, @@ -1313,7 +1310,7 @@ static struct attribute *ivbep_uncore_cbox_formats_attr[] = { }; static struct attribute *ivbep_uncore_pcu_formats_attr[] = { - &format_attr_event_ext.attr, + &format_attr_event.attr, &format_attr_occ_sel.attr, &format_attr_edge.attr, &format_attr_thresh5.attr, diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 68155cafa8a1..ba6ef18528c9 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -272,7 +272,7 @@ struct cpu_hw_events { * events to select for counter rescheduling. * * Care must be taken as the rescheduling algorithm is O(n!) which - * will increase scheduling cycles for an over-commited system + * will increase scheduling cycles for an over-committed system * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros * and its counter masks must be kept at a minimum. */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 0899cfc8dfe8..98f25bbafac4 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -643,8 +643,8 @@ static inline void entering_irq(void) static inline void entering_ack_irq(void) { - ack_APIC_irq(); entering_irq(); + ack_APIC_irq(); } static inline void ipi_entering_ack_irq(void) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index acdee09228b3..ebb102e1bbc7 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -316,9 +316,10 @@ static inline bool is_x32_task(void) return false; } -static inline bool is_compat_task(void) +static inline bool in_compat_syscall(void) { return is_ia32_task() || is_x32_task(); } +#define in_compat_syscall in_compat_syscall /* override the generic impl */ #endif /* _ASM_X86_COMPAT_H */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 3d1a84383162..8f9afefd2dc5 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -94,7 +94,7 @@ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ #define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ -/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */ +#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ #define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ @@ -245,6 +245,8 @@ /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ #define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ +#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 24938852db30..a4820d4df617 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -52,13 +52,13 @@ int ftrace_int3_handler(struct pt_regs *regs); * this screws up the trace output when tracing a ia32 task. * Instead of reporting bogus syscalls, just do not trace them. * - * If the user realy wants these, then they should use the + * If the user really wants these, then they should use the * raw syscall tracepoints with filtering. */ #define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1 static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) { - if (is_compat_task()) + if (in_compat_syscall()) return true; return false; } diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 1815b736269d..b90e1053049b 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -141,6 +141,7 @@ struct irq_alloc_info { struct irq_cfg { unsigned int dest_apicid; u8 vector; + u8 old_vector; }; extern struct irq_cfg *irq_cfg(unsigned int irq); @@ -168,20 +169,6 @@ extern atomic_t irq_mis_count; extern void elcr_set_level_irq(unsigned int irq); -/* SMP */ -extern __visible void smp_apic_timer_interrupt(struct pt_regs *); -extern __visible void smp_spurious_interrupt(struct pt_regs *); -extern __visible void smp_x86_platform_ipi(struct pt_regs *); -extern __visible void smp_error_interrupt(struct pt_regs *); -#ifdef CONFIG_X86_IO_APIC -extern asmlinkage void smp_irq_move_cleanup_interrupt(void); -#endif -#ifdef CONFIG_SMP -extern __visible void smp_reschedule_interrupt(struct pt_regs *); -extern __visible void smp_call_function_interrupt(struct pt_regs *); -extern __visible void smp_call_function_single_interrupt(struct pt_regs *); -#endif - extern char irq_entries_start[]; #ifdef CONFIG_TRACING #define trace_irq_entries_start irq_entries_start diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 01c8b501cb6d..f62a9f37f79f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -84,7 +84,8 @@ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP)) + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \ + | X86_CR4_PKE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) @@ -187,12 +188,14 @@ enum { #define PFERR_USER_BIT 2 #define PFERR_RSVD_BIT 3 #define PFERR_FETCH_BIT 4 +#define PFERR_PK_BIT 5 #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) #define PFERR_USER_MASK (1U << PFERR_USER_BIT) #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) +#define PFERR_PK_MASK (1U << PFERR_PK_BIT) /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 @@ -335,6 +338,14 @@ struct kvm_mmu { */ u8 permissions[16]; + /* + * The pkru_mask indicates if protection key checks are needed. It + * consists of 16 domains indexed by page fault error code bits [4:1], + * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables. + * Each domain has 2 bits which are ANDed with AD and WD from PKRU. + */ + u32 pkru_mask; + u64 *pae_root; u64 *lm_root; @@ -874,6 +885,7 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + u32 (*get_pkru)(struct kvm_vcpu *vcpu); void (*fpu_activate)(struct kvm_vcpu *vcpu); void (*fpu_deactivate)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 93fb7c1cffda..7a79ee2778b3 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -42,14 +42,6 @@ struct saved_msrs { struct saved_msr *array; }; -static inline unsigned long long native_read_tscp(unsigned int *aux) -{ - unsigned long low, high; - asm volatile(".byte 0x0f,0x01,0xf9" - : "=a" (low), "=d" (high), "=c" (*aux)); - return low | ((u64)high << 32); -} - /* * both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A" * constraint has different meanings. For i386, "A" means exactly diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1ff49ec29ece..97f3242e133c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -107,6 +107,12 @@ static inline u32 read_pkru(void) return 0; } +static inline void write_pkru(u32 pkru) +{ + if (boot_cpu_has(X86_FEATURE_OSPKE)) + __write_pkru(pkru); +} + static inline int pte_young(pte_t pte) { return pte_flags(pte) & _PAGE_ACCESSED; diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index cad82c9c2fde..ceec86eb68e9 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -25,7 +25,7 @@ * This should be totally fair - if anything is waiting, a process that wants a * lock will go to the back of the queue. When the currently active lock is * released, if there's a writer at the front of the queue, then that and only - * that will be woken up; if there's a bunch of consequtive readers at the + * that will be woken up; if there's a bunch of consecutive readers at the * front, then they'll all be woken up, but no other readers will be. */ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index aee6e76e561e..d96d04377765 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -113,11 +113,27 @@ static inline u32 __read_pkru(void) : "c" (ecx)); return pkru; } + +static inline void __write_pkru(u32 pkru) +{ + u32 ecx = 0, edx = 0; + + /* + * "wrpkru" instruction. Loads contents in EAX to PKRU, + * requires that ecx = edx = 0. + */ + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (pkru), "c"(ecx), "d"(edx)); +} #else static inline u32 __read_pkru(void) { return 0; } + +static inline void __write_pkru(u32 pkru) +{ +} #endif static inline void native_wbinvd(void) diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index ca6ba3607705..90dbbd9666d4 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -87,9 +87,9 @@ int strcmp(const char *cs, const char *ct); * * Low level memory copy function that catches machine checks * - * Return true for success, false for fail + * Return 0 for success, -EFAULT for fail */ -bool memcpy_mcsafe(void *dst, const void *src, size_t cnt); +int memcpy_mcsafe(void *dst, const void *src, size_t cnt); #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 88bff6dd23ad..a969ae607be8 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -105,9 +105,8 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un struct exception_table_entry { int insn, fixup, handler; }; -/* This is not the generic standard exception_table_entry format */ -#define ARCH_HAS_SORT_EXTABLE -#define ARCH_HAS_SEARCH_EXTABLE + +#define ARCH_HAS_RELATIVE_EXTABLE extern int fixup_exception(struct pt_regs *regs, int trapnr); extern bool ex_has_fault_handler(unsigned long ip); diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 8b2d4bea9962..39171b3646bb 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -62,4 +62,6 @@ void xen_arch_register_cpu(int num); void xen_arch_unregister_cpu(int num); #endif +extern void xen_set_iopl_mask(unsigned mask); + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d5fb0871aba3..adaae2c781c1 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -25,6 +25,12 @@ OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y +# If instrumentation of this dir is enabled, boot hangs during first second. +# Probably could be more selective here, but note that files related to irqs, +# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to +# non-deterministic coverage. +KCOV_INSTRUMENT := n + CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e75907601a41..8c2f1ef6ca23 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -956,7 +956,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) /* * Note that the LAPIC address is obtained from the MADT (32-bit value) - * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). + * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value). */ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, @@ -984,7 +984,7 @@ static int __init acpi_parse_madt_lapic_entries(void) /* * Note that the LAPIC address is obtained from the MADT (32-bit value) - * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). + * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value). */ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 222a57076039..cefacbad1531 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -221,7 +221,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n, unsigned long cpu = (unsigned long)hcpu; struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); - switch (action & 0xf) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DEAD: dw_apb_clockevent_pause(adev->timer); if (system_state == SYSTEM_RUNNING) { diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 8bb12ddc5db8..8e63ebdcbd0b 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,6 +2,10 @@ # Makefile for local APIC drivers and for the IO-APIC code # +# Leads to non-deterministic coverage that is not a function of syscall inputs. +# In particualr, smp_apic_timer_interrupt() is called in random places. +KCOV_INSTRUMENT := n + obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o vector.o obj-y += hw_nmi.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 531b9611c51d..d356987a04e9 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1611,7 +1611,7 @@ void __init enable_IR_x2apic(void) legacy_pic->mask_all(); mask_ioapic_entries(); - /* If irq_remapping_prepare() succeded, try to enable it */ + /* If irq_remapping_prepare() succeeded, try to enable it */ if (ir_stat >= 0) ir_stat = try_to_enable_IR(); /* ir_stat contains the remap mode or an error code */ diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3b670df4ba7b..ad59d70bcb1a 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -213,6 +213,7 @@ update: */ cpumask_and(d->old_domain, d->old_domain, cpu_online_mask); d->move_in_progress = !cpumask_empty(d->old_domain); + d->cfg.old_vector = d->move_in_progress ? d->cfg.vector : 0; d->cfg.vector = vector; cpumask_copy(d->domain, vector_cpumask); success: @@ -655,46 +656,97 @@ void irq_complete_move(struct irq_cfg *cfg) } /* - * Called with @desc->lock held and interrupts disabled. + * Called from fixup_irqs() with @desc->lock held and interrupts disabled. */ void irq_force_complete_move(struct irq_desc *desc) { struct irq_data *irqdata = irq_desc_get_irq_data(desc); struct apic_chip_data *data = apic_chip_data(irqdata); struct irq_cfg *cfg = data ? &data->cfg : NULL; + unsigned int cpu; if (!cfg) return; - __irq_complete_move(cfg, cfg->vector); - /* * This is tricky. If the cleanup of @data->old_domain has not been * done yet, then the following setaffinity call will fail with * -EBUSY. This can leave the interrupt in a stale state. * - * The cleanup cannot make progress because we hold @desc->lock. So in - * case @data->old_domain is not yet cleaned up, we need to drop the - * lock and acquire it again. @desc cannot go away, because the - * hotplug code holds the sparse irq lock. + * All CPUs are stuck in stop machine with interrupts disabled so + * calling __irq_complete_move() would be completely pointless. */ raw_spin_lock(&vector_lock); - /* Clean out all offline cpus (including ourself) first. */ + /* + * Clean out all offline cpus (including the outgoing one) from the + * old_domain mask. + */ cpumask_and(data->old_domain, data->old_domain, cpu_online_mask); - while (!cpumask_empty(data->old_domain)) { + + /* + * If move_in_progress is cleared and the old_domain mask is empty, + * then there is nothing to cleanup. fixup_irqs() will take care of + * the stale vectors on the outgoing cpu. + */ + if (!data->move_in_progress && cpumask_empty(data->old_domain)) { raw_spin_unlock(&vector_lock); - raw_spin_unlock(&desc->lock); - cpu_relax(); - raw_spin_lock(&desc->lock); + return; + } + + /* + * 1) The interrupt is in move_in_progress state. That means that we + * have not seen an interrupt since the io_apic was reprogrammed to + * the new vector. + * + * 2) The interrupt has fired on the new vector, but the cleanup IPIs + * have not been processed yet. + */ + if (data->move_in_progress) { /* - * Reevaluate apic_chip_data. It might have been cleared after - * we dropped @desc->lock. + * In theory there is a race: + * + * set_ioapic(new_vector) <-- Interrupt is raised before update + * is effective, i.e. it's raised on + * the old vector. + * + * So if the target cpu cannot handle that interrupt before + * the old vector is cleaned up, we get a spurious interrupt + * and in the worst case the ioapic irq line becomes stale. + * + * But in case of cpu hotplug this should be a non issue + * because if the affinity update happens right before all + * cpus rendevouz in stop machine, there is no way that the + * interrupt can be blocked on the target cpu because all cpus + * loops first with interrupts enabled in stop machine, so the + * old vector is not yet cleaned up when the interrupt fires. + * + * So the only way to run into this issue is if the delivery + * of the interrupt on the apic/system bus would be delayed + * beyond the point where the target cpu disables interrupts + * in stop machine. I doubt that it can happen, but at least + * there is a theroretical chance. Virtualization might be + * able to expose this, but AFAICT the IOAPIC emulation is not + * as stupid as the real hardware. + * + * Anyway, there is nothing we can do about that at this point + * w/o refactoring the whole fixup_irq() business completely. + * We print at least the irq number and the old vector number, + * so we have the necessary information when a problem in that + * area arises. */ - data = apic_chip_data(irqdata); - if (!data) - return; - raw_spin_lock(&vector_lock); + pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n", + irqdata->irq, cfg->old_vector); } + /* + * If old_domain is not empty, then other cpus still have the irq + * descriptor set in their vector array. Clean it up. + */ + for_each_cpu(cpu, data->old_domain) + per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED; + + /* Cleanup the left overs of the (half finished) move */ + cpumask_clear(data->old_domain); + data->move_in_progress = 0; raw_spin_unlock(&vector_lock); } #endif diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 624db00583f4..8f4942e2bcbb 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -792,7 +792,8 @@ static int uv_scir_cpu_notify(struct notifier_block *self, unsigned long action, { long cpu = (long)hcpu; - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: case CPU_ONLINE: uv_heartbeat_enable(cpu); break; @@ -860,7 +861,7 @@ int uv_set_vga_state(struct pci_dev *pdev, bool decode, */ void uv_cpu_init(void) { - /* CPU 0 initilization will be done via uv_system_init. */ + /* CPU 0 initialization will be done via uv_system_init. */ if (!uv_blade_info) return; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 052c9c3026cc..9307f182fe30 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1088,7 +1088,7 @@ static int apm_get_battery_status(u_short which, u_short *status, * @device: identity of device * @enable: on/off * - * Activate or deactive power management on either a specific device + * Activate or deactivate power management on either a specific device * or the entire system (%APM_DEVICE_ALL). */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 0d373d7affc8..4a8697f7d4ef 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -8,6 +8,10 @@ CFLAGS_REMOVE_common.o = -pg CFLAGS_REMOVE_perf_event.o = -pg endif +# If these files are instrumented, boot hangs during the first second. +KCOV_INSTRUMENT_common.o := n +KCOV_INSTRUMENT_perf_event.o := n + # Make sure load_percpu_segment has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_common.o := $(nostackp) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 5026a13356c4..6e47e3a916f1 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -85,7 +85,7 @@ static void init_amd_k5(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_32 /* * General Systems BIOSen alias the cpu frequency registers - * of the Elan at 0x000df000. Unfortuantly, one of the Linux + * of the Elan at 0x000df000. Unfortunately, one of the Linux * drivers subsequently pokes it, and changes the CPU speed. * Workaround : Remove the unneeded alias. */ @@ -309,7 +309,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - nodes_per_socket = ((ecx >> 8) & 7) + 1; node_id = ecx & 7; /* get compute unit information */ @@ -320,7 +319,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - nodes_per_socket = ((value >> 3) & 7) + 1; node_id = value & 7; } else return; @@ -522,6 +520,18 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_MWAITX)) use_mwaitx_delay(); + + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + u32 ecx; + + ecx = cpuid_ecx(0x8000001e); + nodes_per_socket = ((ecx >> 8) & 7) + 1; + } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { + u64 value; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + nodes_per_socket = ((value >> 3) & 7) + 1; + } } static void early_init_amd(struct cpuinfo_x86 *c) @@ -539,6 +549,10 @@ static void early_init_amd(struct cpuinfo_x86 *c) set_sched_clock_stable(); } + /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ + if (c->x86_power & BIT(12)) + set_cpu_cap(c, X86_FEATURE_ACC_POWER); + #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSCALL32); #else diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 06ad72383b4e..8394b3d1f94f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -692,7 +692,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); c->x86_capability[CPUID_F_1_EDX] = edx; - if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) { + if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) || + ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) || + (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) { c->x86_cache_max_rmid = ecx; c->x86_cache_occ_scale = ebx; } @@ -968,7 +970,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_identify) this_cpu->c_identify(c); - /* Clear/Set all flags overriden by options, after probe */ + /* Clear/Set all flags overridden by options, after probe */ for (i = 0; i < NCAPINTS; i++) { c->x86_capability[i] &= ~cpu_caps_cleared[i]; c->x86_capability[i] |= cpu_caps_set[i]; @@ -1028,7 +1030,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_pku(c); /* - * Clear/Set all flags overriden by options, need do it + * Clear/Set all flags overridden by options, need do it * before following smp all cpus cap AND. */ for (i = 0; i < NCAPINTS; i++) { diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index fcbcb2f678ca..19f57360dfd2 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -42,7 +42,7 @@ EXPORT_SYMBOL_GPL(mtrr_state); * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD * Opteron Processors" (26094 Rev. 3.30 February 2006), section * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set - * to 1 during BIOS initalization of the fixed MTRRs, then cleared to + * to 1 during BIOS initialization of the fixed MTRRs, then cleared to * 0 for operation." */ static inline void k8_check_syscfg_dram_mod_en(void) diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 21bf92490a7b..8a121991e5ba 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -287,7 +287,7 @@ static __init void early_pci_serial_init(char *s) } /* - * Lastly, initalize the hardware + * Lastly, initialize the hardware */ if (*s) { if (strcmp(s, "nocfg") == 0) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 0bc3490420c5..8bd1c003942a 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -8,7 +8,7 @@ /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, * as the "regset->n" for the xstate regset will be updated based on the feature - * capabilites supported by the xsave. + * capabilities supported by the xsave. */ int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 702547ce33c9..d036cfb4495d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1,5 +1,5 @@ /* - * Code for replacing ftrace calls with jumps. + * Dynamic function tracing support. * * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> * diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index be0ebbb6d1d1..a1f0e4a5c47e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -717,7 +717,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n, struct hpet_work_struct work; struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu); - switch (action & 0xf) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); init_completion(&work.complete); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 37dae792dbbe..589b3193f102 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -96,9 +96,14 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) SYSCALL_DEFINE1(iopl, unsigned int, level) { struct pt_regs *regs = current_pt_regs(); - unsigned int old = (regs->flags >> 12) & 3; struct thread_struct *t = ¤t->thread; + /* + * Careful: the IOPL bits in regs->flags are undefined under Xen PV + * and changing them has no effect. + */ + unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT; + if (level > 3) return -EINVAL; /* Trying to gain more privileges? */ @@ -106,8 +111,9 @@ SYSCALL_DEFINE1(iopl, unsigned int, level) if (!capable(CAP_SYS_RAWIO)) return -EPERM; } - regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); - t->iopl = level << 12; + regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | + (level << X86_EFLAGS_IOPL_BIT); + t->iopl = level << X86_EFLAGS_IOPL_BIT; set_iopl_mask(t->iopl); return 0; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 0f8a6bbaaa44..2af478e3fd4e 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -271,7 +271,7 @@ static int bzImage64_probe(const char *buf, unsigned long len) int ret = -ENOEXEC; struct setup_header *header; - /* kernel should be atleast two sectors long */ + /* kernel should be at least two sectors long */ if (len < 2 * 512) { pr_err("File is too short to be a bzImage\n"); return ret; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index ed15cd486d06..2da6ee9ae69b 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -609,9 +609,9 @@ static struct notifier_block kgdb_notifier = { }; /** - * kgdb_arch_init - Perform any architecture specific initalization. + * kgdb_arch_init - Perform any architecture specific initialization. * - * This function will handle the initalization of any architecture + * This function will handle the initialization of any architecture * specific callbacks. */ int kgdb_arch_init(void) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 47190bd399e7..807950860fb7 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -36,6 +36,7 @@ #include <linux/kprobes.h> #include <linux/debugfs.h> #include <linux/nmi.h> +#include <linux/swait.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -91,14 +92,14 @@ static void kvm_io_delay(void) struct kvm_task_sleep_node { struct hlist_node link; - wait_queue_head_t wq; + struct swait_queue_head wq; u32 token; int cpu; bool halted; }; static struct kvm_task_sleep_head { - spinlock_t lock; + raw_spinlock_t lock; struct hlist_head list; } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; @@ -122,17 +123,17 @@ void kvm_async_pf_task_wait(u32 token) u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; struct kvm_task_sleep_node n, *e; - DEFINE_WAIT(wait); + DECLARE_SWAITQUEUE(wait); rcu_irq_enter(); - spin_lock(&b->lock); + raw_spin_lock(&b->lock); e = _find_apf_task(b, token); if (e) { /* dummy entry exist -> wake up was delivered ahead of PF */ hlist_del(&e->link); kfree(e); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); rcu_irq_exit(); return; @@ -141,13 +142,13 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); n.halted = is_idle_task(current) || preempt_count() > 1; - init_waitqueue_head(&n.wq); + init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); for (;;) { if (!n.halted) - prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; @@ -166,7 +167,7 @@ void kvm_async_pf_task_wait(u32 token) } } if (!n.halted) - finish_wait(&n.wq, &wait); + finish_swait(&n.wq, &wait); rcu_irq_exit(); return; @@ -178,8 +179,8 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) hlist_del_init(&n->link); if (n->halted) smp_send_reschedule(n->cpu); - else if (waitqueue_active(&n->wq)) - wake_up(&n->wq); + else if (swait_active(&n->wq)) + swake_up(&n->wq); } static void apf_task_wake_all(void) @@ -189,14 +190,14 @@ static void apf_task_wake_all(void) for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { struct hlist_node *p, *next; struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; - spin_lock(&b->lock); + raw_spin_lock(&b->lock); hlist_for_each_safe(p, next, &b->list) { struct kvm_task_sleep_node *n = hlist_entry(p, typeof(*n), link); if (n->cpu == smp_processor_id()) apf_task_wake_one(n); } - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); } } @@ -212,7 +213,7 @@ void kvm_async_pf_task_wake(u32 token) } again: - spin_lock(&b->lock); + raw_spin_lock(&b->lock); n = _find_apf_task(b, token); if (!n) { /* @@ -225,17 +226,17 @@ again: * Allocation failed! Busy wait while other cpu * handles async PF. */ - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); cpu_relax(); goto again; } n->token = token; n->cpu = smp_processor_id(); - init_waitqueue_head(&n->wq); + init_swait_queue_head(&n->wq); hlist_add_head(&n->link, &b->list); } else apf_task_wake_one(n); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); return; } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); @@ -486,7 +487,7 @@ void __init kvm_guest_init(void) paravirt_ops_setup(); register_reboot_notifier(&kvm_pv_reboot_nb); for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) - spin_lock_init(&async_pf_sleepers[i].lock); + raw_spin_lock_init(&async_pf_sleepers[i].lock); if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) x86_init.irqs.trap_init = kvm_apf_trap_init; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 72cef58693c7..1d39bfbd26bb 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -226,7 +226,7 @@ static void kvm_setup_secondary_clock(void) * registered memory location. If the guest happens to shutdown, this memory * won't be valid. In cases like kexec, in which you install a new kernel, this * means a random memory location will be kept being written. So before any - * kind of shutdown from our side, we unregister the clock by writting anything + * kind of shutdown from our side, we unregister the clock by writing anything * that does not have the 'enable' bit set in the msr */ #ifdef CONFIG_KEXEC_CORE diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 776229e98202..6cbab31ac23a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -48,6 +48,7 @@ #include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> +#include <asm/xen/hypervisor.h> asmlinkage extern void ret_from_fork(void); @@ -413,6 +414,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); +#ifdef CONFIG_XEN + /* + * On Xen PV, IOPL bits in pt_regs->flags have no effect, and + * current_pt_regs()->flags may not match the current task's + * intended IOPL. We need to switch it manually. + */ + if (unlikely(static_cpu_has(X86_FEATURE_XENPV) && + prev->iopl != next->iopl)) + xen_set_iopl_mask(next->iopl); +#endif + if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { /* * AMD CPUs have a misfeature: SYSRET sets the SS selector but @@ -478,7 +490,7 @@ void set_personality_ia32(bool x32) if (current->mm) current->mm->context.ia32_compat = TIF_X32; current->personality &= ~READ_IMPLIES_EXEC; - /* is_compat_task() uses the presence of the x32 + /* in_compat_syscall() uses the presence of the x32 syscall bit flag to determine compat status */ current_thread_info()->status &= ~TS_COMPAT; } else { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 643dbdccf4bc..b2c99f811c3f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -274,11 +274,6 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu) if (test_and_set_bit(pkg, physical_package_map)) goto found; - if (pkg < __max_logical_packages) { - set_bit(pkg, logical_package_map); - physical_to_logical_pkg[pkg] = pkg; - goto found; - } new = find_first_zero_bit(logical_package_map, __max_logical_packages); if (new >= __max_logical_packages) { physical_to_logical_pkg[pkg] = -1; @@ -317,9 +312,27 @@ static void __init smp_init_package_map(void) /* * Today neither Intel nor AMD support heterogenous systems. That * might change in the future.... + * + * While ideally we'd want '* smp_num_siblings' in the below @ncpus + * computation, this won't actually work since some Intel BIOSes + * report inconsistent HT data when they disable HT. + * + * In particular, they reduce the APIC-IDs to only include the cores, + * but leave the CPUID topology to say there are (2) siblings. + * This means we don't know how many threads there will be until + * after the APIC enumeration. + * + * By not including this we'll sometimes over-estimate the number of + * logical packages by the amount of !present siblings, but this is + * still better than MAX_LOCAL_APIC. + * + * We use total_cpus not nr_cpu_ids because nr_cpu_ids can be limited + * on the command line leading to a similar issue as the HT disable + * problem because the hyperthreads are usually enumerated after the + * primary cores. */ - ncpus = boot_cpu_data.x86_max_cores * smp_num_siblings; - __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus); + ncpus = boot_cpu_data.x86_max_cores; + __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus); /* * Possibly larger than what we need as the number of apic ids per diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 56380440d862..c9c4c7ce3eb2 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -881,7 +881,7 @@ void tsc_restore_sched_clock_state(void) local_irq_save(flags); /* - * We're comming out of suspend, there's no concurrency yet; don't + * We're coming out of suspend, there's no concurrency yet; don't * bother being nice about the RCU stuff, just write to both * data fields. */ @@ -1306,11 +1306,15 @@ void __init tsc_init(void) unsigned long calibrate_delay_is_known(void) { int sibling, cpu = smp_processor_id(); + struct cpumask *mask = topology_core_cpumask(cpu); if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) return 0; - sibling = cpumask_any_but(topology_core_cpumask(cpu), cpu); + if (!mask) + return 0; + + sibling = cpumask_any_but(mask, cpu); if (sibling < nr_cpu_ids) return cpu_data(sibling).loops_per_jiffy; return 0; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0029644bf09c..8efb839948e5 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -88,6 +88,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) apic->lapic_timer.timer_mode_mask = 1 << 17; } + best = kvm_find_cpuid_entry(vcpu, 7, 0); + if (best) { + /* Update OSPKE bit */ + if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) { + best->ecx &= ~F(OSPKE); + if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) + best->ecx |= F(OSPKE); + } + } + best = kvm_find_cpuid_entry(vcpu, 0xD, 0); if (!best) { vcpu->arch.guest_supported_xcr0 = 0; @@ -305,7 +315,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; /* cpuid 1.edx */ - const u32 kvm_supported_word0_x86_features = + const u32 kvm_cpuid_1_edx_x86_features = F(FPU) | F(VME) | F(DE) | F(PSE) | F(TSC) | F(MSR) | F(PAE) | F(MCE) | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | @@ -315,7 +325,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | 0 /* HTT, TM, Reserved, PBE */; /* cpuid 0x80000001.edx */ - const u32 kvm_supported_word1_x86_features = + const u32 kvm_cpuid_8000_0001_edx_x86_features = F(FPU) | F(VME) | F(DE) | F(PSE) | F(TSC) | F(MSR) | F(PAE) | F(MCE) | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | @@ -325,7 +335,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ - const u32 kvm_supported_word4_x86_features = + const u32 kvm_cpuid_1_ecx_x86_features = /* NOTE: MONITOR (and MWAIT) are emulated as NOP, * but *not* advertised to guests via CPUID ! */ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | @@ -337,29 +347,32 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | F(F16C) | F(RDRAND); /* cpuid 0x80000001.ecx */ - const u32 kvm_supported_word6_x86_features = + const u32 kvm_cpuid_8000_0001_ecx_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); /* cpuid 0xC0000001.edx */ - const u32 kvm_supported_word5_x86_features = + const u32 kvm_cpuid_C000_0001_edx_x86_features = F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | F(PMM) | F(PMM_EN); /* cpuid 7.0.ebx */ - const u32 kvm_supported_word9_x86_features = + const u32 kvm_cpuid_7_0_ebx_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT); /* cpuid 0xD.1.eax */ - const u32 kvm_supported_word10_x86_features = + const u32 kvm_cpuid_D_1_eax_x86_features = F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; + /* cpuid 7.0.ecx*/ + const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/; + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -376,10 +389,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, (u32)0xd); break; case 1: - entry->edx &= kvm_supported_word0_x86_features; - cpuid_mask(&entry->edx, 0); - entry->ecx &= kvm_supported_word4_x86_features; - cpuid_mask(&entry->ecx, 4); + entry->edx &= kvm_cpuid_1_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_1_EDX); + entry->ecx &= kvm_cpuid_1_ecx_x86_features; + cpuid_mask(&entry->ecx, CPUID_1_ECX); /* we support x2apic emulation even if host does not support * it since we emulate x2apic in software */ entry->ecx |= F(X2APIC); @@ -433,14 +446,20 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* Mask ebx against host capability word 9 */ if (index == 0) { - entry->ebx &= kvm_supported_word9_x86_features; - cpuid_mask(&entry->ebx, 9); + entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_7_0_EBX); // TSC_ADJUST is emulated entry->ebx |= F(TSC_ADJUST); - } else + entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; + cpuid_mask(&entry->ecx, CPUID_7_ECX); + /* PKU is not yet implemented for shadow paging. */ + if (!tdp_enabled) + entry->ecx &= ~F(PKU); + } else { entry->ebx = 0; + entry->ecx = 0; + } entry->eax = 0; - entry->ecx = 0; entry->edx = 0; break; } @@ -514,7 +533,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, do_cpuid_1_ent(&entry[i], function, idx); if (idx == 1) { - entry[i].eax &= kvm_supported_word10_x86_features; + entry[i].eax &= kvm_cpuid_D_1_eax_x86_features; entry[i].ebx = 0; if (entry[i].eax & (F(XSAVES)|F(XSAVEC))) entry[i].ebx = @@ -564,10 +583,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, 0x8000001a); break; case 0x80000001: - entry->edx &= kvm_supported_word1_x86_features; - cpuid_mask(&entry->edx, 1); - entry->ecx &= kvm_supported_word6_x86_features; - cpuid_mask(&entry->ecx, 6); + entry->edx &= kvm_cpuid_8000_0001_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_8000_0001_EDX); + entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features; + cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX); break; case 0x80000007: /* Advanced power management */ /* invariant TSC is CPUID.80000007H:EDX[8] */ @@ -600,8 +619,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, 0xC0000004); break; case 0xC0000001: - entry->edx &= kvm_supported_word5_x86_features; - cpuid_mask(&entry->edx, 5); + entry->edx &= kvm_cpuid_C000_0001_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_C000_0001_EDX); break; case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 66a6581724ad..e17a74b1d852 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -80,6 +80,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); } +static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ecx & bit(X86_FEATURE_PKU)); +} + static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index e1e89ee4af75..762cdf2595f9 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -84,6 +84,11 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); } +static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops->get_pkru(vcpu); +} + static inline void enter_guest_mode(struct kvm_vcpu *vcpu) { vcpu->arch.hflags |= HF_GUEST_MASK; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c512f095cdac..70e95d097ef1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -479,7 +479,7 @@ static bool spte_is_locklessly_modifiable(u64 spte) static bool spte_has_volatile_bits(u64 spte) { /* - * Always atomicly update spte if it can be updated + * Always atomically update spte if it can be updated * out of mmu-lock, it can ensure dirty bit is not lost, * also, it can help us to get a stable is_writable_pte() * to ensure tlb flush is not missed. @@ -550,7 +550,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) /* * For the spte updated out of mmu-lock is safe, since - * we always atomicly update it, see the comments in + * we always atomically update it, see the comments in * spte_has_volatile_bits(). */ if (spte_is_locklessly_modifiable(old_spte) && @@ -632,12 +632,12 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) * kvm_flush_remote_tlbs() IPI to all active vcpus. */ local_irq_disable(); - vcpu->mode = READING_SHADOW_PAGE_TABLES; + /* * Make sure a following spte read is not reordered ahead of the write * to vcpu->mode. */ - smp_mb(); + smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) @@ -647,8 +647,7 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) * reads to sptes. If it does, kvm_commit_zap_page() can see us * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. */ - smp_mb(); - vcpu->mode = OUTSIDE_GUEST_MODE; + smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); local_irq_enable(); } @@ -2390,14 +2389,13 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, return; /* - * wmb: make sure everyone sees our modifications to the page tables - * rmb: make sure we see changes to vcpu->mode - */ - smp_mb(); - - /* - * Wait for all vcpus to exit guest mode and/or lockless shadow - * page table walks. + * We need to make sure everyone sees our modifications to + * the page tables and see changes to vcpu->mode here. The barrier + * in the kvm_flush_remote_tlbs() achieves this. This pairs + * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end. + * + * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit + * guest mode and/or lockless shadow page table walks. */ kvm_flush_remote_tlbs(kvm); @@ -3923,6 +3921,81 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, } } +/* +* PKU is an additional mechanism by which the paging controls access to +* user-mode addresses based on the value in the PKRU register. Protection +* key violations are reported through a bit in the page fault error code. +* Unlike other bits of the error code, the PK bit is not known at the +* call site of e.g. gva_to_gpa; it must be computed directly in +* permission_fault based on two bits of PKRU, on some machine state (CR4, +* CR0, EFER, CPL), and on other bits of the error code and the page tables. +* +* In particular the following conditions come from the error code, the +* page tables and the machine state: +* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1 +* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch) +* - PK is always zero if U=0 in the page tables +* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access. +* +* The PKRU bitmask caches the result of these four conditions. The error +* code (minus the P bit) and the page table's U bit form an index into the +* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed +* with the two bits of the PKRU register corresponding to the protection key. +* For the first three conditions above the bits will be 00, thus masking +* away both AD and WD. For all reads or if the last condition holds, WD +* only will be masked away. +*/ +static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + bool ept) +{ + unsigned bit; + bool wp; + + if (ept) { + mmu->pkru_mask = 0; + return; + } + + /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */ + if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) { + mmu->pkru_mask = 0; + return; + } + + wp = is_write_protection(vcpu); + + for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { + unsigned pfec, pkey_bits; + bool check_pkey, check_write, ff, uf, wf, pte_user; + + pfec = bit << 1; + ff = pfec & PFERR_FETCH_MASK; + uf = pfec & PFERR_USER_MASK; + wf = pfec & PFERR_WRITE_MASK; + + /* PFEC.RSVD is replaced by ACC_USER_MASK. */ + pte_user = pfec & PFERR_RSVD_MASK; + + /* + * Only need to check the access which is not an + * instruction fetch and is to a user page. + */ + check_pkey = (!ff && pte_user); + /* + * write access is controlled by PKRU if it is a + * user access or CR0.WP = 1. + */ + check_write = check_pkey && wf && (uf || wp); + + /* PKRU.AD stops both read and write access. */ + pkey_bits = !!check_pkey; + /* PKRU.WD stops write access. */ + pkey_bits |= (!!check_write) << 1; + + mmu->pkru_mask |= (pkey_bits & 3) << pfec; + } +} + static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { unsigned root_level = mmu->root_level; @@ -3941,6 +4014,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); + update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); MMU_WARN_ON(!is_pae(vcpu)); @@ -3968,6 +4042,7 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); + update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); context->page_fault = paging32_page_fault; @@ -4026,6 +4101,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, context, false); + update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); reset_tdp_shadow_zero_bits_mask(vcpu, context); } @@ -4078,6 +4154,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) context->direct_map = false; update_permission_bitmask(vcpu, context, true); + update_pkru_bitmask(vcpu, context, true); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } @@ -4132,6 +4209,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, g_context, false); + update_pkru_bitmask(vcpu, g_context, false); update_last_nonleaf_level(vcpu, g_context); } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 58fe98a0a526..b70df72e2b33 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -10,10 +10,11 @@ #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) #define PT_WRITABLE_SHIFT 1 +#define PT_USER_SHIFT 2 #define PT_PRESENT_MASK (1ULL << 0) #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) -#define PT_USER_MASK (1ULL << 2) +#define PT_USER_MASK (1ULL << PT_USER_SHIFT) #define PT_PWT_MASK (1ULL << 3) #define PT_PCD_MASK (1ULL << 4) #define PT_ACCESSED_SHIFT 5 @@ -141,11 +142,16 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) } /* - * Will a fault with a given page-fault error code (pfec) cause a permission - * fault with the given access (in ACC_* format)? + * Check if a given access (described through the I/D, W/R and U/S bits of a + * page fault error code pfec) causes a permission fault with the given PTE + * access rights (in ACC_* format). + * + * Return zero if the access does not fault; return the page fault error code + * if the access faults. */ -static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - unsigned pte_access, unsigned pfec) +static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + unsigned pte_access, unsigned pte_pkey, + unsigned pfec) { int cpl = kvm_x86_ops->get_cpl(vcpu); unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); @@ -166,10 +172,32 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); int index = (pfec >> 1) + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + bool fault = (mmu->permissions[index] >> pte_access) & 1; + + WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK)); + pfec |= PFERR_PRESENT_MASK; + + if (unlikely(mmu->pkru_mask)) { + u32 pkru_bits, offset; + + /* + * PKRU defines 32 bits, there are 16 domains and 2 + * attribute bits per domain in pkru. pte_pkey is the + * index of the protection domain, so pte_pkey * 2 is + * is the index of the first bit for the domain. + */ + pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3; + + /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ + offset = pfec - 1 + + ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); - WARN_ON(pfec & PFERR_RSVD_MASK); + pkru_bits &= mmu->pkru_mask >> offset; + pfec |= -pkru_bits & PFERR_PK_MASK; + fault |= (pkru_bits != 0); + } - return (mmu->permissions[index] >> pte_access) & 1; + return -(uint32_t)fault & pfec; } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 11f76436f74f..b431539c3714 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -142,12 +142,17 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, enum kvm_page_track_mode mode) { - struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - int index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + struct kvm_memory_slot *slot; + int index; if (WARN_ON(!page_track_mode_is_valid(mode))) return false; + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (!slot) + return false; + + index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index e159a8185ad9..1d971c7553c3 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -257,6 +257,17 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, return 0; } +static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) +{ + unsigned pkeys = 0; +#if PTTYPE == 64 + pte_t pte = {.pte = gpte}; + + pkeys = pte_flags_pkey(pte_flags(pte)); +#endif + return pkeys; +} + /* * Fetch a guest pte for a guest virtual address */ @@ -268,7 +279,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; - unsigned index, pt_access, pte_access, accessed_dirty; + unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey; gpa_t pte_gpa; int offset; const int write_fault = access & PFERR_WRITE_MASK; @@ -359,10 +370,10 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); - if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) { - errcode |= PFERR_PRESENT_MASK; + pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); + errcode = permission_fault(vcpu, mmu, pte_access, pte_pkey, access); + if (unlikely(errcode)) goto error; - } gfn = gpte_to_gfn_lvl(pte, walker->level); gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; @@ -949,6 +960,12 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 0; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { + /* + * Update spte before increasing tlbs_dirty to make + * sure no tlb flush is lost after spte is zapped; see + * the comments in kvm_flush_remote_tlbs(). + */ + smp_wmb(); vcpu->kvm->tlbs_dirty++; continue; } @@ -964,6 +981,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); + /* + * The same as above where we are doing + * prefetch_invalid_gpte(). + */ + smp_wmb(); vcpu->kvm->tlbs_dirty++; continue; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 95070386d599..31346a3f20a5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1280,6 +1280,11 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static u32 svm_get_pkru(struct kvm_vcpu *vcpu) +{ + return 0; +} + static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { switch (reg) { @@ -4347,6 +4352,9 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + + .get_pkru = svm_get_pkru, + .fpu_activate = svm_fpu_activate, .fpu_deactivate = svm_fpu_deactivate, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 75173efccac5..ee1c8a93871c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -598,6 +598,10 @@ struct vcpu_vmx { struct page *pml_pg; u64 current_tsc_ratio; + + bool guest_pkru_valid; + u32 guest_pkru; + u32 host_pkru; }; enum segment_cache_field { @@ -2107,6 +2111,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) } while (cmpxchg(&pi_desc->control, old.control, new.control) != old.control); } + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -2167,6 +2172,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } vmx_vcpu_pi_load(vcpu, cpu); + vmx->host_pkru = read_pkru(); } static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) @@ -2286,6 +2292,11 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); } +static u32 vmx_get_pkru(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->guest_pkru; +} + static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); @@ -2712,8 +2723,15 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; + /* + * Old versions of KVM use the single-context version without + * checking for support, so declare that it is supported even + * though it is treated as global context. The alternative is + * not failing the single-context invvpid, and it is worse. + */ if (enable_vpid) vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; else vmx->nested.nested_vmx_vpid_caps = 0; @@ -3886,13 +3904,17 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!enable_unrestricted_guest && !is_paging(vcpu)) /* - * SMEP/SMAP is disabled if CPU is in non-paging mode in - * hardware. However KVM always uses paging mode without - * unrestricted guest. - * To emulate this behavior, SMEP/SMAP needs to be manually - * disabled when guest switches to non-paging mode. + * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in + * hardware. To emulate this behavior, SMEP/SMAP/PKU needs + * to be manually disabled when guest switches to non-paging + * mode. + * + * If !enable_unrestricted_guest, the CPU is always running + * with CR0.PG=1 and CR4 needs to be modified. + * If enable_unrestricted_guest, the CPU automatically + * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. */ - hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); @@ -5506,7 +5528,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) return kvm_set_cr4(vcpu, val); } -/* called to set cr0 as approriate for clts instruction exit. */ +/* called to set cr0 as appropriate for clts instruction exit. */ static void handle_clts(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) { @@ -7245,7 +7267,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) /* The value to write might be 32 or 64 bits, depending on L1's long * mode, and eventually we need to write that into a field of several * possible lengths. The code below first zero-extends the value to 64 - * bit (field_value), and then copies only the approriate number of + * bit (field_value), and then copies only the appropriate number of * bits into the vmcs12 field. */ u64 field_value = 0; @@ -7399,6 +7421,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) if (!(types & (1UL << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + skip_emulated_instruction(vcpu); return 1; } @@ -7457,6 +7480,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) if (!(types & (1UL << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + skip_emulated_instruction(vcpu); return 1; } @@ -7473,12 +7497,17 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) } switch (type) { + case VMX_VPID_EXTENT_SINGLE_CONTEXT: + /* + * Old versions of KVM use the single-context version so we + * have to support it; just treat it the same as all-context. + */ case VMX_VPID_EXTENT_ALL_CONTEXT: __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); nested_vmx_succeed(vcpu); break; default: - /* Trap single context invalidation invvpid calls */ + /* Trap individual address invalidation invvpid calls */ BUG_ON(1); break; } @@ -8621,6 +8650,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); + if (vmx->guest_pkru_valid) + __write_pkru(vmx->guest_pkru); + atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); @@ -8761,6 +8793,20 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); /* + * eager fpu is enabled if PKEY is supported and CR4 is switched + * back on host, so it is safe to read guest PKRU from current + * XSAVE. + */ + if (boot_cpu_has(X86_FEATURE_OSPKE)) { + vmx->guest_pkru = __read_pkru(); + if (vmx->guest_pkru != vmx->host_pkru) { + vmx->guest_pkru_valid = true; + __write_pkru(vmx->host_pkru); + } else + vmx->guest_pkru_valid = false; + } + + /* * the KVM_REQ_EVENT optimization bit is only on for one entry, and if * we did not inject a still-pending event to L1 now because of * nested_run_pending, we need to re-enable this bit. @@ -10884,6 +10930,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + + .get_pkru = vmx_get_pkru, + .fpu_activate = vmx_fpu_activate, .fpu_deactivate = vmx_fpu_deactivate, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7236bd3a4c3d..742d0f7d3556 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -723,7 +723,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP | X86_CR4_SMAP; + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; if (cr4 & CR4_RESERVED_BITS) return 1; @@ -740,6 +740,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE)) return 1; + if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE)) + return 1; + if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; @@ -765,7 +768,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); - if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) + if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) kvm_update_cpuid(vcpu); return 0; @@ -1559,7 +1562,7 @@ static cycle_t read_tsc(void) /* * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a funciton of time and the likely is + * predictable (it's just a function of time and the likely is * very likely) and there's a data dependence, so force GCC * to generate a branch instead. I don't barrier() because * we don't actually need a barrier, and if this function @@ -4326,9 +4329,14 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); + /* + * currently PKRU is only applied to ept enabled guest so + * there is no pkey in EPT page table for L1 guest or EPT + * shadow page table for L2 guest. + */ if (vcpu_match_mmio_gva(vcpu, gva) && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.access, access)) { + vcpu->arch.access, 0, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -6588,8 +6596,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - /* We should set ->mode before check ->requests, - * see the comment in make_all_cpus_request. + /* + * We should set ->mode before check ->requests, + * Please see the comment in kvm_make_all_cpus_request. + * This also orders the write to mode from any reads + * to the page tables done while the VCPU is running. + * Please see the comment in kvm_flush_remote_tlbs. */ smp_mb__after_srcu_read_unlock(); @@ -7123,7 +7135,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); - if (sregs->cr4 & X86_CR4_OSXSAVE) + if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE)) kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 007940faa5c6..7ce3634ab5fe 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -183,7 +183,8 @@ bool kvm_vector_hashing_enabled(void); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ - | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512) + | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ + | XFEATURE_MASK_PKRU) extern u64 host_xcr0; extern u64 kvm_supported_xcr0(void); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index a501fa25da41..72a576752a7e 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -2,6 +2,9 @@ # Makefile for x86 specific library files. # +# Produces uninteresting flaky coverage. +KCOV_INSTRUMENT_delay.o := n + inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt quiet_cmd_inat_tables = GEN $@ diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index cbb8ee5830ff..2ec0b0abbfaa 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -1,6 +1,7 @@ /* Copyright 2002 Andi Kleen */ #include <linux/linkage.h> +#include <asm/errno.h> #include <asm/cpufeatures.h> #include <asm/alternative-asm.h> @@ -268,16 +269,16 @@ ENTRY(memcpy_mcsafe) decl %ecx jnz .L_copy_trailing_bytes - /* Copy successful. Return true */ + /* Copy successful. Return zero */ .L_done_memcpy_trap: xorq %rax, %rax ret ENDPROC(memcpy_mcsafe) .section .fixup, "ax" - /* Return false for any failure */ + /* Return -EFAULT for any failure */ .L_memcpy_mcsafe_fail: - mov $1, %rax + mov $-EFAULT, %rax ret .previous diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index c9c81227ea37..e1229ecd2a82 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -9,7 +9,7 @@ /* * ISO C memset - set a memory block to a byte value. This function uses fast * string to get better performance than the original function. The code is - * simpler and shorter than the orignal function as well. + * simpler and shorter than the original function as well. * * rdi destination * rsi value (char) diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 67cf2e1e557b..f98913258c63 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,3 +1,6 @@ +# Kernel does not boot with instrumentation of tlb.c. +KCOV_INSTRUMENT_tlb.o := n + obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o physaddr.o gup.o setup_nx.o diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 9dd7e4b7fcde..82447b3fba38 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,17 +1,10 @@ #include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/sort.h> #include <asm/uaccess.h> typedef bool (*ex_handler_t)(const struct exception_table_entry *, struct pt_regs *, int); static inline unsigned long -ex_insn_addr(const struct exception_table_entry *x) -{ - return (unsigned long)&x->insn + x->insn; -} -static inline unsigned long ex_fixup_addr(const struct exception_table_entry *x) { return (unsigned long)&x->fixup + x->fixup; @@ -110,104 +103,3 @@ int __init early_fixup_exception(unsigned long *ip) *ip = new_ip; return 1; } - -/* - * Search one exception table for an entry corresponding to the - * given instruction address, and return the address of the entry, - * or NULL if none is found. - * We use a binary search, and thus we assume that the table is - * already sorted. - */ -const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long value) -{ - while (first <= last) { - const struct exception_table_entry *mid; - unsigned long addr; - - mid = ((last - first) >> 1) + first; - addr = ex_insn_addr(mid); - if (addr < value) - first = mid + 1; - else if (addr > value) - last = mid - 1; - else - return mid; - } - return NULL; -} - -/* - * The exception table needs to be sorted so that the binary - * search that we use to find entries in it works properly. - * This is used both for the kernel exception table and for - * the exception tables of modules that get loaded. - * - */ -static int cmp_ex(const void *a, const void *b) -{ - const struct exception_table_entry *x = a, *y = b; - - /* - * This value will always end up fittin in an int, because on - * both i386 and x86-64 the kernel symbol-reachable address - * space is < 2 GiB. - * - * This compare is only valid after normalization. - */ - return x->insn - y->insn; -} - -void sort_extable(struct exception_table_entry *start, - struct exception_table_entry *finish) -{ - struct exception_table_entry *p; - int i; - - /* Convert all entries to being relative to the start of the section */ - i = 0; - for (p = start; p < finish; p++) { - p->insn += i; - i += 4; - p->fixup += i; - i += 4; - p->handler += i; - i += 4; - } - - sort(start, finish - start, sizeof(struct exception_table_entry), - cmp_ex, NULL); - - /* Denormalize all entries */ - i = 0; - for (p = start; p < finish; p++) { - p->insn -= i; - i += 4; - p->fixup -= i; - i += 4; - p->handler -= i; - i += 4; - } -} - -#ifdef CONFIG_MODULES -/* - * If the exception table is sorted, any referring to the module init - * will be at the beginning or the end. - */ -void trim_init_extable(struct module *m) -{ - /*trim the beginning*/ - while (m->num_exentries && - within_module_init(ex_insn_addr(&m->extable[0]), m)) { - m->extable++; - m->num_exentries--; - } - /*trim the end*/ - while (m->num_exentries && - within_module_init(ex_insn_addr(&m->extable[m->num_exentries-1]), m)) - m->num_exentries--; -} -#endif /* CONFIG_MODULES */ diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index a0a0b9861902..80476878eb4c 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -728,14 +728,14 @@ static inline unsigned long bd_entry_virt_space(struct mm_struct *mm) /* * This covers 32-bit emulation as well as 32-bit kernels - * running on 64-bit harware. + * running on 64-bit hardware. */ if (!is_64bit_mm(mm)) return (4ULL * GB) / MPX_BD_NR_ENTRIES_32; /* * 'x86_virt_bits' returns what the hardware is capable - * of, and returns the full >32-bit adddress space when + * of, and returns the full >32-bit address space when * running 32-bit kernels on 64-bit hardware. */ virt_space = (1ULL << boot_cpu_data.x86_virt_bits); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 04e2e7144bee..faec01e7a17d 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -149,7 +149,7 @@ enum { PAT_WT = 4, /* Write Through */ PAT_WP = 5, /* Write Protected */ PAT_WB = 6, /* Write Back (default) */ - PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ + PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ }; #define CM(c) (_PAGE_CACHE_MODE_ ## c) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 1d2e6392f5fa..0e07e0968c3a 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -437,7 +437,8 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, void *data) { int cpu = (unsigned long)data; - switch (action) { + + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_FAILED: case CPU_ONLINE: smp_call_function_single(cpu, nmi_cpu_up, NULL, 0); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c b/arch/x86/platform/intel-mid/device_libs/platform_bma023.c index 0ae7f2ae2296..c26cf393d35a 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bma023.c @@ -1,5 +1,5 @@ /* - * platform_bma023.c: bma023 platform data initilization file + * platform_bma023.c: bma023 platform data initialization file * * (C) Copyright 2013 Intel Corporation * diff --git a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c index 69a783689d21..c259fb6c8f4f 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c @@ -1,5 +1,5 @@ /* - * platform_emc1403.c: emc1403 platform data initilization file + * platform_emc1403.c: emc1403 platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c index dccae6b0413f..52534ec29765 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c @@ -1,5 +1,5 @@ /* - * platform_gpio_keys.c: gpio_keys platform data initilization file + * platform_gpio_keys.c: gpio_keys platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> diff --git a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c index 54226de7541a..a35cf912de43 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c @@ -1,5 +1,5 @@ /* - * platform_lis331.c: lis331 platform data initilization file + * platform_lis331.c: lis331 platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c index 2c8acbc1e9ad..6e075afa7877 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c @@ -1,5 +1,5 @@ /* - * platform_max7315.c: max7315 platform data initilization file + * platform_max7315.c: max7315 platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c index cfe9a47a1e87..ee22864bbc2f 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c @@ -1,5 +1,5 @@ /* - * platform_mpu3050.c: mpu3050 platform data initilization file + * platform_mpu3050.c: mpu3050 platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic.c b/arch/x86/platform/intel-mid/device_libs/platform_msic.c index 9f4a775a69d6..e421106c11cf 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic.c @@ -1,5 +1,5 @@ /* - * platform_msic.c: MSIC platform data initilization file + * platform_msic.c: MSIC platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c index 29629397d2b3..cb3490ecb341 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c @@ -1,5 +1,5 @@ /* - * platform_msic_audio.c: MSIC audio platform data initilization file + * platform_msic_audio.c: MSIC audio platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c index f446c33df1a8..4f72193939a6 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c @@ -1,5 +1,5 @@ /* - * platform_msic_battery.c: MSIC battery platform data initilization file + * platform_msic_battery.c: MSIC battery platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c index 2a4f7b1dd917..70de5b531ba0 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c @@ -1,5 +1,5 @@ /* - * platform_msic_gpio.c: MSIC GPIO platform data initilization file + * platform_msic_gpio.c: MSIC GPIO platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c index 6497111ddb54..3d7c2011b6cf 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c @@ -1,5 +1,5 @@ /* - * platform_msic_ocd.c: MSIC OCD platform data initilization file + * platform_msic_ocd.c: MSIC OCD platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c index 83a3459bc337..038f618fbc52 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c @@ -1,5 +1,5 @@ /* - * platform_msic_power_btn.c: MSIC power btn platform data initilization file + * platform_msic_power_btn.c: MSIC power btn platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c index a351878b96bc..114a5755b1e4 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c @@ -1,5 +1,5 @@ /* - * platform_msic_thermal.c: msic_thermal platform data initilization file + * platform_msic_thermal.c: msic_thermal platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c index 65c2a9a19db4..e30cb62e3300 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c @@ -1,5 +1,5 @@ /* - * platform_pmic_gpio.c: PMIC GPIO platform data initilization file + * platform_pmic_gpio.c: PMIC GPIO platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c index 740fc757050c..b1526b95fd43 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c @@ -1,5 +1,5 @@ /* - * platform_tc35876x.c: tc35876x platform data initilization file + * platform_tc35876x.c: tc35876x platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c index 33be0b3be6e1..4f41372ce400 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c @@ -1,5 +1,5 @@ /* - * platform_tca6416.c: tca6416 platform data initilization file + * platform_tca6416.c: tca6416 platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> diff --git a/arch/x86/purgatory/stack.S b/arch/x86/purgatory/stack.S index 3cefba1fefc8..50a4147f91fb 100644 --- a/arch/x86/purgatory/stack.S +++ b/arch/x86/purgatory/stack.S @@ -8,7 +8,7 @@ */ /* A stack for the loaded kernel. - * Seperate and in the data section so it can be prepopulated. + * Separate and in the data section so it can be prepopulated. */ .data .balign 4096 diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 053abe7b0ef7..b95964610ea7 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -9,6 +9,9 @@ KASAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + always := realmode.bin realmode.relocs wakeup-objs := wakeup_asm.o wakemain.o video-mode.o diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c index d5644bbe8cba..9fd24846d094 100644 --- a/arch/x86/video/fbdev.c +++ b/arch/x86/video/fbdev.c @@ -14,26 +14,24 @@ int fb_is_primary_device(struct fb_info *info) { struct device *device = info->device; - struct pci_dev *pci_dev = NULL; struct pci_dev *default_device = vga_default_device(); - struct resource *res = NULL; + struct pci_dev *pci_dev; + struct resource *res; - if (device) - pci_dev = to_pci_dev(device); - - if (!pci_dev) + if (!device || !dev_is_pci(device)) return 0; + pci_dev = to_pci_dev(device); + if (default_device) { if (pci_dev == default_device) return 1; - else - return 0; + return 0; } - res = &pci_dev->resource[PCI_ROM_RESOURCE]; + res = pci_dev->resource + PCI_ROM_RESOURCE; - if (res && res->flags & IORESOURCE_ROM_SHADOW) + if (res->flags & IORESOURCE_ROM_SHADOW) return 1; return 0; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2379a5a88504..880862c7d9dd 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -962,7 +962,7 @@ static void xen_load_sp0(struct tss_struct *tss, tss->x86_tss.sp0 = thread->sp0; } -static void xen_set_iopl_mask(unsigned mask) +void xen_set_iopl_mask(unsigned mask) { struct physdev_set_iopl set_iopl; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c913ca4f6958..478a2de543a5 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1256,7 +1256,7 @@ static void __init xen_pagetable_cleanhighmap(void) xen_cleanhighmap(addr, addr + size); xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); #ifdef DEBUG - /* This is superflous and is not neccessary, but you know what + /* This is superfluous and is not necessary, but you know what * lets do it. The MODULES_VADDR -> MODULES_END should be clear of * anything at this stage. */ xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); @@ -1474,7 +1474,7 @@ static void xen_write_cr3(unsigned long cr3) /* * At the start of the day - when Xen launches a guest, it has already * built pagetables for the guest. We diligently look over them - * in xen_setup_kernel_pagetable and graft as appropiate them in the + * in xen_setup_kernel_pagetable and graft as appropriate them in the * init_level4_pgt and its friends. Then when we are happy we load * the new init_level4_pgt - and continue on. * @@ -2792,7 +2792,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, struct remap_data *rmd = data; pte_t pte = pte_mkspecial(mfn_pte(*rmd->mfn, rmd->prot)); - /* If we have a contigious range, just update the mfn itself, + /* If we have a contiguous range, just update the mfn itself, else update pointer to be "next mfn". */ if (rmd->contiguous) (*rmd->mfn)++; @@ -2833,7 +2833,7 @@ static int do_remap_gfn(struct vm_area_struct *vma, rmd.mfn = gfn; rmd.prot = prot; - /* We use the err_ptr to indicate if there we are doing a contigious + /* We use the err_ptr to indicate if there we are doing a contiguous * mapping or a discontigious mapping. */ rmd.contiguous = !err_ptr; diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index de93b20fa0d2..7f8d8abf4c1a 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -26,7 +26,7 @@ (1 << XENFEAT_auto_translated_physmap) | \ (1 << XENFEAT_supervisor_mode_kernel) | \ (1 << XENFEAT_hvm_callback_vector)) -/* The XENFEAT_writable_page_tables is not stricly neccessary as we set that +/* The XENFEAT_writable_page_tables is not stricly necessary as we set that * up regardless whether this CONFIG option is enabled or not, but it * clarifies what the right flags need to be. */ |