summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig9
-rw-r--r--arch/x86/Makefile8
-rw-r--r--arch/x86/boot/compressed/kaslr.c2
-rw-r--r--arch/x86/boot/compressed/misc.h4
-rw-r--r--arch/x86/boot/string.c3
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl4
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl4
-rw-r--r--arch/x86/events/core.c13
-rw-r--r--arch/x86/events/intel/core.c154
-rw-r--r--arch/x86/events/intel/uncore.c1
-rw-r--r--arch/x86/events/intel/uncore.h12
-rw-r--r--arch/x86/events/intel/uncore_snb.c4
-rw-r--r--arch/x86/events/perf_event.h17
-rw-r--r--arch/x86/hyperv/hv_init.c14
-rw-r--r--arch/x86/include/asm/bitops.h6
-rw-r--r--arch/x86/include/asm/cpu_device_id.h31
-rw-r--r--arch/x86/include/asm/cpufeature.h5
-rw-r--r--arch/x86/include/asm/cpufeatures.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h52
-rw-r--r--arch/x86/include/asm/kvm_vcpu_regs.h25
-rw-r--r--arch/x86/include/asm/mce.h7
-rw-r--r--arch/x86/include/asm/msr-index.h6
-rw-r--r--arch/x86/include/asm/page_64_types.h4
-rw-r--r--arch/x86/include/asm/processor-cyrix.h21
-rw-r--r--arch/x86/include/asm/realmode.h6
-rw-r--r--arch/x86/include/asm/string_32.h104
-rw-r--r--arch/x86/include/asm/string_64.h15
-rw-r--r--arch/x86/include/asm/unwind.h6
-rw-r--r--arch/x86/include/asm/xen/hypercall.h13
-rw-r--r--arch/x86/include/uapi/asm/Kbuild3
-rw-r--r--arch/x86/kernel/acpi/boot.c3
-rw-r--r--arch/x86/kernel/aperture_64.c20
-rw-r--r--arch/x86/kernel/apic/io_apic.c5
-rw-r--r--arch/x86/kernel/cpu/cyrix.c14
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c62
-rw-r--r--arch/x86/kernel/cpu/mce/apei.c10
-rw-r--r--arch/x86/kernel/cpu/mce/core.c30
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c5
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c2
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c12
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h16
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c3
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c185
-rw-r--r--arch/x86/kernel/e820.c10
-rw-r--r--arch/x86/kernel/ftrace.c42
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/hw_breakpoint.c1
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c14
-rw-r--r--arch/x86/kernel/kvmclock.c20
-rw-r--r--arch/x86/kernel/mpparse.c4
-rw-r--r--arch/x86/kernel/setup_percpu.c10
-rw-r--r--arch/x86/kernel/unwind_frame.c25
-rw-r--r--arch/x86/kernel/unwind_orc.c17
-rw-r--r--arch/x86/kvm/cpuid.c2
-rw-r--r--arch/x86/kvm/hyperv.c11
-rw-r--r--arch/x86/kvm/i8254.c2
-rw-r--r--arch/x86/kvm/i8259.c2
-rw-r--r--arch/x86/kvm/ioapic.c2
-rw-r--r--arch/x86/kvm/lapic.c7
-rw-r--r--arch/x86/kvm/mmu.c516
-rw-r--r--arch/x86/kvm/mmu.h1
-rw-r--r--arch/x86/kvm/mmutrace.h44
-rw-r--r--arch/x86/kvm/page_track.c2
-rw-r--r--arch/x86/kvm/svm.c152
-rw-r--r--arch/x86/kvm/vmx/nested.c134
-rw-r--r--arch/x86/kvm/vmx/vmcs.h1
-rw-r--r--arch/x86/kvm/vmx/vmenter.S167
-rw-r--r--arch/x86/kvm/vmx/vmx.c207
-rw-r--r--arch/x86/kvm/vmx/vmx.h21
-rw-r--r--arch/x86/kvm/x86.c91
-rw-r--r--arch/x86/kvm/x86.h7
-rw-r--r--arch/x86/lib/csum-partial_64.c2
-rw-r--r--arch/x86/mm/kasan_init_64.c14
-rw-r--r--arch/x86/mm/mmap.c2
-rw-r--r--arch/x86/mm/numa.c12
-rw-r--r--arch/x86/mm/pageattr.c4
-rw-r--r--arch/x86/mm/pti.c4
-rw-r--r--arch/x86/pci/fixup.c16
-rw-r--r--arch/x86/platform/efi/quirks.c2
-rw-r--r--arch/x86/platform/olpc/olpc_dt.c3
-rw-r--r--arch/x86/realmode/init.c11
-rw-r--r--arch/x86/realmode/rm/Makefile3
-rw-r--r--arch/x86/xen/mmu_pv.c13
-rw-r--r--arch/x86/xen/p2m.c11
-rw-r--r--arch/x86/xen/setup.c13
85 files changed, 1434 insertions, 1111 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 90b562a34d65..5ad92419be19 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -14,7 +14,6 @@ config X86_32
select ARCH_WANT_IPC_PARSE_VERSION
select CLKSRC_I8253
select CLONE_BACKWARDS
- select HAVE_GENERIC_DMA_COHERENT
select MODULES_USE_ELF_REL
select OLD_SIGACTION
@@ -2218,14 +2217,8 @@ config RANDOMIZE_MEMORY_PHYSICAL_PADDING
If unsure, leave at the default value.
config HOTPLUG_CPU
- bool "Support for hot-pluggable CPUs"
+ def_bool y
depends on SMP
- ---help---
- Say Y here to allow turning CPUs off and on. CPUs can be
- controlled through /sys/devices/system/cpu.
- ( Note: power management support will enable this option
- automatically on SMP systems. )
- Say N if you want to disable CPU hotplug.
config BOOTPARAM_HOTPLUG_CPU0
bool "Set default setting of cpu0_hotpluggable"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 2d8b9d8ca4f8..a587805c6687 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -219,8 +219,12 @@ ifdef CONFIG_RETPOLINE
# Additionally, avoid generating expensive indirect jumps which
# are subject to retpolines for small number of switch cases.
# clang turns off jump table generation by default when under
- # retpoline builds, however, gcc does not for x86.
- KBUILD_CFLAGS += $(call cc-option,--param=case-values-threshold=20)
+ # retpoline builds, however, gcc does not for x86. This has
+ # only been fixed starting from gcc stable version 8.4.0 and
+ # onwards, but not for older ones. See gcc bug #86952.
+ ifndef CONFIG_CC_IS_CLANG
+ KBUILD_CFLAGS += $(call cc-option,-fno-jump-tables)
+ endif
endif
archscripts: scripts_basic
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index fa0332dda9f2..2e53c056ba20 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -697,8 +697,8 @@ static bool process_mem_region(struct mem_vector *region,
return 1;
}
}
- return 0;
#endif
+ return 0;
}
#ifdef CONFIG_EFI
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index fd13655e0f9b..d2f184165934 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -120,8 +120,6 @@ static inline void console_init(void)
void set_sev_encryption_mask(void);
-#endif
-
/* acpi.c */
#ifdef CONFIG_ACPI
acpi_physical_address get_rsdp_addr(void);
@@ -135,3 +133,5 @@ int count_immovable_mem_regions(void);
#else
static inline int count_immovable_mem_regions(void) { return 0; }
#endif
+
+#endif /* BOOT_COMPRESSED_MISC_H */
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 315a67b8896b..90154df8f125 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -13,8 +13,9 @@
*/
#include <linux/types.h>
-#include <linux/kernel.h>
+#include <linux/compiler.h>
#include <linux/errno.h>
+#include <linux/limits.h>
#include <asm/asm.h>
#include "ctype.h"
#include "string.h"
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 955ab6a3b61f..1f9607ed087c 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -429,3 +429,7 @@
421 i386 rt_sigtimedwait_time64 sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait_time64
422 i386 futex_time64 sys_futex __ia32_sys_futex
423 i386 sched_rr_get_interval_time64 sys_sched_rr_get_interval __ia32_sys_sched_rr_get_interval
+424 i386 pidfd_send_signal sys_pidfd_send_signal __ia32_sys_pidfd_send_signal
+425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup
+426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter
+427 i386 io_uring_register sys_io_uring_register __ia32_sys_io_uring_register
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 2ae92fddb6d5..92ee0b4378d4 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -345,6 +345,10 @@
334 common rseq __x64_sys_rseq
# don't use numbers 387 through 423, add new calls after the last
# 'common' entry
+424 common pidfd_send_signal __x64_sys_pidfd_send_signal
+425 common io_uring_setup __x64_sys_io_uring_setup
+426 common io_uring_enter __x64_sys_io_uring_enter
+427 common io_uring_register __x64_sys_io_uring_register
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index b684f0294f35..e2b1447192a8 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1995,7 +1995,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
*/
static void free_fake_cpuc(struct cpu_hw_events *cpuc)
{
- kfree(cpuc->shared_regs);
+ intel_cpuc_finish(cpuc);
kfree(cpuc);
}
@@ -2007,14 +2007,11 @@ static struct cpu_hw_events *allocate_fake_cpuc(void)
cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
if (!cpuc)
return ERR_PTR(-ENOMEM);
-
- /* only needed, if we have extra_regs */
- if (x86_pmu.extra_regs) {
- cpuc->shared_regs = allocate_shared_regs(cpu);
- if (!cpuc->shared_regs)
- goto error;
- }
cpuc->is_fake = 1;
+
+ if (intel_cpuc_prepare(cpuc, cpu))
+ goto error;
+
return cpuc;
error:
free_fake_cpuc(cpuc);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 7ec265bacb6a..8baa441d8000 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2000,6 +2000,39 @@ static void intel_pmu_nhm_enable_all(int added)
intel_pmu_enable_all(added);
}
+static void intel_set_tfa(struct cpu_hw_events *cpuc, bool on)
+{
+ u64 val = on ? MSR_TFA_RTM_FORCE_ABORT : 0;
+
+ if (cpuc->tfa_shadow != val) {
+ cpuc->tfa_shadow = val;
+ wrmsrl(MSR_TSX_FORCE_ABORT, val);
+ }
+}
+
+static void intel_tfa_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
+{
+ /*
+ * We're going to use PMC3, make sure TFA is set before we touch it.
+ */
+ if (cntr == 3 && !cpuc->is_fake)
+ intel_set_tfa(cpuc, true);
+}
+
+static void intel_tfa_pmu_enable_all(int added)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ /*
+ * If we find PMC3 is no longer used when we enable the PMU, we can
+ * clear TFA.
+ */
+ if (!test_bit(3, cpuc->active_mask))
+ intel_set_tfa(cpuc, false);
+
+ intel_pmu_enable_all(added);
+}
+
static void enable_counter_freeze(void)
{
update_debugctlmsr(get_debugctlmsr() |
@@ -2770,6 +2803,35 @@ intel_stop_scheduling(struct cpu_hw_events *cpuc)
}
static struct event_constraint *
+dyn_constraint(struct cpu_hw_events *cpuc, struct event_constraint *c, int idx)
+{
+ WARN_ON_ONCE(!cpuc->constraint_list);
+
+ if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
+ struct event_constraint *cx;
+
+ /*
+ * grab pre-allocated constraint entry
+ */
+ cx = &cpuc->constraint_list[idx];
+
+ /*
+ * initialize dynamic constraint
+ * with static constraint
+ */
+ *cx = *c;
+
+ /*
+ * mark constraint as dynamic
+ */
+ cx->flags |= PERF_X86_EVENT_DYNAMIC;
+ c = cx;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
int idx, struct event_constraint *c)
{
@@ -2799,27 +2861,7 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
* only needed when constraint has not yet
* been cloned (marked dynamic)
*/
- if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
- struct event_constraint *cx;
-
- /*
- * grab pre-allocated constraint entry
- */
- cx = &cpuc->constraint_list[idx];
-
- /*
- * initialize dynamic constraint
- * with static constraint
- */
- *cx = *c;
-
- /*
- * mark constraint as dynamic, so we
- * can free it later on
- */
- cx->flags |= PERF_X86_EVENT_DYNAMIC;
- c = cx;
- }
+ c = dyn_constraint(cpuc, c, idx);
/*
* From here on, the constraint is dynamic.
@@ -3357,6 +3399,26 @@ glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
return c;
}
+static bool allow_tsx_force_abort = true;
+
+static struct event_constraint *
+tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c = hsw_get_event_constraints(cpuc, idx, event);
+
+ /*
+ * Without TFA we must not use PMC3.
+ */
+ if (!allow_tsx_force_abort && test_bit(3, c->idxmsk) && idx >= 0) {
+ c = dyn_constraint(cpuc, c, idx);
+ c->idxmsk64 &= ~(1ULL << 3);
+ c->weight--;
+ }
+
+ return c;
+}
+
/*
* Broadwell:
*
@@ -3410,7 +3472,7 @@ ssize_t intel_event_sysfs_show(char *page, u64 config)
return x86_event_sysfs_show(page, config, event);
}
-struct intel_shared_regs *allocate_shared_regs(int cpu)
+static struct intel_shared_regs *allocate_shared_regs(int cpu)
{
struct intel_shared_regs *regs;
int i;
@@ -3442,23 +3504,24 @@ static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
return c;
}
-static int intel_pmu_cpu_prepare(int cpu)
-{
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
+{
if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
cpuc->shared_regs = allocate_shared_regs(cpu);
if (!cpuc->shared_regs)
goto err;
}
- if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+ if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA)) {
size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
- cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
+ cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
if (!cpuc->constraint_list)
goto err_shared_regs;
+ }
+ if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
if (!cpuc->excl_cntrs)
goto err_constraint_list;
@@ -3480,6 +3543,11 @@ err:
return -ENOMEM;
}
+static int intel_pmu_cpu_prepare(int cpu)
+{
+ return intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu);
+}
+
static void flip_smm_bit(void *data)
{
unsigned long set = *(unsigned long *)data;
@@ -3554,9 +3622,8 @@ static void intel_pmu_cpu_starting(int cpu)
}
}
-static void free_excl_cntrs(int cpu)
+static void free_excl_cntrs(struct cpu_hw_events *cpuc)
{
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
struct intel_excl_cntrs *c;
c = cpuc->excl_cntrs;
@@ -3564,9 +3631,10 @@ static void free_excl_cntrs(int cpu)
if (c->core_id == -1 || --c->refcnt == 0)
kfree(c);
cpuc->excl_cntrs = NULL;
- kfree(cpuc->constraint_list);
- cpuc->constraint_list = NULL;
}
+
+ kfree(cpuc->constraint_list);
+ cpuc->constraint_list = NULL;
}
static void intel_pmu_cpu_dying(int cpu)
@@ -3577,9 +3645,8 @@ static void intel_pmu_cpu_dying(int cpu)
disable_counter_freeze();
}
-static void intel_pmu_cpu_dead(int cpu)
+void intel_cpuc_finish(struct cpu_hw_events *cpuc)
{
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
struct intel_shared_regs *pc;
pc = cpuc->shared_regs;
@@ -3589,7 +3656,12 @@ static void intel_pmu_cpu_dead(int cpu)
cpuc->shared_regs = NULL;
}
- free_excl_cntrs(cpu);
+ free_excl_cntrs(cpuc);
+}
+
+static void intel_pmu_cpu_dead(int cpu)
+{
+ intel_cpuc_finish(&per_cpu(cpu_hw_events, cpu));
}
static void intel_pmu_sched_task(struct perf_event_context *ctx,
@@ -4107,8 +4179,11 @@ static struct attribute *intel_pmu_caps_attrs[] = {
NULL
};
+static DEVICE_BOOL_ATTR(allow_tsx_force_abort, 0644, allow_tsx_force_abort);
+
static struct attribute *intel_pmu_attrs[] = {
&dev_attr_freeze_on_smi.attr,
+ NULL, /* &dev_attr_allow_tsx_force_abort.attr.attr */
NULL,
};
@@ -4607,6 +4682,15 @@ __init int intel_pmu_init(void)
tsx_attr = hsw_tsx_events_attrs;
intel_pmu_pebs_data_source_skl(
boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
+
+ if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+ x86_pmu.flags |= PMU_FL_TFA;
+ x86_pmu.get_event_constraints = tfa_get_event_constraints;
+ x86_pmu.enable_all = intel_tfa_pmu_enable_all;
+ x86_pmu.commit_scheduling = intel_tfa_commit_scheduling;
+ intel_pmu_attrs[1] = &dev_attr_allow_tsx_force_abort.attr.attr;
+ }
+
pr_cont("Skylake events, ");
name = "skylake";
break;
@@ -4758,7 +4842,7 @@ static __init int fixup_ht_bug(void)
hardlockup_detector_perf_restart();
for_each_online_cpu(c)
- free_excl_cntrs(c);
+ free_excl_cntrs(&per_cpu(cpu_hw_events, c));
cpus_read_unlock();
pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index d516161c00c4..9fe64c01a2e5 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -732,6 +732,7 @@ static int uncore_pmu_event_init(struct perf_event *event)
/* fixed counters have event field hardcoded to zero */
hwc->config = 0ULL;
} else if (is_freerunning_event(event)) {
+ hwc->config = event->attr.config;
if (!check_valid_freerunning_event(box, event))
return -EINVAL;
event->hw.idx = UNCORE_PMC_IDX_FREERUNNING;
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index cb46d602a6b8..853a49a8ccf6 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -292,8 +292,8 @@ static inline
unsigned int uncore_freerunning_counter(struct intel_uncore_box *box,
struct perf_event *event)
{
- unsigned int type = uncore_freerunning_type(event->attr.config);
- unsigned int idx = uncore_freerunning_idx(event->attr.config);
+ unsigned int type = uncore_freerunning_type(event->hw.config);
+ unsigned int idx = uncore_freerunning_idx(event->hw.config);
struct intel_uncore_pmu *pmu = box->pmu;
return pmu->type->freerunning[type].counter_base +
@@ -377,7 +377,7 @@ static inline
unsigned int uncore_freerunning_bits(struct intel_uncore_box *box,
struct perf_event *event)
{
- unsigned int type = uncore_freerunning_type(event->attr.config);
+ unsigned int type = uncore_freerunning_type(event->hw.config);
return box->pmu->type->freerunning[type].bits;
}
@@ -385,7 +385,7 @@ unsigned int uncore_freerunning_bits(struct intel_uncore_box *box,
static inline int uncore_num_freerunning(struct intel_uncore_box *box,
struct perf_event *event)
{
- unsigned int type = uncore_freerunning_type(event->attr.config);
+ unsigned int type = uncore_freerunning_type(event->hw.config);
return box->pmu->type->freerunning[type].num_counters;
}
@@ -399,8 +399,8 @@ static inline int uncore_num_freerunning_types(struct intel_uncore_box *box,
static inline bool check_valid_freerunning_event(struct intel_uncore_box *box,
struct perf_event *event)
{
- unsigned int type = uncore_freerunning_type(event->attr.config);
- unsigned int idx = uncore_freerunning_idx(event->attr.config);
+ unsigned int type = uncore_freerunning_type(event->hw.config);
+ unsigned int idx = uncore_freerunning_idx(event->hw.config);
return (type < uncore_num_freerunning_types(box, event)) &&
(idx < uncore_num_freerunning(box, event));
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index b12517fae77a..13493f43b247 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -442,9 +442,11 @@ static int snb_uncore_imc_event_init(struct perf_event *event)
/* must be done before validate_group */
event->hw.event_base = base;
- event->hw.config = cfg;
event->hw.idx = idx;
+ /* Convert to standard encoding format for freerunning counters */
+ event->hw.config = ((cfg - 1) << 8) | 0x10ff;
+
/* no group validation needed, we have free running counters */
return 0;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 7e75f474b076..a75955741c50 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -243,6 +243,11 @@ struct cpu_hw_events {
int excl_thread_id; /* 0 or 1 */
/*
+ * SKL TSX_FORCE_ABORT shadow
+ */
+ u64 tfa_shadow;
+
+ /*
* AMD specific bits
*/
struct amd_nb *amd_nb;
@@ -682,6 +687,7 @@ do { \
#define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */
#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */
#define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */
+#define PMU_FL_TFA 0x20 /* deal with TSX force abort */
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -890,7 +896,8 @@ struct event_constraint *
x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event);
-struct intel_shared_regs *allocate_shared_regs(int cpu);
+extern int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu);
+extern void intel_cpuc_finish(struct cpu_hw_events *cpuc);
int intel_pmu_init(void);
@@ -1026,9 +1033,13 @@ static inline int intel_pmu_init(void)
return 0;
}
-static inline struct intel_shared_regs *allocate_shared_regs(int cpu)
+static inline int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
+{
+ return 0;
+}
+
+static inline void intel_cpuc_finish(struct cpu_hw_events *cpuc)
{
- return NULL;
}
static inline int is_ht_workaround_enabled(void)
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 7abb09e2eeb8..e4ba467a9fc6 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -96,15 +96,20 @@ void __percpu **hyperv_pcpu_input_arg;
EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);
u32 hv_max_vp_index;
+EXPORT_SYMBOL_GPL(hv_max_vp_index);
static int hv_cpu_init(unsigned int cpu)
{
u64 msr_vp_index;
struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()];
void **input_arg;
+ struct page *pg;
input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
- *input_arg = page_address(alloc_page(GFP_KERNEL));
+ pg = alloc_page(GFP_KERNEL);
+ if (unlikely(!pg))
+ return -ENOMEM;
+ *input_arg = page_address(pg);
hv_get_vp_index(msr_vp_index);
@@ -406,6 +411,13 @@ void hyperv_cleanup(void)
/* Reset our OS id */
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
+ /*
+ * Reset hypercall page reference before reset the page,
+ * let hypercall operations fail safely rather than
+ * panic the kernel for using invalid hypercall page
+ */
+ hv_hypercall_pg = NULL;
+
/* Reset the hypercall page */
hypercall_msr.as_uint64 = 0;
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index ad7b210aa3f6..d153d570bb04 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -36,13 +36,7 @@
* bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
*/
-#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
-/* Technically wrong, but this avoids compilation errors on some gcc
- versions. */
-#define BITOP_ADDR(x) "=m" (*(volatile long *) (x))
-#else
#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
-#endif
#define ADDR BITOP_ADDR(addr)
diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h
index 3417110574c1..31c379c1da41 100644
--- a/arch/x86/include/asm/cpu_device_id.h
+++ b/arch/x86/include/asm/cpu_device_id.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _CPU_DEVICE_ID
-#define _CPU_DEVICE_ID 1
+#ifndef _ASM_X86_CPU_DEVICE_ID
+#define _ASM_X86_CPU_DEVICE_ID
/*
* Declare drivers belonging to specific x86 CPUs
@@ -9,8 +9,6 @@
#include <linux/mod_devicetable.h>
-extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match);
-
/*
* Match specific microcode revisions.
*
@@ -22,21 +20,22 @@ extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match);
*/
struct x86_cpu_desc {
- __u8 x86_family;
- __u8 x86_vendor;
- __u8 x86_model;
- __u8 x86_stepping;
- __u32 x86_microcode_rev;
+ u8 x86_family;
+ u8 x86_vendor;
+ u8 x86_model;
+ u8 x86_stepping;
+ u32 x86_microcode_rev;
};
-#define INTEL_CPU_DESC(mod, step, rev) { \
- .x86_family = 6, \
- .x86_vendor = X86_VENDOR_INTEL, \
- .x86_model = mod, \
- .x86_stepping = step, \
- .x86_microcode_rev = rev, \
+#define INTEL_CPU_DESC(model, stepping, revision) { \
+ .x86_family = 6, \
+ .x86_vendor = X86_VENDOR_INTEL, \
+ .x86_model = (model), \
+ .x86_stepping = (stepping), \
+ .x86_microcode_rev = (revision), \
}
+extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match);
extern bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table);
-#endif
+#endif /* _ASM_X86_CPU_DEVICE_ID */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index ce95b8cbd229..0e56ff7e4848 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -112,8 +112,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
test_cpu_cap(c, bit))
#define this_cpu_has(bit) \
- (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
- x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
+ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
+ x86_this_cpu_test_bit(bit, \
+ (unsigned long __percpu *)&cpu_info.x86_capability))
/*
* This macro is for detection of features which need kernel
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 6d6122524711..981ff9479648 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -344,6 +344,7 @@
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 180373360e34..159b5988292f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -35,6 +35,7 @@
#include <asm/msr-index.h>
#include <asm/asm.h>
#include <asm/kvm_page_track.h>
+#include <asm/kvm_vcpu_regs.h>
#include <asm/hyperv-tlfs.h>
#define KVM_MAX_VCPUS 288
@@ -137,23 +138,23 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
#define ASYNC_PF_PER_VCPU 64
enum kvm_reg {
- VCPU_REGS_RAX = 0,
- VCPU_REGS_RCX = 1,
- VCPU_REGS_RDX = 2,
- VCPU_REGS_RBX = 3,
- VCPU_REGS_RSP = 4,
- VCPU_REGS_RBP = 5,
- VCPU_REGS_RSI = 6,
- VCPU_REGS_RDI = 7,
+ VCPU_REGS_RAX = __VCPU_REGS_RAX,
+ VCPU_REGS_RCX = __VCPU_REGS_RCX,
+ VCPU_REGS_RDX = __VCPU_REGS_RDX,
+ VCPU_REGS_RBX = __VCPU_REGS_RBX,
+ VCPU_REGS_RSP = __VCPU_REGS_RSP,
+ VCPU_REGS_RBP = __VCPU_REGS_RBP,
+ VCPU_REGS_RSI = __VCPU_REGS_RSI,
+ VCPU_REGS_RDI = __VCPU_REGS_RDI,
#ifdef CONFIG_X86_64
- VCPU_REGS_R8 = 8,
- VCPU_REGS_R9 = 9,
- VCPU_REGS_R10 = 10,
- VCPU_REGS_R11 = 11,
- VCPU_REGS_R12 = 12,
- VCPU_REGS_R13 = 13,
- VCPU_REGS_R14 = 14,
- VCPU_REGS_R15 = 15,
+ VCPU_REGS_R8 = __VCPU_REGS_R8,
+ VCPU_REGS_R9 = __VCPU_REGS_R9,
+ VCPU_REGS_R10 = __VCPU_REGS_R10,
+ VCPU_REGS_R11 = __VCPU_REGS_R11,
+ VCPU_REGS_R12 = __VCPU_REGS_R12,
+ VCPU_REGS_R13 = __VCPU_REGS_R13,
+ VCPU_REGS_R14 = __VCPU_REGS_R14,
+ VCPU_REGS_R15 = __VCPU_REGS_R15,
#endif
VCPU_REGS_RIP,
NR_VCPU_REGS
@@ -252,14 +253,14 @@ struct kvm_mmu_memory_cache {
* kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
* by indirect shadow page can not be more than 15 bits.
*
- * Currently, we used 14 bits that are @level, @cr4_pae, @quadrant, @access,
+ * Currently, we used 14 bits that are @level, @gpte_is_8_bytes, @quadrant, @access,
* @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
*/
union kvm_mmu_page_role {
u32 word;
struct {
unsigned level:4;
- unsigned cr4_pae:1;
+ unsigned gpte_is_8_bytes:1;
unsigned quadrant:2;
unsigned direct:1;
unsigned access:3;
@@ -319,6 +320,7 @@ struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
bool unsync;
+ bool mmio_cached;
/*
* The following two entries are used to key the shadow page in the
@@ -333,10 +335,6 @@ struct kvm_mmu_page {
int root_count; /* Currently serving as active root */
unsigned int unsync_children;
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
-
- /* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */
- unsigned long mmu_valid_gen;
-
DECLARE_BITMAP(unsync_child_bitmap, 512);
#ifdef CONFIG_X86_32
@@ -352,6 +350,7 @@ struct kvm_mmu_page {
};
struct kvm_pio_request {
+ unsigned long linear_rip;
unsigned long count;
int in;
int port;
@@ -570,6 +569,7 @@ struct kvm_vcpu_arch {
bool tpr_access_reporting;
u64 ia32_xss;
u64 microcode_version;
+ u64 arch_capabilities;
/*
* Paging state of the vcpu
@@ -848,13 +848,11 @@ struct kvm_arch {
unsigned int n_requested_mmu_pages;
unsigned int n_max_mmu_pages;
unsigned int indirect_shadow_pages;
- unsigned long mmu_valid_gen;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
* Hash table of struct kvm_mmu_page.
*/
struct list_head active_mmu_pages;
- struct list_head zapped_obsolete_pages;
struct kvm_page_track_notifier_node mmu_sp_tracker;
struct kvm_page_track_notifier_head track_notifier_head;
@@ -1196,6 +1194,8 @@ struct kvm_x86_ops {
int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version);
uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu);
+
+ bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu);
};
struct kvm_arch_async_pf {
@@ -1255,8 +1255,8 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask);
void kvm_mmu_zap_all(struct kvm *kvm);
-void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots);
-unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
+unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
diff --git a/arch/x86/include/asm/kvm_vcpu_regs.h b/arch/x86/include/asm/kvm_vcpu_regs.h
new file mode 100644
index 000000000000..1af2cb59233b
--- /dev/null
+++ b/arch/x86/include/asm/kvm_vcpu_regs.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KVM_VCPU_REGS_H
+#define _ASM_X86_KVM_VCPU_REGS_H
+
+#define __VCPU_REGS_RAX 0
+#define __VCPU_REGS_RCX 1
+#define __VCPU_REGS_RDX 2
+#define __VCPU_REGS_RBX 3
+#define __VCPU_REGS_RSP 4
+#define __VCPU_REGS_RBP 5
+#define __VCPU_REGS_RSI 6
+#define __VCPU_REGS_RDI 7
+
+#ifdef CONFIG_X86_64
+#define __VCPU_REGS_R8 8
+#define __VCPU_REGS_R9 9
+#define __VCPU_REGS_R10 10
+#define __VCPU_REGS_R11 11
+#define __VCPU_REGS_R12 12
+#define __VCPU_REGS_R13 13
+#define __VCPU_REGS_R14 14
+#define __VCPU_REGS_R15 15
+#endif
+
+#endif /* _ASM_X86_KVM_VCPU_REGS_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c1a812bd5a27..22d05e3835f0 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -48,6 +48,7 @@
#define MCI_STATUS_SYNDV BIT_ULL(53) /* synd reg. valid */
#define MCI_STATUS_DEFERRED BIT_ULL(44) /* uncorrected error, deferred exception */
#define MCI_STATUS_POISON BIT_ULL(43) /* access poisonous data */
+#define MCI_STATUS_SCRUB BIT_ULL(40) /* Error detected during scrub operation */
/*
* McaX field if set indicates a given bank supports MCA extensions:
@@ -307,11 +308,17 @@ enum smca_bank_types {
SMCA_FP, /* Floating Point */
SMCA_L3_CACHE, /* L3 Cache */
SMCA_CS, /* Coherent Slave */
+ SMCA_CS_V2, /* Coherent Slave */
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
+ SMCA_PSP_V2, /* Platform Security Processor */
SMCA_SMU, /* System Management Unit */
+ SMCA_SMU_V2, /* System Management Unit */
+ SMCA_MP5, /* Microprocessor 5 Unit */
+ SMCA_NBIO, /* Northbridge IO Unit */
+ SMCA_PCIE, /* PCI Express Unit */
N_SMCA_BANK_TYPES
};
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 8e40c2446fd1..ca5bc0eacb95 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -666,6 +666,12 @@
#define MSR_IA32_TSC_DEADLINE 0x000006E0
+
+#define MSR_TSX_FORCE_ABORT 0x0000010F
+
+#define MSR_TFA_RTM_FORCE_ABORT_BIT 0
+#define MSR_TFA_RTM_FORCE_ABORT BIT_ULL(MSR_TFA_RTM_FORCE_ABORT_BIT)
+
/* P4/Xeon+ specific */
#define MSR_IA32_MCG_EAX 0x00000180
#define MSR_IA32_MCG_EBX 0x00000181
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 0ce558a8150d..8f657286d599 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -7,11 +7,7 @@
#endif
#ifdef CONFIG_KASAN
-#ifdef CONFIG_KASAN_EXTRA
-#define KASAN_STACK_ORDER 2
-#else
#define KASAN_STACK_ORDER 1
-#endif
#else
#define KASAN_STACK_ORDER 0
#endif
diff --git a/arch/x86/include/asm/processor-cyrix.h b/arch/x86/include/asm/processor-cyrix.h
index aaedd73ea2c6..df700a6cc869 100644
--- a/arch/x86/include/asm/processor-cyrix.h
+++ b/arch/x86/include/asm/processor-cyrix.h
@@ -3,19 +3,6 @@
* NSC/Cyrix CPU indexed register access. Must be inlined instead of
* macros to ensure correct access ordering
* Access order is always 0x22 (=offset), 0x23 (=value)
- *
- * When using the old macros a line like
- * setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
- * gets expanded to:
- * do {
- * outb((CX86_CCR2), 0x22);
- * outb((({
- * outb((CX86_CCR2), 0x22);
- * inb(0x23);
- * }) | 0x88), 0x23);
- * } while (0);
- *
- * which in fact violates the access order (= 0x22, 0x22, 0x23, 0x23).
*/
static inline u8 getCx86(u8 reg)
@@ -29,11 +16,3 @@ static inline void setCx86(u8 reg, u8 data)
outb(reg, 0x22);
outb(data, 0x23);
}
-
-#define getCx86_old(reg) ({ outb((reg), 0x22); inb(0x23); })
-
-#define setCx86_old(reg, data) do { \
- outb((reg), 0x22); \
- outb((data), 0x23); \
-} while (0)
-
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 63b3393bd98e..c53682303c9c 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -77,7 +77,11 @@ static inline size_t real_mode_size_needed(void)
return ALIGN(real_mode_blob_end - real_mode_blob, PAGE_SIZE);
}
-void set_real_mode_mem(phys_addr_t mem, size_t size);
+static inline void set_real_mode_mem(phys_addr_t mem)
+{
+ real_mode_header = (struct real_mode_header *) __va(mem);
+}
+
void reserve_real_mode(void);
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index 55d392c6bd29..f74362b05619 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -179,14 +179,7 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
* No 3D Now!
*/
-#if (__GNUC__ >= 4)
#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
-#else
-#define memcpy(t, f, n) \
- (__builtin_constant_p((n)) \
- ? __constant_memcpy((t), (f), (n)) \
- : __memcpy((t), (f), (n)))
-#endif
#endif
#endif /* !CONFIG_FORTIFY_SOURCE */
@@ -216,29 +209,6 @@ static inline void *__memset_generic(void *s, char c, size_t count)
/* we might want to write optimized versions of these later */
#define __constant_count_memset(s, c, count) __memset_generic((s), (c), (count))
-/*
- * memset(x, 0, y) is a reasonably common thing to do, so we want to fill
- * things 32 bits at a time even when we don't know the size of the
- * area at compile-time..
- */
-static __always_inline
-void *__constant_c_memset(void *s, unsigned long c, size_t count)
-{
- int d0, d1;
- asm volatile("rep ; stosl\n\t"
- "testb $2,%b3\n\t"
- "je 1f\n\t"
- "stosw\n"
- "1:\ttestb $1,%b3\n\t"
- "je 2f\n\t"
- "stosb\n"
- "2:"
- : "=&c" (d0), "=&D" (d1)
- : "a" (c), "q" (count), "0" (count/4), "1" ((long)s)
- : "memory");
- return s;
-}
-
/* Added by Gertjan van Wingerde to make minix and sysv module work */
#define __HAVE_ARCH_STRNLEN
extern size_t strnlen(const char *s, size_t count);
@@ -247,72 +217,6 @@ extern size_t strnlen(const char *s, size_t count);
#define __HAVE_ARCH_STRSTR
extern char *strstr(const char *cs, const char *ct);
-/*
- * This looks horribly ugly, but the compiler can optimize it totally,
- * as we by now know that both pattern and count is constant..
- */
-static __always_inline
-void *__constant_c_and_count_memset(void *s, unsigned long pattern,
- size_t count)
-{
- switch (count) {
- case 0:
- return s;
- case 1:
- *(unsigned char *)s = pattern & 0xff;
- return s;
- case 2:
- *(unsigned short *)s = pattern & 0xffff;
- return s;
- case 3:
- *(unsigned short *)s = pattern & 0xffff;
- *((unsigned char *)s + 2) = pattern & 0xff;
- return s;
- case 4:
- *(unsigned long *)s = pattern;
- return s;
- }
-
-#define COMMON(x) \
- asm volatile("rep ; stosl" \
- x \
- : "=&c" (d0), "=&D" (d1) \
- : "a" (eax), "0" (count/4), "1" ((long)s) \
- : "memory")
-
- {
- int d0, d1;
-#if __GNUC__ == 4 && __GNUC_MINOR__ == 0
- /* Workaround for broken gcc 4.0 */
- register unsigned long eax asm("%eax") = pattern;
-#else
- unsigned long eax = pattern;
-#endif
-
- switch (count % 4) {
- case 0:
- COMMON("");
- return s;
- case 1:
- COMMON("\n\tstosb");
- return s;
- case 2:
- COMMON("\n\tstosw");
- return s;
- default:
- COMMON("\n\tstosw\n\tstosb");
- return s;
- }
- }
-
-#undef COMMON
-}
-
-#define __constant_c_x_memset(s, c, count) \
- (__builtin_constant_p(count) \
- ? __constant_c_and_count_memset((s), (c), (count)) \
- : __constant_c_memset((s), (c), (count)))
-
#define __memset(s, c, count) \
(__builtin_constant_p(count) \
? __constant_count_memset((s), (c), (count)) \
@@ -321,15 +225,7 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern,
#define __HAVE_ARCH_MEMSET
extern void *memset(void *, int, size_t);
#ifndef CONFIG_FORTIFY_SOURCE
-#if (__GNUC__ >= 4)
#define memset(s, c, count) __builtin_memset(s, c, count)
-#else
-#define memset(s, c, count) \
- (__builtin_constant_p(c) \
- ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \
- (count)) \
- : __memset((s), (c), (count)))
-#endif
#endif /* !CONFIG_FORTIFY_SOURCE */
#define __HAVE_ARCH_MEMSET16
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 4e4194e21a09..75314c3dbe47 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -14,21 +14,6 @@
extern void *memcpy(void *to, const void *from, size_t len);
extern void *__memcpy(void *to, const void *from, size_t len);
-#ifndef CONFIG_FORTIFY_SOURCE
-#if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4
-#define memcpy(dst, src, len) \
-({ \
- size_t __len = (len); \
- void *__ret; \
- if (__builtin_constant_p(len) && __len >= 64) \
- __ret = __memcpy((dst), (src), __len); \
- else \
- __ret = __builtin_memcpy((dst), (src), __len); \
- __ret; \
-})
-#endif
-#endif /* !CONFIG_FORTIFY_SOURCE */
-
#define __HAVE_ARCH_MEMSET
void *memset(void *s, int c, size_t n);
void *__memset(void *s, int c, size_t n);
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 1f86e1b0a5cd..499578f7e6d7 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -23,6 +23,12 @@ struct unwind_state {
#elif defined(CONFIG_UNWINDER_FRAME_POINTER)
bool got_irq;
unsigned long *bp, *orig_sp, ip;
+ /*
+ * If non-NULL: The current frame is incomplete and doesn't contain a
+ * valid BP. When looking for the next frame, use this instead of the
+ * non-existent saved BP.
+ */
+ unsigned long *next_bp;
struct pt_regs *regs;
#else
unsigned long *sp;
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index ef05bea7010d..de6f0d59a24f 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -332,15 +332,11 @@ HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
return _hypercall4(int, update_va_mapping, va,
new_val.pte, new_val.pte >> 32, flags);
}
-extern int __must_check xen_event_channel_op_compat(int, void *);
static inline int
HYPERVISOR_event_channel_op(int cmd, void *arg)
{
- int rc = _hypercall2(int, event_channel_op, cmd, arg);
- if (unlikely(rc == -ENOSYS))
- rc = xen_event_channel_op_compat(cmd, arg);
- return rc;
+ return _hypercall2(int, event_channel_op, cmd, arg);
}
static inline int
@@ -355,15 +351,10 @@ HYPERVISOR_console_io(int cmd, int count, char *str)
return _hypercall3(int, console_io, cmd, count, str);
}
-extern int __must_check xen_physdev_op_compat(int, void *);
-
static inline int
HYPERVISOR_physdev_op(int cmd, void *arg)
{
- int rc = _hypercall2(int, physdev_op, cmd, arg);
- if (unlikely(rc == -ENOSYS))
- rc = xen_physdev_op_compat(cmd, arg);
- return rc;
+ return _hypercall2(int, physdev_op, cmd, arg);
}
static inline int
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
index efe701b7c6ce..59b5ad310f78 100644
--- a/arch/x86/include/uapi/asm/Kbuild
+++ b/arch/x86/include/uapi/asm/Kbuild
@@ -1,6 +1,3 @@
-include include/uapi/asm-generic/Kbuild.asm
-
generated-y += unistd_32.h
generated-y += unistd_64.h
generated-y += unistd_x32.h
-generic-y += socket.h
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 2624de16cd7a..8dcbf6890714 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -935,6 +935,9 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
#define HPET_RESOURCE_NAME_SIZE 9
hpet_res = memblock_alloc(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE,
SMP_CACHE_BYTES);
+ if (!hpet_res)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
hpet_res->name = (void *)&hpet_res[1];
hpet_res->flags = IORESOURCE_MEM;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 58176b56354e..294ed4392a0e 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -14,6 +14,7 @@
#define pr_fmt(fmt) "AGP: " fmt
#include <linux/kernel.h>
+#include <linux/kcore.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/memblock.h>
@@ -57,7 +58,7 @@ int fallback_aper_force __initdata;
int fix_aperture __initdata = 1;
-#ifdef CONFIG_PROC_VMCORE
+#if defined(CONFIG_PROC_VMCORE) || defined(CONFIG_PROC_KCORE)
/*
* If the first kernel maps the aperture over e820 RAM, the kdump kernel will
* use the same range because it will remain configured in the northbridge.
@@ -66,20 +67,25 @@ int fix_aperture __initdata = 1;
*/
static unsigned long aperture_pfn_start, aperture_page_count;
-static int gart_oldmem_pfn_is_ram(unsigned long pfn)
+static int gart_mem_pfn_is_ram(unsigned long pfn)
{
return likely((pfn < aperture_pfn_start) ||
(pfn >= aperture_pfn_start + aperture_page_count));
}
-static void exclude_from_vmcore(u64 aper_base, u32 aper_order)
+static void __init exclude_from_core(u64 aper_base, u32 aper_order)
{
aperture_pfn_start = aper_base >> PAGE_SHIFT;
aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT;
- WARN_ON(register_oldmem_pfn_is_ram(&gart_oldmem_pfn_is_ram));
+#ifdef CONFIG_PROC_VMCORE
+ WARN_ON(register_oldmem_pfn_is_ram(&gart_mem_pfn_is_ram));
+#endif
+#ifdef CONFIG_PROC_KCORE
+ WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram));
+#endif
}
#else
-static void exclude_from_vmcore(u64 aper_base, u32 aper_order)
+static void exclude_from_core(u64 aper_base, u32 aper_order)
{
}
#endif
@@ -474,7 +480,7 @@ out:
* may have allocated the range over its e820 RAM
* and fixed up the northbridge
*/
- exclude_from_vmcore(last_aper_base, last_aper_order);
+ exclude_from_core(last_aper_base, last_aper_order);
return 1;
}
@@ -520,7 +526,7 @@ out:
* overlap with the first kernel's memory. We can't access the
* range through vmcore even though it should be part of the dump.
*/
- exclude_from_vmcore(aper_alloc, aper_order);
+ exclude_from_core(aper_alloc, aper_order);
/* Fix up the north bridges */
for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 264e3221d923..53aa234a6803 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2581,6 +2581,8 @@ static struct resource * __init ioapic_setup_resources(void)
n *= nr_ioapics;
mem = memblock_alloc(n, SMP_CACHE_BYTES);
+ if (!mem)
+ panic("%s: Failed to allocate %lu bytes\n", __func__, n);
res = (void *)mem;
mem += sizeof(struct resource) * nr_ioapics;
@@ -2625,6 +2627,9 @@ fake_ioapic_page:
#endif
ioapic_phys = (unsigned long)memblock_alloc(PAGE_SIZE,
PAGE_SIZE);
+ if (!ioapic_phys)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+ __func__, PAGE_SIZE, PAGE_SIZE);
ioapic_phys = __pa(ioapic_phys);
}
set_fixmap_nocache(idx, ioapic_phys);
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index d12226f60168..1d9b8aaea06c 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -124,7 +124,7 @@ static void set_cx86_reorder(void)
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
/* Load/Store Serialize to mem access disable (=reorder it) */
- setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80);
+ setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80);
/* set load/store serialize from 1GB to 4GB */
ccr3 |= 0xe0;
setCx86(CX86_CCR3, ccr3);
@@ -135,11 +135,11 @@ static void set_cx86_memwb(void)
pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
/* CCR2 bit 2: unlock NW bit */
- setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04);
/* set 'Not Write-through' */
write_cr0(read_cr0() | X86_CR0_NW);
/* CCR2 bit 2: lock NW bit and set WT1 */
- setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14);
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14);
}
/*
@@ -153,14 +153,14 @@ static void geode_configure(void)
local_irq_save(flags);
/* Suspend on halt power saving and enable #SUSP pin */
- setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88);
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
/* FPU fast, DTE cache, Mem bypass */
- setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38);
+ setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
set_cx86_memwb();
@@ -296,7 +296,7 @@ static void init_cyrix(struct cpuinfo_x86 *c)
/* GXm supports extended cpuid levels 'ala' AMD */
if (c->cpuid_level == 2) {
/* Enable cxMMX extensions (GX1 Datasheet 54) */
- setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1);
+ setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1);
/*
* GXm : 0x30 ... 0x5f GXm datasheet 51
@@ -319,7 +319,7 @@ static void init_cyrix(struct cpuinfo_x86 *c)
if (dir1 > 7) {
dir0_msn++; /* M II */
/* Enable MMX extensions (App note 108) */
- setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
+ setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1);
} else {
/* A 6x86MX - it has the bug. */
set_cpu_bug(c, X86_BUG_COMA);
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 89298c83de53..e64de5149e50 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -88,11 +88,17 @@ static struct smca_bank_name smca_names[] = {
[SMCA_FP] = { "floating_point", "Floating Point Unit" },
[SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
[SMCA_CS] = { "coherent_slave", "Coherent Slave" },
+ [SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" },
[SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
[SMCA_UMC] = { "umc", "Unified Memory Controller" },
[SMCA_PB] = { "param_block", "Parameter Block" },
[SMCA_PSP] = { "psp", "Platform Security Processor" },
+ [SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
[SMCA_SMU] = { "smu", "System Management Unit" },
+ [SMCA_SMU_V2] = { "smu", "System Management Unit" },
+ [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" },
+ [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" },
+ [SMCA_PCIE] = { "pcie", "PCI Express Unit" },
};
static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init =
@@ -138,30 +144,42 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 },
/* ZN Core (HWID=0xB0) MCA types */
- { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF },
+ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFFF },
{ SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF },
{ SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF },
{ SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF },
/* HWID 0xB0 MCATYPE 0x4 is Reserved */
- { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF },
+ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0xFFF },
{ SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F },
{ SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF },
/* Data Fabric MCA types */
{ SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF },
- { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF },
+ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0x1F },
+ { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2), 0x3FFF },
/* Unified Memory Controller MCA type */
- { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F },
+ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0xFF },
/* Parameter Block MCA type */
{ SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 },
/* Platform Security Processor MCA type */
{ SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 },
+ { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1), 0x3FFFF },
/* System Management Unit MCA type */
{ SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 },
+ { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1), 0x7FF },
+
+ /* Microprocessor 5 Unit MCA type */
+ { SMCA_MP5, HWID_MCATYPE(0x01, 0x2), 0x3FF },
+
+ /* Northbridge IO Unit MCA type */
+ { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0), 0x1F },
+
+ /* PCI Express Unit MCA type */
+ { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0), 0x1F },
};
struct smca_bank smca_banks[MAX_NR_BANKS];
@@ -545,6 +563,40 @@ out:
return offset;
}
+/*
+ * Turn off MC4_MISC thresholding banks on all family 0x15 models since
+ * they're not supported there.
+ */
+void disable_err_thresholding(struct cpuinfo_x86 *c)
+{
+ int i;
+ u64 hwcr;
+ bool need_toggle;
+ u32 msrs[] = {
+ 0x00000413, /* MC4_MISC0 */
+ 0xc0000408, /* MC4_MISC1 */
+ };
+
+ if (c->x86 != 0x15)
+ return;
+
+ rdmsrl(MSR_K7_HWCR, hwcr);
+
+ /* McStatusWrEn has to be set */
+ need_toggle = !(hwcr & BIT(18));
+
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+
+ /* Clear CntP bit safely */
+ for (i = 0; i < ARRAY_SIZE(msrs); i++)
+ msr_clear_bit(msrs[i], 62);
+
+ /* restore old settings */
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr);
+}
+
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
@@ -552,6 +604,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
unsigned int bank, block, cpu = smp_processor_id();
int offset = -1;
+ disable_err_thresholding(c);
+
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (mce_flags.smca)
smca_configure(bank, cpu);
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index 1d9b3ce662a0..c038e5c00a59 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -64,11 +64,11 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
#define CPER_CREATOR_MCE \
- UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
- 0x64, 0x90, 0xb8, 0x9d)
+ GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
+ 0x64, 0x90, 0xb8, 0x9d)
#define CPER_SECTION_TYPE_MCE \
- UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
- 0x04, 0x4a, 0x38, 0xfc)
+ GUID_INIT(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
+ 0x04, 0x4a, 0x38, 0xfc)
/*
* CPER specification (in UEFI specification 2.3 appendix N) requires
@@ -135,7 +135,7 @@ retry:
goto out;
/* try to skip other type records in storage */
else if (rc != sizeof(rcd) ||
- uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
+ !guid_equal(&rcd.hdr.creator_id, &CPER_CREATOR_MCE))
goto retry;
memcpy(m, &rcd.mce, sizeof(*m));
rc = sizeof(*m);
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 6ce290c506d9..b7fb541a4873 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1612,36 +1612,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 0x15 && c->x86_model <= 0xf)
mce_flags.overflow_recov = 1;
- /*
- * Turn off MC4_MISC thresholding banks on those models since
- * they're not supported there.
- */
- if (c->x86 == 0x15 &&
- (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
- int i;
- u64 hwcr;
- bool need_toggle;
- u32 msrs[] = {
- 0x00000413, /* MC4_MISC0 */
- 0xc0000408, /* MC4_MISC1 */
- };
-
- rdmsrl(MSR_K7_HWCR, hwcr);
-
- /* McStatusWrEn has to be set */
- need_toggle = !(hwcr & BIT(18));
-
- if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
-
- /* Clear CntP bit safely */
- for (i = 0; i < ARRAY_SIZE(msrs); i++)
- msr_clear_bit(msrs[i], 62);
-
- /* restore old settings */
- if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr);
- }
}
if (c->x86_vendor == X86_VENDOR_INTEL) {
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index dc3e26e905a3..65201e180fe0 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -165,6 +165,11 @@ static struct severity {
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
KERNEL
),
+ MCESEV(
+ PANIC, "Instruction fetch error in kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ KERNEL
+ ),
#endif
MCESEV(
PANIC, "Action required: unknown MCACOD",
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 97f9ada9ceda..5260185cbf7b 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -608,6 +608,8 @@ static int microcode_reload_late(void)
if (ret > 0)
microcode_check();
+ pr_info("Reload completed, microcode revision: 0x%x\n", boot_cpu_data.microcode);
+
return ret;
}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index e81a2db42df7..3fa238a137d2 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -328,6 +328,18 @@ static void __init ms_hyperv_init_platform(void)
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
# endif
+
+ /*
+ * Hyper-V doesn't provide irq remapping for IO-APIC. To enable x2apic,
+ * set x2apic destination mode to physcial mode when x2apic is available
+ * and Hyper-V IOMMU driver makes sure cpus assigned with IO-APIC irqs
+ * have 8-bit APIC id.
+ */
+# ifdef CONFIG_X86_X2APIC
+ if (x2apic_supported())
+ x2apic_phys = 1;
+# endif
+
#endif
}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 822b7db634ee..e49b77283924 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -4,6 +4,7 @@
#include <linux/sched.h>
#include <linux/kernfs.h>
+#include <linux/fs_context.h>
#include <linux/jump_label.h>
#define MSR_IA32_L3_QOS_CFG 0xc81
@@ -40,6 +41,21 @@
#define RMID_VAL_ERROR BIT_ULL(63)
#define RMID_VAL_UNAVAIL BIT_ULL(62)
+
+struct rdt_fs_context {
+ struct kernfs_fs_context kfc;
+ bool enable_cdpl2;
+ bool enable_cdpl3;
+ bool enable_mba_mbps;
+};
+
+static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
+{
+ struct kernfs_fs_context *kfc = fc->fs_private;
+
+ return container_of(kfc, struct rdt_fs_context, kfc);
+}
+
DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
/**
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index f33f11f69078..1573a0a6b525 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -501,11 +501,8 @@ out_unlock:
void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms)
{
unsigned long delay = msecs_to_jiffies(delay_ms);
- struct rdt_resource *r;
int cpu;
- r = &rdt_resources_all[RDT_RESOURCE_L3];
-
cpu = cpumask_any(&dom->cpu_mask);
dom->cqm_work_cpu = cpu;
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 8388adf241b2..399601eda8e4 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -24,6 +24,7 @@
#include <linux/cpu.h>
#include <linux/debugfs.h>
#include <linux/fs.h>
+#include <linux/fs_parser.h>
#include <linux/sysfs.h>
#include <linux/kernfs.h>
#include <linux/seq_buf.h>
@@ -32,6 +33,7 @@
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/task_work.h>
+#include <linux/user_namespace.h>
#include <uapi/linux/magic.h>
@@ -1858,46 +1860,6 @@ static void cdp_disable_all(void)
cdpl2_disable();
}
-static int parse_rdtgroupfs_options(char *data)
-{
- char *token, *o = data;
- int ret = 0;
-
- while ((token = strsep(&o, ",")) != NULL) {
- if (!*token) {
- ret = -EINVAL;
- goto out;
- }
-
- if (!strcmp(token, "cdp")) {
- ret = cdpl3_enable();
- if (ret)
- goto out;
- } else if (!strcmp(token, "cdpl2")) {
- ret = cdpl2_enable();
- if (ret)
- goto out;
- } else if (!strcmp(token, "mba_MBps")) {
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- ret = set_mba_sc(true);
- else
- ret = -EINVAL;
- if (ret)
- goto out;
- } else {
- ret = -EINVAL;
- goto out;
- }
- }
-
- return 0;
-
-out:
- pr_err("Invalid mount option \"%s\"\n", token);
-
- return ret;
-}
-
/*
* We don't allow rdtgroup directories to be created anywhere
* except the root directory. Thus when looking for the rdtgroup
@@ -1969,13 +1931,27 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn,
struct rdtgroup *prgrp,
struct kernfs_node **mon_data_kn);
-static struct dentry *rdt_mount(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name,
- void *data)
+static int rdt_enable_ctx(struct rdt_fs_context *ctx)
+{
+ int ret = 0;
+
+ if (ctx->enable_cdpl2)
+ ret = cdpl2_enable();
+
+ if (!ret && ctx->enable_cdpl3)
+ ret = cdpl3_enable();
+
+ if (!ret && ctx->enable_mba_mbps)
+ ret = set_mba_sc(true);
+
+ return ret;
+}
+
+static int rdt_get_tree(struct fs_context *fc)
{
+ struct rdt_fs_context *ctx = rdt_fc2context(fc);
struct rdt_domain *dom;
struct rdt_resource *r;
- struct dentry *dentry;
int ret;
cpus_read_lock();
@@ -1984,53 +1960,42 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
* resctrl file system can only be mounted once.
*/
if (static_branch_unlikely(&rdt_enable_key)) {
- dentry = ERR_PTR(-EBUSY);
+ ret = -EBUSY;
goto out;
}
- ret = parse_rdtgroupfs_options(data);
- if (ret) {
- dentry = ERR_PTR(ret);
+ ret = rdt_enable_ctx(ctx);
+ if (ret < 0)
goto out_cdp;
- }
closid_init();
ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
- if (ret) {
- dentry = ERR_PTR(ret);
- goto out_cdp;
- }
+ if (ret < 0)
+ goto out_mba;
if (rdt_mon_capable) {
ret = mongroup_create_dir(rdtgroup_default.kn,
NULL, "mon_groups",
&kn_mongrp);
- if (ret) {
- dentry = ERR_PTR(ret);
+ if (ret < 0)
goto out_info;
- }
kernfs_get(kn_mongrp);
ret = mkdir_mondata_all(rdtgroup_default.kn,
&rdtgroup_default, &kn_mondata);
- if (ret) {
- dentry = ERR_PTR(ret);
+ if (ret < 0)
goto out_mongrp;
- }
kernfs_get(kn_mondata);
rdtgroup_default.mon.mon_data_kn = kn_mondata;
}
ret = rdt_pseudo_lock_init();
- if (ret) {
- dentry = ERR_PTR(ret);
+ if (ret)
goto out_mondata;
- }
- dentry = kernfs_mount(fs_type, flags, rdt_root,
- RDTGROUP_SUPER_MAGIC, NULL);
- if (IS_ERR(dentry))
+ ret = kernfs_get_tree(fc);
+ if (ret < 0)
goto out_psl;
if (rdt_alloc_capable)
@@ -2059,14 +2024,95 @@ out_mongrp:
kernfs_remove(kn_mongrp);
out_info:
kernfs_remove(kn_info);
+out_mba:
+ if (ctx->enable_mba_mbps)
+ set_mba_sc(false);
out_cdp:
cdp_disable_all();
out:
rdt_last_cmd_clear();
mutex_unlock(&rdtgroup_mutex);
cpus_read_unlock();
+ return ret;
+}
+
+enum rdt_param {
+ Opt_cdp,
+ Opt_cdpl2,
+ Opt_mba_mpbs,
+ nr__rdt_params
+};
+
+static const struct fs_parameter_spec rdt_param_specs[] = {
+ fsparam_flag("cdp", Opt_cdp),
+ fsparam_flag("cdpl2", Opt_cdpl2),
+ fsparam_flag("mba_mpbs", Opt_mba_mpbs),
+ {}
+};
+
+static const struct fs_parameter_description rdt_fs_parameters = {
+ .name = "rdt",
+ .specs = rdt_param_specs,
+};
+
+static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct rdt_fs_context *ctx = rdt_fc2context(fc);
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, &rdt_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
- return dentry;
+ switch (opt) {
+ case Opt_cdp:
+ ctx->enable_cdpl3 = true;
+ return 0;
+ case Opt_cdpl2:
+ ctx->enable_cdpl2 = true;
+ return 0;
+ case Opt_mba_mpbs:
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return -EINVAL;
+ ctx->enable_mba_mbps = true;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static void rdt_fs_context_free(struct fs_context *fc)
+{
+ struct rdt_fs_context *ctx = rdt_fc2context(fc);
+
+ kernfs_free_fs_context(fc);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations rdt_fs_context_ops = {
+ .free = rdt_fs_context_free,
+ .parse_param = rdt_parse_param,
+ .get_tree = rdt_get_tree,
+};
+
+static int rdt_init_fs_context(struct fs_context *fc)
+{
+ struct rdt_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->kfc.root = rdt_root;
+ ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
+ fc->fs_private = &ctx->kfc;
+ fc->ops = &rdt_fs_context_ops;
+ if (fc->user_ns)
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(&init_user_ns);
+ fc->global = true;
+ return 0;
}
static int reset_all_ctrls(struct rdt_resource *r)
@@ -2239,9 +2285,10 @@ static void rdt_kill_sb(struct super_block *sb)
}
static struct file_system_type rdt_fs_type = {
- .name = "resctrl",
- .mount = rdt_mount,
- .kill_sb = rdt_kill_sb,
+ .name = "resctrl",
+ .init_fs_context = rdt_init_fs_context,
+ .parameters = &rdt_fs_parameters,
+ .kill_sb = rdt_kill_sb,
};
static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index a687d10da417..2879e234e193 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -14,6 +14,7 @@
#include <linux/acpi.h>
#include <linux/firmware-map.h>
#include <linux/sort.h>
+#include <linux/memory_hotplug.h>
#include <asm/e820/api.h>
#include <asm/setup.h>
@@ -775,7 +776,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
{
u64 addr;
- addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
+ addr = memblock_phys_alloc(size, align);
if (addr) {
e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n");
@@ -878,6 +879,10 @@ static int __init parse_memopt(char *p)
e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
+#ifdef CONFIG_MEMORY_HOTPLUG
+ max_mem_size = mem_size;
+#endif
+
return 0;
}
early_param("mem", parse_memopt);
@@ -1092,6 +1097,9 @@ void __init e820__reserve_resources(void)
res = memblock_alloc(sizeof(*res) * e820_table->nr_entries,
SMP_CACHE_BYTES);
+ if (!res)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ sizeof(*res) * e820_table->nr_entries);
e820_res = res;
for (i = 0; i < e820_table->nr_entries; i++) {
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 3e3789c8f8e1..ef49517f6bb2 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -49,7 +49,7 @@ int ftrace_arch_code_modify_post_process(void)
union ftrace_code_union {
char code[MCOUNT_INSN_SIZE];
struct {
- unsigned char e8;
+ unsigned char op;
int offset;
} __attribute__((packed));
};
@@ -59,20 +59,23 @@ static int ftrace_calc_offset(long ip, long addr)
return (int)(addr - ip);
}
-static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+static unsigned char *
+ftrace_text_replace(unsigned char op, unsigned long ip, unsigned long addr)
{
static union ftrace_code_union calc;
- calc.e8 = 0xe8;
+ calc.op = op;
calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
- /*
- * No locking needed, this must be called via kstop_machine
- * which in essence is like running on a uniprocessor machine.
- */
return calc.code;
}
+static unsigned char *
+ftrace_call_replace(unsigned long ip, unsigned long addr)
+{
+ return ftrace_text_replace(0xe8, ip, addr);
+}
+
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
@@ -665,22 +668,6 @@ int __init ftrace_dyn_arch_init(void)
return 0;
}
-#if defined(CONFIG_X86_64) || defined(CONFIG_FUNCTION_GRAPH_TRACER)
-static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
-{
- static union ftrace_code_union calc;
-
- /* Jmp not a call (ignore the .e8) */
- calc.e8 = 0xe9;
- calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
-
- /*
- * ftrace external locks synchronize the access to the static variable.
- */
- return calc.code;
-}
-#endif
-
/* Currently only x86_64 supports dynamic trampolines */
#ifdef CONFIG_X86_64
@@ -892,8 +879,8 @@ static void *addr_from_call(void *ptr)
return NULL;
/* Make sure this is a call */
- if (WARN_ON_ONCE(calc.e8 != 0xe8)) {
- pr_warn("Expected e8, got %x\n", calc.e8);
+ if (WARN_ON_ONCE(calc.op != 0xe8)) {
+ pr_warn("Expected e8, got %x\n", calc.op);
return NULL;
}
@@ -964,6 +951,11 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
#ifdef CONFIG_DYNAMIC_FTRACE
extern void ftrace_graph_call(void);
+static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
+{
+ return ftrace_text_replace(0xe9, ip, addr);
+}
+
static int ftrace_mod_jmp(unsigned long ip, void *func)
{
unsigned char *new;
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index dfd3aca82c61..fb32925a2e62 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -905,6 +905,8 @@ int __init hpet_enable(void)
return 0;
hpet_set_mapping();
+ if (!hpet_virt_address)
+ return 0;
/*
* Read the period and check for a sane value:
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index ff9bfd40429e..d73083021002 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -354,6 +354,7 @@ int hw_breakpoint_arch_parse(struct perf_event *bp,
#endif
default:
WARN_ON_ONCE(1);
+ return -EINVAL;
}
/*
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 1f3b77367948..22f60dd26460 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -538,9 +538,17 @@ static int bzImage64_cleanup(void *loader_data)
#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
{
- return verify_pefile_signature(kernel, kernel_len,
- VERIFY_USE_SECONDARY_KEYRING,
- VERIFYING_KEXEC_PE_SIGNATURE);
+ int ret;
+
+ ret = verify_pefile_signature(kernel, kernel_len,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_KEXEC_PE_SIGNATURE);
+ if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) {
+ ret = verify_pefile_signature(kernel, kernel_len,
+ VERIFY_USE_PLATFORM_KEYRING,
+ VERIFYING_KEXEC_PE_SIGNATURE);
+ }
+ return ret;
}
#endif
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index e811d4d1c824..904494b924c1 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -104,12 +104,8 @@ static u64 kvm_sched_clock_read(void)
static inline void kvm_sched_clock_init(bool stable)
{
- if (!stable) {
- pv_ops.time.sched_clock = kvm_clock_read;
+ if (!stable)
clear_sched_clock_stable();
- return;
- }
-
kvm_sched_clock_offset = kvm_clock_read();
pv_ops.time.sched_clock = kvm_sched_clock_read;
@@ -355,6 +351,20 @@ void __init kvmclock_init(void)
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
kvm_get_preset_lpj();
+
+ /*
+ * X86_FEATURE_NONSTOP_TSC is TSC runs at constant rate
+ * with P/T states and does not stop in deep C-states.
+ *
+ * Invariant TSC exposed by host means kvmclock is not necessary:
+ * can use TSC as clocksource.
+ *
+ */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+ !check_tsc_unstable())
+ kvm_clock.rating = 299;
+
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
pv_info.name = "KVM";
}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 3482460d984d..1bfe5c6e6cfe 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -598,8 +598,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
mpf_base = base;
mpf_found = true;
- pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n",
- base, base + sizeof(*mpf) - 1, mpf);
+ pr_info("found SMP MP-table at [mem %#010lx-%#010lx]\n",
+ base, base + sizeof(*mpf) - 1);
memblock_reserve(base, sizeof(*mpf));
if (mpf->physptr)
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 13af08827eef..4bf46575568a 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -106,22 +106,22 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
void *ptr;
if (!node_online(node) || !NODE_DATA(node)) {
- ptr = memblock_alloc_from_nopanic(size, align, goal);
+ ptr = memblock_alloc_from(size, align, goal);
pr_info("cpu %d has no node %d or node-local memory\n",
cpu, node);
pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
cpu, size, __pa(ptr));
} else {
- ptr = memblock_alloc_try_nid_nopanic(size, align, goal,
- MEMBLOCK_ALLOC_ACCESSIBLE,
- node);
+ ptr = memblock_alloc_try_nid(size, align, goal,
+ MEMBLOCK_ALLOC_ACCESSIBLE,
+ node);
pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
cpu, size, node, __pa(ptr));
}
return ptr;
#else
- return memblock_alloc_from_nopanic(size, align, goal);
+ return memblock_alloc_from(size, align, goal);
#endif
}
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 3dc26f95d46e..9b9fd4826e7a 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -320,10 +320,14 @@ bool unwind_next_frame(struct unwind_state *state)
}
/* Get the next frame pointer: */
- if (state->regs)
+ if (state->next_bp) {
+ next_bp = state->next_bp;
+ state->next_bp = NULL;
+ } else if (state->regs) {
next_bp = (unsigned long *)state->regs->bp;
- else
+ } else {
next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp);
+ }
/* Move to the next frame if it's safe: */
if (!update_stack_state(state, next_bp))
@@ -398,6 +402,21 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
bp = get_frame_pointer(task, regs);
+ /*
+ * If we crash with IP==0, the last successfully executed instruction
+ * was probably an indirect function call with a NULL function pointer.
+ * That means that SP points into the middle of an incomplete frame:
+ * *SP is a return pointer, and *(SP-sizeof(unsigned long)) is where we
+ * would have written a frame pointer if we hadn't crashed.
+ * Pretend that the frame is complete and that BP points to it, but save
+ * the real BP so that we can use it when looking for the next frame.
+ */
+ if (regs && regs->ip == 0 &&
+ (unsigned long *)kernel_stack_pointer(regs) >= first_frame) {
+ state->next_bp = bp;
+ bp = ((unsigned long *)kernel_stack_pointer(regs)) - 1;
+ }
+
/* Initialize stack info and make sure the frame data is accessible: */
get_stack_info(bp, state->task, &state->stack_info,
&state->stack_mask);
@@ -410,7 +429,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
*/
while (!unwind_done(state) &&
(!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
- state->bp < first_frame))
+ (state->next_bp == NULL && state->bp < first_frame)))
unwind_next_frame(state);
}
EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 26038eacf74a..89be1be1790c 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -113,6 +113,20 @@ static struct orc_entry *orc_ftrace_find(unsigned long ip)
}
#endif
+/*
+ * If we crash with IP==0, the last successfully executed instruction
+ * was probably an indirect function call with a NULL function pointer,
+ * and we don't have unwind information for NULL.
+ * This hardcoded ORC entry for IP==0 allows us to unwind from a NULL function
+ * pointer into its parent and then continue normally from there.
+ */
+static struct orc_entry null_orc_entry = {
+ .sp_offset = sizeof(long),
+ .sp_reg = ORC_REG_SP,
+ .bp_reg = ORC_REG_UNDEFINED,
+ .type = ORC_TYPE_CALL
+};
+
static struct orc_entry *orc_find(unsigned long ip)
{
static struct orc_entry *orc;
@@ -120,6 +134,9 @@ static struct orc_entry *orc_find(unsigned long ip)
if (!orc_init)
return NULL;
+ if (ip == 0)
+ return &null_orc_entry;
+
/* For non-init vmlinux addresses, use the fast lookup table: */
if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
unsigned int idx, start, stop;
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c07958b59f50..fd3951638ae4 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -405,7 +405,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
- F(CLDEMOTE);
+ F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 89d20ed1d2e8..421899f6ad7b 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -526,7 +526,9 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
new_config.enable = 0;
stimer->config.as_uint64 = new_config.as_uint64;
- stimer_mark_pending(stimer, false);
+ if (stimer->config.enable)
+ stimer_mark_pending(stimer, false);
+
return 0;
}
@@ -542,7 +544,10 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
stimer->config.enable = 0;
else if (stimer->config.auto_enable)
stimer->config.enable = 1;
- stimer_mark_pending(stimer, false);
+
+ if (stimer->config.enable)
+ stimer_mark_pending(stimer, false);
+
return 0;
}
@@ -1729,7 +1734,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
mutex_lock(&hv->hv_lock);
ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1,
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
mutex_unlock(&hv->hv_lock);
if (ret >= 0)
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index af192895b1fc..4a6dc54cc12b 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -653,7 +653,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
pid_t pid_nr;
int ret;
- pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
+ pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL_ACCOUNT);
if (!pit)
return NULL;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index bdcd4139eca9..8b38bb4868a6 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -583,7 +583,7 @@ int kvm_pic_init(struct kvm *kvm)
struct kvm_pic *s;
int ret;
- s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
+ s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL_ACCOUNT);
if (!s)
return -ENOMEM;
spin_lock_init(&s->lock);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 4e822ad363f3..1add1bc881e2 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -622,7 +622,7 @@ int kvm_ioapic_init(struct kvm *kvm)
struct kvm_ioapic *ioapic;
int ret;
- ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
+ ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL_ACCOUNT);
if (!ioapic)
return -ENOMEM;
spin_lock_init(&ioapic->lock);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4b6c2da7265c..991fdf7fc17f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -181,7 +181,8 @@ static void recalculate_apic_map(struct kvm *kvm)
max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
new = kvzalloc(sizeof(struct kvm_apic_map) +
- sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL);
+ sizeof(struct kvm_lapic *) * ((u64)max_id + 1),
+ GFP_KERNEL_ACCOUNT);
if (!new)
goto out;
@@ -2259,13 +2260,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
ASSERT(vcpu != NULL);
apic_debug("apic_init %d\n", vcpu->vcpu_id);
- apic = kzalloc(sizeof(*apic), GFP_KERNEL);
+ apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT);
if (!apic)
goto nomem;
vcpu->arch.apic = apic;
- apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
+ apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!apic->regs) {
printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
vcpu->vcpu_id);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f2d1d230d5b8..eee455a8a612 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -109,9 +109,11 @@ module_param(dbg, bool, 0644);
(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
-#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
-#define PT64_DIR_BASE_ADDR_MASK \
- (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
+#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
+#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
+#else
+#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#endif
#define PT64_LVL_ADDR_MASK(level) \
(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
* PT64_LEVEL_BITS))) - 1))
@@ -180,7 +182,7 @@ struct kvm_shadow_walk_iterator {
static const union kvm_mmu_page_role mmu_base_role_mask = {
.cr0_wp = 1,
- .cr4_pae = 1,
+ .gpte_is_8_bytes = 1,
.nxe = 1,
.smep_andnot_wp = 1,
.smap_andnot_wp = 1,
@@ -330,53 +332,56 @@ static inline bool is_access_track_spte(u64 spte)
}
/*
- * the low bit of the generation number is always presumed to be zero.
- * This disables mmio caching during memslot updates. The concept is
- * similar to a seqcount but instead of retrying the access we just punt
- * and ignore the cache.
+ * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
+ * the memslots generation and is derived as follows:
*
- * spte bits 3-11 are used as bits 1-9 of the generation number,
- * the bits 52-61 are used as bits 10-19 of the generation number.
+ * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
+ * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61
+ *
+ * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
+ * the MMIO generation number, as doing so would require stealing a bit from
+ * the "real" generation number and thus effectively halve the maximum number
+ * of MMIO generations that can be handled before encountering a wrap (which
+ * requires a full MMU zap). The flag is instead explicitly queried when
+ * checking for MMIO spte cache hits.
*/
-#define MMIO_SPTE_GEN_LOW_SHIFT 2
-#define MMIO_SPTE_GEN_HIGH_SHIFT 52
+#define MMIO_SPTE_GEN_MASK GENMASK_ULL(18, 0)
-#define MMIO_GEN_SHIFT 20
-#define MMIO_GEN_LOW_SHIFT 10
-#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2)
-#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
+#define MMIO_SPTE_GEN_LOW_START 3
+#define MMIO_SPTE_GEN_LOW_END 11
+#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
+ MMIO_SPTE_GEN_LOW_START)
-static u64 generation_mmio_spte_mask(unsigned int gen)
+#define MMIO_SPTE_GEN_HIGH_START 52
+#define MMIO_SPTE_GEN_HIGH_END 61
+#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
+ MMIO_SPTE_GEN_HIGH_START)
+static u64 generation_mmio_spte_mask(u64 gen)
{
u64 mask;
- WARN_ON(gen & ~MMIO_GEN_MASK);
+ WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
- mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
- mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
+ mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
+ mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
return mask;
}
-static unsigned int get_mmio_spte_generation(u64 spte)
+static u64 get_mmio_spte_generation(u64 spte)
{
- unsigned int gen;
+ u64 gen;
spte &= ~shadow_mmio_mask;
- gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
- gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
+ gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
+ gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
return gen;
}
-static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
-{
- return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
-}
-
static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
unsigned access)
{
- unsigned int gen = kvm_current_mmio_generation(vcpu);
+ u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
u64 mask = generation_mmio_spte_mask(gen);
u64 gpa = gfn << PAGE_SHIFT;
@@ -386,6 +391,8 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< shadow_nonpresent_or_rsvd_mask_len;
+ page_header(__pa(sptep))->mmio_cached = true;
+
trace_mark_mmio_spte(sptep, gfn, access, gen);
mmu_spte_set(sptep, mask);
}
@@ -407,7 +414,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte)
static unsigned get_mmio_spte_access(u64 spte)
{
- u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
+ u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask;
return (spte & ~mask) & ~PAGE_MASK;
}
@@ -424,9 +431,13 @@ static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
- unsigned int kvm_gen, spte_gen;
+ u64 kvm_gen, spte_gen, gen;
+
+ gen = kvm_vcpu_memslots(vcpu)->generation;
+ if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
+ return false;
- kvm_gen = kvm_current_mmio_generation(vcpu);
+ kvm_gen = gen & MMIO_SPTE_GEN_MASK;
spte_gen = get_mmio_spte_generation(spte);
trace_check_mmio_spte(spte, kvm_gen, spte_gen);
@@ -959,7 +970,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
if (cache->nobjs >= min)
return 0;
while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
+ obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
if (!obj)
return cache->nobjs >= min ? 0 : -ENOMEM;
cache->objects[cache->nobjs++] = obj;
@@ -2049,12 +2060,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
if (!direct)
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
-
- /*
- * The active_mmu_pages list is the FIFO list, do not move the
- * page until it is zapped. kvm_zap_obsolete_pages depends on
- * this feature. See the comments in kvm_zap_obsolete_pages().
- */
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
@@ -2195,35 +2200,33 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
--kvm->stat.mmu_unsync;
}
-static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- struct list_head *invalid_list);
+static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list);
-/*
- * NOTE: we should pay more attention on the zapped-obsolete page
- * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk
- * since it has been deleted from active_mmu_pages but still can be found
- * at hast list.
- *
- * for_each_valid_sp() has skipped that kind of pages.
- */
+
#define for_each_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
- if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \
+ if ((_sp)->role.invalid) { \
} else
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
for_each_valid_sp(_kvm, _sp, _gfn) \
if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
+static inline bool is_ept_sp(struct kvm_mmu_page *sp)
+{
+ return sp->role.cr0_wp && sp->role.smap_andnot_wp;
+}
+
/* @sp->gfn should be write-protected at the call site */
static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
- if (sp->role.cr4_pae != !!is_pae(vcpu)
- || vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
+ if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
+ vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return false;
}
@@ -2231,18 +2234,28 @@ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return true;
}
+static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
+ struct list_head *invalid_list,
+ bool remote_flush)
+{
+ if (!remote_flush && !list_empty(invalid_list))
+ return false;
+
+ if (!list_empty(invalid_list))
+ kvm_mmu_commit_zap_page(kvm, invalid_list);
+ else
+ kvm_flush_remote_tlbs(kvm);
+ return true;
+}
+
static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
struct list_head *invalid_list,
bool remote_flush, bool local_flush)
{
- if (!list_empty(invalid_list)) {
- kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
+ if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
return;
- }
- if (remote_flush)
- kvm_flush_remote_tlbs(vcpu->kvm);
- else if (local_flush)
+ if (local_flush)
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
@@ -2253,11 +2266,6 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
static void mmu_audit_disable(void) { }
#endif
-static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
-}
-
static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
@@ -2421,7 +2429,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
role.level = level;
role.direct = direct;
if (role.direct)
- role.cr4_pae = 0;
+ role.gpte_is_8_bytes = true;
role.access = access;
if (!vcpu->arch.mmu->direct_map
&& vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
@@ -2482,7 +2490,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (level > PT_PAGE_TABLE_LEVEL && need_sync)
flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
}
- sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
clear_page(sp->spt);
trace_kvm_mmu_get_page(sp, true);
@@ -2668,17 +2675,22 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
return zapped;
}
-static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- struct list_head *invalid_list)
+static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
+ struct kvm_mmu_page *sp,
+ struct list_head *invalid_list,
+ int *nr_zapped)
{
- int ret;
+ bool list_unstable;
trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
- ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
+ *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
+ /* Zapping children means active_mmu_pages has become unstable. */
+ list_unstable = *nr_zapped;
+
if (!sp->role.invalid && !sp->role.direct)
unaccount_shadowed(kvm, sp);
@@ -2686,22 +2698,27 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
/* Count self */
- ret++;
+ (*nr_zapped)++;
list_move(&sp->link, invalid_list);
kvm_mod_used_mmu_pages(kvm, -1);
} else {
list_move(&sp->link, &kvm->arch.active_mmu_pages);
- /*
- * The obsolete pages can not be used on any vcpus.
- * See the comments in kvm_mmu_invalidate_zap_all_pages().
- */
- if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
+ if (!sp->role.invalid)
kvm_reload_remote_mmus(kvm);
}
sp->role.invalid = 1;
- return ret;
+ return list_unstable;
+}
+
+static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ int nr_zapped;
+
+ __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
+ return nr_zapped;
}
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
@@ -3703,7 +3720,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
u64 *lm_root;
- lm_root = (void*)get_zeroed_page(GFP_KERNEL);
+ lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (lm_root == NULL)
return 1;
@@ -4204,14 +4221,6 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
return false;
if (cached_root_available(vcpu, new_cr3, new_role)) {
- /*
- * It is possible that the cached previous root page is
- * obsolete because of a change in the MMU
- * generation number. However, that is accompanied by
- * KVM_REQ_MMU_RELOAD, which will free the root that we
- * have set here and allocate a new one.
- */
-
kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
if (!skip_tlb_flush) {
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
@@ -4791,7 +4800,6 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
role.base.access = ACC_ALL;
role.base.nxe = !!is_nx(vcpu);
- role.base.cr4_pae = !!is_pae(vcpu);
role.base.cr0_wp = is_write_protection(vcpu);
role.base.smm = is_smm(vcpu);
role.base.guest_mode = is_guest_mode(vcpu);
@@ -4812,6 +4820,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
role.base.ad_disabled = (shadow_accessed_mask == 0);
role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
role.base.direct = true;
+ role.base.gpte_is_8_bytes = true;
return role;
}
@@ -4876,6 +4885,7 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
role.base.smap_andnot_wp = role.ext.cr4_smap &&
!is_write_protection(vcpu);
role.base.direct = !is_paging(vcpu);
+ role.base.gpte_is_8_bytes = !!is_pae(vcpu);
if (!is_long_mode(vcpu))
role.base.level = PT32E_ROOT_LEVEL;
@@ -4915,18 +4925,26 @@ static union kvm_mmu_role
kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
bool execonly)
{
- union kvm_mmu_role role;
+ union kvm_mmu_role role = {0};
- /* Base role is inherited from root_mmu */
- role.base.word = vcpu->arch.root_mmu.mmu_role.base.word;
- role.ext = kvm_calc_mmu_role_ext(vcpu);
+ /* SMM flag is inherited from root_mmu */
+ role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
role.base.level = PT64_ROOT_4LEVEL;
+ role.base.gpte_is_8_bytes = true;
role.base.direct = false;
role.base.ad_disabled = !accessed_dirty;
role.base.guest_mode = true;
role.base.access = ACC_ALL;
+ /*
+ * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
+ * SMAP variation to denote shadow EPT entries.
+ */
+ role.base.cr0_wp = true;
+ role.base.smap_andnot_wp = true;
+
+ role.ext = kvm_calc_mmu_role_ext(vcpu);
role.ext.execonly = execonly;
return role;
@@ -5176,7 +5194,7 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
gpa, bytes, sp->role.word);
offset = offset_in_page(gpa);
- pte_size = sp->role.cr4_pae ? 8 : 4;
+ pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
/*
* Sometimes, the OS only writes the last one bytes to update status
@@ -5200,7 +5218,7 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
page_offset = offset_in_page(gpa);
level = sp->role.level;
*nspte = 1;
- if (!sp->role.cr4_pae) {
+ if (!sp->role.gpte_is_8_bytes) {
page_offset <<= 1; /* 32->64 */
/*
* A 32-bit pde maps 4MB while the shadow pdes map
@@ -5390,10 +5408,12 @@ emulate:
* This can happen if a guest gets a page-fault on data access but the HW
* table walker is not able to read the instruction page (e.g instruction
* page is not present in memory). In those cases we simply restart the
- * guest.
+ * guest, with the exception of AMD Erratum 1096 which is unrecoverable.
*/
- if (unlikely(insn && !insn_len))
- return 1;
+ if (unlikely(insn && !insn_len)) {
+ if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
+ return 1;
+ }
er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
@@ -5486,6 +5506,79 @@ void kvm_disable_tdp(void)
}
EXPORT_SYMBOL_GPL(kvm_disable_tdp);
+
+/* The return value indicates if tlb flush on all vcpus is needed. */
+typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
+
+/* The caller should hold mmu-lock before calling this function. */
+static __always_inline bool
+slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+{
+ struct slot_rmap_walk_iterator iterator;
+ bool flush = false;
+
+ for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
+ end_gfn, &iterator) {
+ if (iterator.rmap)
+ flush |= fn(kvm, iterator.rmap);
+
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ if (flush && lock_flush_tlb) {
+ kvm_flush_remote_tlbs_with_address(kvm,
+ start_gfn,
+ iterator.gfn - start_gfn + 1);
+ flush = false;
+ }
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+
+ if (flush && lock_flush_tlb) {
+ kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
+ end_gfn - start_gfn + 1);
+ flush = false;
+ }
+
+ return flush;
+}
+
+static __always_inline bool
+slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, int start_level, int end_level,
+ bool lock_flush_tlb)
+{
+ return slot_handle_level_range(kvm, memslot, fn, start_level,
+ end_level, memslot->base_gfn,
+ memslot->base_gfn + memslot->npages - 1,
+ lock_flush_tlb);
+}
+
+static __always_inline bool
+slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static __always_inline bool
+slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static __always_inline bool
+slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+ PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+}
+
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
free_page((unsigned long)vcpu->arch.mmu->pae_root);
@@ -5505,7 +5598,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
* Therefore we need to allocate shadow page tables in the first
* 4GB of memory, which happens to fit the DMA32 zone.
*/
- page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
if (!page)
return -ENOMEM;
@@ -5543,105 +5636,62 @@ static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot,
struct kvm_page_track_notifier_node *node)
{
- kvm_mmu_invalidate_zap_all_pages(kvm);
-}
-
-void kvm_mmu_init_vm(struct kvm *kvm)
-{
- struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
-
- node->track_write = kvm_mmu_pte_write;
- node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
- kvm_page_track_register_notifier(kvm, node);
-}
+ struct kvm_mmu_page *sp;
+ LIST_HEAD(invalid_list);
+ unsigned long i;
+ bool flush;
+ gfn_t gfn;
-void kvm_mmu_uninit_vm(struct kvm *kvm)
-{
- struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+ spin_lock(&kvm->mmu_lock);
- kvm_page_track_unregister_notifier(kvm, node);
-}
+ if (list_empty(&kvm->arch.active_mmu_pages))
+ goto out_unlock;
-/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
+ flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false);
-/* The caller should hold mmu-lock before calling this function. */
-static __always_inline bool
-slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, int start_level, int end_level,
- gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
-{
- struct slot_rmap_walk_iterator iterator;
- bool flush = false;
+ for (i = 0; i < slot->npages; i++) {
+ gfn = slot->base_gfn + i;
- for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
- end_gfn, &iterator) {
- if (iterator.rmap)
- flush |= fn(kvm, iterator.rmap);
+ for_each_valid_sp(kvm, sp, gfn) {
+ if (sp->gfn != gfn)
+ continue;
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ }
if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
- if (flush && lock_flush_tlb) {
- kvm_flush_remote_tlbs(kvm);
- flush = false;
- }
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+ flush = false;
cond_resched_lock(&kvm->mmu_lock);
}
}
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
- if (flush && lock_flush_tlb) {
- kvm_flush_remote_tlbs(kvm);
- flush = false;
- }
-
- return flush;
+out_unlock:
+ spin_unlock(&kvm->mmu_lock);
}
-static __always_inline bool
-slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, int start_level, int end_level,
- bool lock_flush_tlb)
+void kvm_mmu_init_vm(struct kvm *kvm)
{
- return slot_handle_level_range(kvm, memslot, fn, start_level,
- end_level, memslot->base_gfn,
- memslot->base_gfn + memslot->npages - 1,
- lock_flush_tlb);
-}
+ struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
-static __always_inline bool
-slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
-{
- return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
- PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+ node->track_write = kvm_mmu_pte_write;
+ node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
+ kvm_page_track_register_notifier(kvm, node);
}
-static __always_inline bool
-slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
+void kvm_mmu_uninit_vm(struct kvm *kvm)
{
- return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
- PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
-}
+ struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
-static __always_inline bool
-slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
-{
- return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
- PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+ kvm_page_track_unregister_notifier(kvm, node);
}
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
- bool flush_tlb = true;
- bool flush = false;
int i;
- if (kvm_available_flush_tlb_with_range())
- flush_tlb = false;
-
spin_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
slots = __kvm_memslots(kvm, i);
@@ -5653,17 +5703,12 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
if (start >= end)
continue;
- flush |= slot_handle_level_range(kvm, memslot,
- kvm_zap_rmapp, PT_PAGE_TABLE_LEVEL,
- PT_MAX_HUGEPAGE_LEVEL, start,
- end - 1, flush_tlb);
+ slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+ PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, true);
}
}
- if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
- gfn_end - gfn_start + 1);
-
spin_unlock(&kvm->mmu_lock);
}
@@ -5815,101 +5860,58 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
-#define BATCH_ZAP_PAGES 10
-static void kvm_zap_obsolete_pages(struct kvm *kvm)
+static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
{
struct kvm_mmu_page *sp, *node;
- int batch = 0;
+ LIST_HEAD(invalid_list);
+ int ign;
+ spin_lock(&kvm->mmu_lock);
restart:
- list_for_each_entry_safe_reverse(sp, node,
- &kvm->arch.active_mmu_pages, link) {
- int ret;
-
- /*
- * No obsolete page exists before new created page since
- * active_mmu_pages is the FIFO list.
- */
- if (!is_obsolete_sp(kvm, sp))
- break;
-
- /*
- * Since we are reversely walking the list and the invalid
- * list will be moved to the head, skip the invalid page
- * can help us to avoid the infinity list walking.
- */
- if (sp->role.invalid)
+ list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
+ if (mmio_only && !sp->mmio_cached)
continue;
-
- /*
- * Need not flush tlb since we only zap the sp with invalid
- * generation number.
- */
- if (batch >= BATCH_ZAP_PAGES &&
- cond_resched_lock(&kvm->mmu_lock)) {
- batch = 0;
+ if (sp->role.invalid && sp->root_count)
+ continue;
+ if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) {
+ WARN_ON_ONCE(mmio_only);
goto restart;
}
-
- ret = kvm_mmu_prepare_zap_page(kvm, sp,
- &kvm->arch.zapped_obsolete_pages);
- batch += ret;
-
- if (ret)
+ if (cond_resched_lock(&kvm->mmu_lock))
goto restart;
}
- /*
- * Should flush tlb before free page tables since lockless-walking
- * may use the pages.
- */
- kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
-}
-
-/*
- * Fast invalidate all shadow pages and use lock-break technique
- * to zap obsolete pages.
- *
- * It's required when memslot is being deleted or VM is being
- * destroyed, in these cases, we should ensure that KVM MMU does
- * not use any resource of the being-deleted slot or all slots
- * after calling the function.
- */
-void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
-{
- spin_lock(&kvm->mmu_lock);
- trace_kvm_mmu_invalidate_zap_all_pages(kvm);
- kvm->arch.mmu_valid_gen++;
-
- /*
- * Notify all vcpus to reload its shadow page table
- * and flush TLB. Then all vcpus will switch to new
- * shadow page table with the new mmu_valid_gen.
- *
- * Note: we should do this under the protection of
- * mmu-lock, otherwise, vcpu would purge shadow page
- * but miss tlb flush.
- */
- kvm_reload_remote_mmus(kvm);
-
- kvm_zap_obsolete_pages(kvm);
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
}
-static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
+void kvm_mmu_zap_all(struct kvm *kvm)
{
- return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+ return __kvm_mmu_zap_all(kvm, false);
}
-void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
{
+ WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
+
+ gen &= MMIO_SPTE_GEN_MASK;
+
/*
- * The very rare case: if the generation-number is round,
+ * Generation numbers are incremented in multiples of the number of
+ * address spaces in order to provide unique generations across all
+ * address spaces. Strip what is effectively the address space
+ * modifier prior to checking for a wrap of the MMIO generation so
+ * that a wrap in any address space is detected.
+ */
+ gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
+
+ /*
+ * The very rare case: if the MMIO generation number has wrapped,
* zap all shadow pages.
*/
- if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
+ if (unlikely(gen == 0)) {
kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
- kvm_mmu_invalidate_zap_all_pages(kvm);
+ __kvm_mmu_zap_all(kvm, true);
}
}
@@ -5940,24 +5942,16 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
* want to shrink a VM that only started to populate its MMU
* anyway.
*/
- if (!kvm->arch.n_used_mmu_pages &&
- !kvm_has_zapped_obsolete_pages(kvm))
+ if (!kvm->arch.n_used_mmu_pages)
continue;
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
- if (kvm_has_zapped_obsolete_pages(kvm)) {
- kvm_mmu_commit_zap_page(kvm,
- &kvm->arch.zapped_obsolete_pages);
- goto unlock;
- }
-
if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
freed++;
kvm_mmu_commit_zap_page(kvm, &invalid_list);
-unlock:
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
@@ -6037,7 +6031,7 @@ out:
/*
* Calculate mmu pages needed for kvm.
*/
-unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
+unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
{
unsigned int nr_mmu_pages;
unsigned int nr_pages = 0;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index c7b333147c4a..bbdc60f2fae8 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -203,7 +203,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return -(u32)fault & errcode;
}
-void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index c73bf4e4988c..dd30dccd2ad5 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -8,18 +8,16 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvmmmu
-#define KVM_MMU_PAGE_FIELDS \
- __field(unsigned long, mmu_valid_gen) \
- __field(__u64, gfn) \
- __field(__u32, role) \
- __field(__u32, root_count) \
+#define KVM_MMU_PAGE_FIELDS \
+ __field(__u64, gfn) \
+ __field(__u32, role) \
+ __field(__u32, root_count) \
__field(bool, unsync)
-#define KVM_MMU_PAGE_ASSIGN(sp) \
- __entry->mmu_valid_gen = sp->mmu_valid_gen; \
- __entry->gfn = sp->gfn; \
- __entry->role = sp->role.word; \
- __entry->root_count = sp->root_count; \
+#define KVM_MMU_PAGE_ASSIGN(sp) \
+ __entry->gfn = sp->gfn; \
+ __entry->role = sp->role.word; \
+ __entry->root_count = sp->root_count; \
__entry->unsync = sp->unsync;
#define KVM_MMU_PAGE_PRINTK() ({ \
@@ -31,11 +29,10 @@
\
role.word = __entry->role; \
\
- trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \
+ trace_seq_printf(p, "sp gfn %llx l%u %u-byte q%u%s %s%s" \
" %snxe %sad root %u %s%c", \
- __entry->mmu_valid_gen, \
__entry->gfn, role.level, \
- role.cr4_pae ? " pae" : "", \
+ role.gpte_is_8_bytes ? 8 : 4, \
role.quadrant, \
role.direct ? " direct" : "", \
access_str[role.access], \
@@ -283,27 +280,6 @@ TRACE_EVENT(
);
TRACE_EVENT(
- kvm_mmu_invalidate_zap_all_pages,
- TP_PROTO(struct kvm *kvm),
- TP_ARGS(kvm),
-
- TP_STRUCT__entry(
- __field(unsigned long, mmu_valid_gen)
- __field(unsigned int, mmu_used_pages)
- ),
-
- TP_fast_assign(
- __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
- __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
- ),
-
- TP_printk("kvm-mmu-valid-gen %lx used_pages %x",
- __entry->mmu_valid_gen, __entry->mmu_used_pages
- )
-);
-
-
-TRACE_EVENT(
check_mmio_spte,
TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
TP_ARGS(spte, kvm_gen, spte_gen),
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
index 3052a59a3065..fd04d462fdae 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/page_track.c
@@ -42,7 +42,7 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
slot->arch.gfn_track[i] =
kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!slot->arch.gfn_track[i])
goto track_free;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f13a3a24d360..426039285fd1 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -145,7 +145,6 @@ struct kvm_svm {
/* Struct members for AVIC */
u32 avic_vm_id;
- u32 ldr_mode;
struct page *avic_logical_id_table_page;
struct page *avic_physical_id_table_page;
struct hlist_node hnode;
@@ -236,6 +235,7 @@ struct vcpu_svm {
bool nrips_enabled : 1;
u32 ldr_reg;
+ u32 dfr_reg;
struct page *avic_backing_page;
u64 *avic_physical_id_cache;
bool avic_is_running;
@@ -1795,9 +1795,10 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
/* Avoid using vmalloc for smaller buffers. */
size = npages * sizeof(struct page *);
if (size > PAGE_SIZE)
- pages = vmalloc(size);
+ pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ PAGE_KERNEL);
else
- pages = kmalloc(size, GFP_KERNEL);
+ pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
if (!pages)
return NULL;
@@ -1865,7 +1866,9 @@ static void __unregister_enc_region_locked(struct kvm *kvm,
static struct kvm *svm_vm_alloc(void)
{
- struct kvm_svm *kvm_svm = vzalloc(sizeof(struct kvm_svm));
+ struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ PAGE_KERNEL);
return &kvm_svm->kvm;
}
@@ -1940,7 +1943,7 @@ static int avic_vm_init(struct kvm *kvm)
return 0;
/* Allocating physical APIC ID table (4KB) */
- p_page = alloc_page(GFP_KERNEL);
+ p_page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!p_page)
goto free_avic;
@@ -1948,7 +1951,7 @@ static int avic_vm_init(struct kvm *kvm)
clear_page(page_address(p_page));
/* Allocating logical APIC ID table (4KB) */
- l_page = alloc_page(GFP_KERNEL);
+ l_page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!l_page)
goto free_avic;
@@ -2106,6 +2109,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm)
INIT_LIST_HEAD(&svm->ir_list);
spin_lock_init(&svm->ir_list_lock);
+ svm->dfr_reg = APIC_DFR_FLAT;
return ret;
}
@@ -2119,13 +2123,14 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
struct page *nested_msrpm_pages;
int err;
- svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
if (!svm) {
err = -ENOMEM;
goto out;
}
- svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL);
+ svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
+ GFP_KERNEL_ACCOUNT);
if (!svm->vcpu.arch.guest_fpu) {
printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
err = -ENOMEM;
@@ -2137,19 +2142,19 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_svm;
err = -ENOMEM;
- page = alloc_page(GFP_KERNEL);
+ page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!page)
goto uninit;
- msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+ msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
if (!msrpm_pages)
goto free_page1;
- nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+ nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
if (!nested_msrpm_pages)
goto free_page2;
- hsave_page = alloc_page(GFP_KERNEL);
+ hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!hsave_page)
goto free_page3;
@@ -4565,8 +4570,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
return &logical_apic_id_table[index];
}
-static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
- bool valid)
+static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
{
bool flat;
u32 *entry, new_entry;
@@ -4579,31 +4583,39 @@ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
new_entry = READ_ONCE(*entry);
new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
- if (valid)
- new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
- else
- new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+ new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
WRITE_ONCE(*entry, new_entry);
return 0;
}
+static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool flat = svm->dfr_reg == APIC_DFR_FLAT;
+ u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
+
+ if (entry)
+ WRITE_ONCE(*entry, (u32) ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK);
+}
+
static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
{
- int ret;
+ int ret = 0;
struct vcpu_svm *svm = to_svm(vcpu);
u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
- if (!ldr)
- return 1;
+ if (ldr == svm->ldr_reg)
+ return 0;
- ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true);
- if (ret && svm->ldr_reg) {
- avic_ldr_write(vcpu, 0, svm->ldr_reg, false);
- svm->ldr_reg = 0;
- } else {
+ avic_invalidate_logical_id_entry(vcpu);
+
+ if (ldr)
+ ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr);
+
+ if (!ret)
svm->ldr_reg = ldr;
- }
+
return ret;
}
@@ -4637,27 +4649,16 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
return 0;
}
-static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
+static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
- u32 mod = (dfr >> 28) & 0xf;
-
- /*
- * We assume that all local APICs are using the same type.
- * If this changes, we need to flush the AVIC logical
- * APID id table.
- */
- if (kvm_svm->ldr_mode == mod)
- return 0;
- clear_page(page_address(kvm_svm->avic_logical_id_table_page));
- kvm_svm->ldr_mode = mod;
+ if (svm->dfr_reg == dfr)
+ return;
- if (svm->ldr_reg)
- avic_handle_ldr_update(vcpu);
- return 0;
+ avic_invalidate_logical_id_entry(vcpu);
+ svm->dfr_reg = dfr;
}
static int avic_unaccel_trap_write(struct vcpu_svm *svm)
@@ -5125,11 +5126,11 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
- if (!kvm_vcpu_apicv_active(&svm->vcpu))
- return;
-
- vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
- mark_dirty(vmcb, VMCB_INTR);
+ if (kvm_vcpu_apicv_active(vcpu))
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+ else
+ vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+ mark_dirty(vmcb, VMCB_AVIC);
}
static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -5195,7 +5196,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
* Allocating new amd_iommu_pi_data, which will get
* add to the per-vcpu ir_list.
*/
- ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
+ ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
if (!ir) {
ret = -ENOMEM;
goto out;
@@ -6163,8 +6164,7 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
{
if (avic_handle_apic_id_update(vcpu) != 0)
return;
- if (avic_handle_dfr_update(vcpu) != 0)
- return;
+ avic_handle_dfr_update(vcpu);
avic_handle_ldr_update(vcpu);
}
@@ -6311,7 +6311,7 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
if (ret)
return ret;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6361,7 +6361,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- start = kzalloc(sizeof(*start), GFP_KERNEL);
+ start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
if (!start)
return -ENOMEM;
@@ -6458,7 +6458,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6535,7 +6535,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, measure, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6597,7 +6597,7 @@ static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6618,7 +6618,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6646,7 +6646,7 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
struct sev_data_dbg *data;
int ret;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6901,7 +6901,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
}
ret = -ENOMEM;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
goto e_unpin_memory;
@@ -7007,7 +7007,7 @@ static int svm_register_enc_region(struct kvm *kvm,
if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
return -EINVAL;
- region = kzalloc(sizeof(*region), GFP_KERNEL);
+ region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
if (!region)
return -ENOMEM;
@@ -7098,6 +7098,36 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
return -ENODEV;
}
+static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
+{
+ bool is_user, smap;
+
+ is_user = svm_get_cpl(vcpu) == 3;
+ smap = !kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
+
+ /*
+ * Detect and workaround Errata 1096 Fam_17h_00_0Fh
+ *
+ * In non SEV guest, hypervisor will be able to read the guest
+ * memory to decode the instruction pointer when insn_len is zero
+ * so we return true to indicate that decoding is possible.
+ *
+ * But in the SEV guest, the guest memory is encrypted with the
+ * guest specific key and hypervisor will not be able to decode the
+ * instruction pointer so we will not able to workaround it. Lets
+ * print the error and request to kill the guest.
+ */
+ if (is_user && smap) {
+ if (!sev_guest(vcpu->kvm))
+ return true;
+
+ pr_err_ratelimited("KVM: Guest triggered AMD Erratum 1096\n");
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ }
+
+ return false;
+}
+
static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -7231,6 +7261,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.nested_enable_evmcs = nested_enable_evmcs,
.nested_get_evmcs_version = nested_get_evmcs_version,
+
+ .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index d737a51a53ca..153e539c29c9 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -211,7 +211,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
return;
- hrtimer_cancel(&vmx->nested.preemption_timer);
vmx->nested.vmxon = false;
vmx->nested.smm.vmxon = false;
free_vpid(vmx->nested.vpid02);
@@ -274,6 +273,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
vcpu_load(vcpu);
+ vmx_leave_nested(vcpu);
vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
free_nested(vcpu);
vcpu_put(vcpu);
@@ -1980,17 +1980,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
prepare_vmcs02_early_full(vmx, vmcs12);
/*
- * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
- * entry, but only if the current (host) sp changed from the value
- * we wrote last (vmx->host_rsp). This cache is no longer relevant
- * if we switch vmcs, and rather than hold a separate cache per vmcs,
- * here we just force the write to happen on entry. host_rsp will
- * also be written unconditionally by nested_vmx_check_vmentry_hw()
- * if we are doing early consistency checks via hardware.
- */
- vmx->host_rsp = 0;
-
- /*
* PIN CONTROLS
*/
exec_control = vmcs12->pin_based_vm_exec_control;
@@ -2289,10 +2278,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
}
vmx_set_rflags(vcpu, vmcs12->guest_rflags);
- vmx->nested.preemption_timer_expired = false;
- if (nested_cpu_has_preemption_timer(vmcs12))
- vmx_start_preemption_timer(vcpu);
-
/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
* bitwise-or of what L1 wants to trap for L2, and what we want to
* trap. Note that CR0.TS also needs updating - we do this later.
@@ -2600,6 +2585,11 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu,
!nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
!nested_cr3_valid(vcpu, vmcs12->host_cr3))
return -EINVAL;
+
+ if (is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu) ||
+ is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))
+ return -EINVAL;
+
/*
* If the load IA32_EFER VM-exit control is 1, bits reserved in the
* IA32_EFER MSR must be 0 in the field for that register. In addition,
@@ -2722,6 +2712,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
+ bool vm_fail;
if (!nested_early_check)
return 0;
@@ -2755,29 +2746,34 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
- vmx->__launched = vmx->loaded_vmcs->launched;
-
asm(
- /* Set HOST_RSP */
"sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
- __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
- "mov %%" _ASM_SP ", %c[host_rsp](%1)\n\t"
+ "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
+ "je 1f \n\t"
+ __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
+ "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
+ "1: \n\t"
"add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
/* Check if vmlaunch or vmresume is needed */
- "cmpl $0, %c[launched](%% " _ASM_CX")\n\t"
+ "cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
+ /*
+ * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
+ * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
+ * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the
+ * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
+ */
"call vmx_vmenter\n\t"
- /* Set vmx->fail accordingly */
- "setbe %c[fail](%% " _ASM_CX")\n\t"
- : ASM_CALL_CONSTRAINT
- : "c"(vmx), "d"((unsigned long)HOST_RSP),
- [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
- [fail]"i"(offsetof(struct vcpu_vmx, fail)),
- [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
+ CC_SET(be)
+ : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
+ : [HOST_RSP]"r"((unsigned long)HOST_RSP),
+ [loaded_vmcs]"r"(vmx->loaded_vmcs),
+ [launched]"i"(offsetof(struct loaded_vmcs, launched)),
+ [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
[wordsize]"i"(sizeof(ulong))
- : "rax", "cc", "memory"
+ : "cc", "memory"
);
preempt_enable();
@@ -2787,10 +2783,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
if (vmx->msr_autoload.guest.nr)
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
- if (vmx->fail) {
+ if (vm_fail) {
WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- vmx->fail = 0;
return 1;
}
@@ -2813,8 +2808,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
return 0;
}
-STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
-
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12);
@@ -3031,6 +3024,15 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
kvm_make_request(KVM_REQ_EVENT, vcpu);
/*
+ * Do not start the preemption timer hrtimer until after we know
+ * we are successful, so that only nested_vmx_vmexit needs to cancel
+ * the timer.
+ */
+ vmx->nested.preemption_timer_expired = false;
+ if (nested_cpu_has_preemption_timer(vmcs12))
+ vmx_start_preemption_timer(vcpu);
+
+ /*
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
* we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
* returned as far as L1 is concerned. It will only return (and set
@@ -3450,13 +3452,10 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
else
vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
- if (nested_cpu_has_preemption_timer(vmcs12)) {
- if (vmcs12->vm_exit_controls &
- VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+ if (nested_cpu_has_preemption_timer(vmcs12) &&
+ vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
vmcs12->vmx_preemption_timer_value =
vmx_get_preemption_timer_value(vcpu);
- hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
- }
/*
* In some cases (usually, nested EPT), L2 is allowed to change its
@@ -3864,6 +3863,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
leave_guest_mode(vcpu);
+ if (nested_cpu_has_preemption_timer(vmcs12))
+ hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
+
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
@@ -3915,9 +3917,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmx_flush_tlb(vcpu, true);
}
- /* This is needed for same reason as it was needed in prepare_vmcs02 */
- vmx->host_rsp = 0;
-
/* Unpin physical memory we referred to in vmcs02 */
if (vmx->nested.apic_access_page) {
kvm_release_page_dirty(vmx->nested.apic_access_page);
@@ -4035,25 +4034,50 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
/* Addr = segment_base + offset */
/* offset = base + [index * scale] + displacement */
off = exit_qualification; /* holds the displacement */
+ if (addr_size == 1)
+ off = (gva_t)sign_extend64(off, 31);
+ else if (addr_size == 0)
+ off = (gva_t)sign_extend64(off, 15);
if (base_is_valid)
off += kvm_register_read(vcpu, base_reg);
if (index_is_valid)
off += kvm_register_read(vcpu, index_reg)<<scaling;
vmx_get_segment(vcpu, &s, seg_reg);
- *ret = s.base + off;
+ /*
+ * The effective address, i.e. @off, of a memory operand is truncated
+ * based on the address size of the instruction. Note that this is
+ * the *effective address*, i.e. the address prior to accounting for
+ * the segment's base.
+ */
if (addr_size == 1) /* 32 bit */
- *ret &= 0xffffffff;
+ off &= 0xffffffff;
+ else if (addr_size == 0) /* 16 bit */
+ off &= 0xffff;
/* Checks for #GP/#SS exceptions. */
exn = false;
if (is_long_mode(vcpu)) {
+ /*
+ * The virtual/linear address is never truncated in 64-bit
+ * mode, e.g. a 32-bit address size can yield a 64-bit virtual
+ * address when using FS/GS with a non-zero base.
+ */
+ *ret = s.base + off;
+
/* Long mode: #GP(0)/#SS(0) if the memory address is in a
* non-canonical form. This is the only check on the memory
* destination for long mode!
*/
exn = is_noncanonical_address(*ret, vcpu);
- } else if (is_protmode(vcpu)) {
+ } else {
+ /*
+ * When not in long mode, the virtual/linear address is
+ * unconditionally truncated to 32 bits regardless of the
+ * address size.
+ */
+ *ret = (s.base + off) & 0xffffffff;
+
/* Protected mode: apply checks for segment validity in the
* following order:
* - segment type check (#GP(0) may be thrown)
@@ -4077,10 +4101,16 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
*/
exn = (s.unusable != 0);
- /* Protected mode: #GP(0)/#SS(0) if the memory
- * operand is outside the segment limit.
+
+ /*
+ * Protected mode: #GP(0)/#SS(0) if the memory operand is
+ * outside the segment limit. All CPUs that support VMX ignore
+ * limit checks for flat segments, i.e. segments with base==0,
+ * limit==0xffffffff and of type expand-up data or code.
*/
- exn = exn || (off + sizeof(u64) > s.limit);
+ if (!(s.base == 0 && s.limit == 0xffffffff &&
+ ((s.type & 8) || !(s.type & 4))))
+ exn = exn || (off + sizeof(u64) > s.limit);
}
if (exn) {
kvm_queue_exception_e(vcpu,
@@ -4145,11 +4175,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (r < 0)
goto out_vmcs02;
- vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
+ vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
if (!vmx->nested.cached_vmcs12)
goto out_cached_vmcs12;
- vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
+ vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
if (!vmx->nested.cached_shadow_vmcs12)
goto out_cached_shadow_vmcs12;
@@ -5696,6 +5726,10 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
enable_shadow_vmcs = 0;
if (enable_shadow_vmcs) {
for (i = 0; i < VMX_BITMAP_NR; i++) {
+ /*
+ * The vmx_bitmap is not tied to a VM and so should
+ * not be charged to a memcg.
+ */
vmx_bitmap[i] = (unsigned long *)
__get_free_page(GFP_KERNEL);
if (!vmx_bitmap[i]) {
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 6def3ba88e3b..cb6079f8a227 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -34,6 +34,7 @@ struct vmcs_host_state {
unsigned long cr4; /* May not match real cr4 */
unsigned long gs_base;
unsigned long fs_base;
+ unsigned long rsp;
u16 fs_sel, gs_sel, ldt_sel;
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index bcef2c7e9bc4..7b272738c576 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -1,6 +1,30 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
#include <asm/asm.h>
+#include <asm/bitsperlong.h>
+#include <asm/kvm_vcpu_regs.h>
+
+#define WORD_SIZE (BITS_PER_LONG / 8)
+
+#define VCPU_RAX __VCPU_REGS_RAX * WORD_SIZE
+#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE
+#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE
+#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE
+/* Intentionally omit RSP as it's context switched by hardware */
+#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE
+#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE
+#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE
+
+#ifdef CONFIG_X86_64
+#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE
+#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE
+#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE
+#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE
+#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE
+#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE
+#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE
+#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
+#endif
.text
@@ -55,3 +79,146 @@ ENDPROC(vmx_vmenter)
ENTRY(vmx_vmexit)
ret
ENDPROC(vmx_vmexit)
+
+/**
+ * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode
+ * @vmx: struct vcpu_vmx *
+ * @regs: unsigned long * (to guest registers)
+ * @launched: %true if the VMCS has been launched
+ *
+ * Returns:
+ * 0 on VM-Exit, 1 on VM-Fail
+ */
+ENTRY(__vmx_vcpu_run)
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+#ifdef CONFIG_X86_64
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+#else
+ push %edi
+ push %esi
+#endif
+ push %_ASM_BX
+
+ /*
+ * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and
+ * @regs is needed after VM-Exit to save the guest's register values.
+ */
+ push %_ASM_ARG2
+
+ /* Copy @launched to BL, _ASM_ARG3 is volatile. */
+ mov %_ASM_ARG3B, %bl
+
+ /* Adjust RSP to account for the CALL to vmx_vmenter(). */
+ lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2
+ call vmx_update_host_rsp
+
+ /* Load @regs to RAX. */
+ mov (%_ASM_SP), %_ASM_AX
+
+ /* Check if vmlaunch or vmresume is needed */
+ cmpb $0, %bl
+
+ /* Load guest registers. Don't clobber flags. */
+ mov VCPU_RBX(%_ASM_AX), %_ASM_BX
+ mov VCPU_RCX(%_ASM_AX), %_ASM_CX
+ mov VCPU_RDX(%_ASM_AX), %_ASM_DX
+ mov VCPU_RSI(%_ASM_AX), %_ASM_SI
+ mov VCPU_RDI(%_ASM_AX), %_ASM_DI
+ mov VCPU_RBP(%_ASM_AX), %_ASM_BP
+#ifdef CONFIG_X86_64
+ mov VCPU_R8 (%_ASM_AX), %r8
+ mov VCPU_R9 (%_ASM_AX), %r9
+ mov VCPU_R10(%_ASM_AX), %r10
+ mov VCPU_R11(%_ASM_AX), %r11
+ mov VCPU_R12(%_ASM_AX), %r12
+ mov VCPU_R13(%_ASM_AX), %r13
+ mov VCPU_R14(%_ASM_AX), %r14
+ mov VCPU_R15(%_ASM_AX), %r15
+#endif
+ /* Load guest RAX. This kills the vmx_vcpu pointer! */
+ mov VCPU_RAX(%_ASM_AX), %_ASM_AX
+
+ /* Enter guest mode */
+ call vmx_vmenter
+
+ /* Jump on VM-Fail. */
+ jbe 2f
+
+ /* Temporarily save guest's RAX. */
+ push %_ASM_AX
+
+ /* Reload @regs to RAX. */
+ mov WORD_SIZE(%_ASM_SP), %_ASM_AX
+
+ /* Save all guest registers, including RAX from the stack */
+ __ASM_SIZE(pop) VCPU_RAX(%_ASM_AX)
+ mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
+ mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
+ mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
+ mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
+ mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
+ mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
+#ifdef CONFIG_X86_64
+ mov %r8, VCPU_R8 (%_ASM_AX)
+ mov %r9, VCPU_R9 (%_ASM_AX)
+ mov %r10, VCPU_R10(%_ASM_AX)
+ mov %r11, VCPU_R11(%_ASM_AX)
+ mov %r12, VCPU_R12(%_ASM_AX)
+ mov %r13, VCPU_R13(%_ASM_AX)
+ mov %r14, VCPU_R14(%_ASM_AX)
+ mov %r15, VCPU_R15(%_ASM_AX)
+#endif
+
+ /* Clear RAX to indicate VM-Exit (as opposed to VM-Fail). */
+ xor %eax, %eax
+
+ /*
+ * Clear all general purpose registers except RSP and RAX to prevent
+ * speculative use of the guest's values, even those that are reloaded
+ * via the stack. In theory, an L1 cache miss when restoring registers
+ * could lead to speculative execution with the guest's values.
+ * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
+ * free. RSP and RAX are exempt as RSP is restored by hardware during
+ * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail.
+ */
+1: xor %ebx, %ebx
+ xor %ecx, %ecx
+ xor %edx, %edx
+ xor %esi, %esi
+ xor %edi, %edi
+ xor %ebp, %ebp
+#ifdef CONFIG_X86_64
+ xor %r8d, %r8d
+ xor %r9d, %r9d
+ xor %r10d, %r10d
+ xor %r11d, %r11d
+ xor %r12d, %r12d
+ xor %r13d, %r13d
+ xor %r14d, %r14d
+ xor %r15d, %r15d
+#endif
+
+ /* "POP" @regs. */
+ add $WORD_SIZE, %_ASM_SP
+ pop %_ASM_BX
+
+#ifdef CONFIG_X86_64
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+#else
+ pop %esi
+ pop %edi
+#endif
+ pop %_ASM_BP
+ ret
+
+ /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */
+2: mov $1, %eax
+ jmp 1b
+ENDPROC(__vmx_vcpu_run)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 30a6bcd735ec..ab432a930ae8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -246,6 +246,10 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ /*
+ * This allocation for vmx_l1d_flush_pages is not tied to a VM
+ * lifetime and so should not be charged to a memcg.
+ */
page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
if (!page)
return -ENOMEM;
@@ -1679,12 +1683,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = to_vmx(vcpu)->spec_ctrl;
break;
- case MSR_IA32_ARCH_CAPABILITIES:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
- return 1;
- msr_info->data = to_vmx(vcpu)->arch_capabilities;
- break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
@@ -1891,11 +1889,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
MSR_TYPE_W);
break;
- case MSR_IA32_ARCH_CAPABILITIES:
- if (!msr_info->host_initiated)
- return 1;
- vmx->arch_capabilities = data;
- break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
@@ -2387,13 +2380,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return 0;
}
-struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
+struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
{
int node = cpu_to_node(cpu);
struct page *pages;
struct vmcs *vmcs;
- pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
+ pages = __alloc_pages_node(node, flags, vmcs_config.order);
if (!pages)
return NULL;
vmcs = page_address(pages);
@@ -2440,7 +2433,8 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
loaded_vmcs_init(loaded_vmcs);
if (cpu_has_vmx_msr_bitmap()) {
- loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+ loaded_vmcs->msr_bitmap = (unsigned long *)
+ __get_free_page(GFP_KERNEL_ACCOUNT);
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
@@ -2481,7 +2475,7 @@ static __init int alloc_kvm_area(void)
for_each_possible_cpu(cpu) {
struct vmcs *vmcs;
- vmcs = alloc_vmcs_cpu(false, cpu);
+ vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
if (!vmcs) {
free_kvm_area();
return -ENOMEM;
@@ -4083,8 +4077,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
++vmx->nmsrs;
}
- vmx->arch_capabilities = kvm_get_arch_capabilities();
-
vm_exit_controls_init(vmx, vmx_vmexit_ctrl());
/* 22.2.1, 20.8.1 */
@@ -6360,150 +6352,15 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->hv_timer_armed = false;
}
-static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
+void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
{
- unsigned long evmcs_rsp;
-
- vmx->__launched = vmx->loaded_vmcs->launched;
-
- evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
- (unsigned long)&current_evmcs->host_rsp : 0;
-
- if (static_branch_unlikely(&vmx_l1d_should_flush))
- vmx_l1d_flush(vcpu);
-
- asm(
- /* Store host registers */
- "push %%" _ASM_DX "; push %%" _ASM_BP ";"
- "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
- "push %%" _ASM_CX " \n\t"
- "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
- "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
- "je 1f \n\t"
- "mov %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
- /* Avoid VMWRITE when Enlightened VMCS is in use */
- "test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
- "jz 2f \n\t"
- "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
- "jmp 1f \n\t"
- "2: \n\t"
- __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
- "1: \n\t"
- "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
-
- /* Reload cr2 if changed */
- "mov %c[cr2](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
- "mov %%cr2, %%" _ASM_DX " \n\t"
- "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
- "je 3f \n\t"
- "mov %%" _ASM_AX", %%cr2 \n\t"
- "3: \n\t"
- /* Check if vmlaunch or vmresume is needed */
- "cmpl $0, %c[launched](%%" _ASM_CX ") \n\t"
- /* Load guest registers. Don't clobber flags. */
- "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
- "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t"
- "mov %c[rdx](%%" _ASM_CX "), %%" _ASM_DX " \n\t"
- "mov %c[rsi](%%" _ASM_CX "), %%" _ASM_SI " \n\t"
- "mov %c[rdi](%%" _ASM_CX "), %%" _ASM_DI " \n\t"
- "mov %c[rbp](%%" _ASM_CX "), %%" _ASM_BP " \n\t"
-#ifdef CONFIG_X86_64
- "mov %c[r8](%%" _ASM_CX "), %%r8 \n\t"
- "mov %c[r9](%%" _ASM_CX "), %%r9 \n\t"
- "mov %c[r10](%%" _ASM_CX "), %%r10 \n\t"
- "mov %c[r11](%%" _ASM_CX "), %%r11 \n\t"
- "mov %c[r12](%%" _ASM_CX "), %%r12 \n\t"
- "mov %c[r13](%%" _ASM_CX "), %%r13 \n\t"
- "mov %c[r14](%%" _ASM_CX "), %%r14 \n\t"
- "mov %c[r15](%%" _ASM_CX "), %%r15 \n\t"
-#endif
- /* Load guest RCX. This kills the vmx_vcpu pointer! */
- "mov %c[rcx](%%" _ASM_CX "), %%" _ASM_CX " \n\t"
-
- /* Enter guest mode */
- "call vmx_vmenter\n\t"
-
- /* Save guest's RCX to the stack placeholder (see above) */
- "mov %%" _ASM_CX ", %c[wordsize](%%" _ASM_SP ") \n\t"
-
- /* Load host's RCX, i.e. the vmx_vcpu pointer */
- "pop %%" _ASM_CX " \n\t"
-
- /* Set vmx->fail based on EFLAGS.{CF,ZF} */
- "setbe %c[fail](%%" _ASM_CX ")\n\t"
-
- /* Save all guest registers, including RCX from the stack */
- "mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t"
- "mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t"
- __ASM_SIZE(pop) " %c[rcx](%%" _ASM_CX ") \n\t"
- "mov %%" _ASM_DX ", %c[rdx](%%" _ASM_CX ") \n\t"
- "mov %%" _ASM_SI ", %c[rsi](%%" _ASM_CX ") \n\t"
- "mov %%" _ASM_DI ", %c[rdi](%%" _ASM_CX ") \n\t"
- "mov %%" _ASM_BP ", %c[rbp](%%" _ASM_CX ") \n\t"
-#ifdef CONFIG_X86_64
- "mov %%r8, %c[r8](%%" _ASM_CX ") \n\t"
- "mov %%r9, %c[r9](%%" _ASM_CX ") \n\t"
- "mov %%r10, %c[r10](%%" _ASM_CX ") \n\t"
- "mov %%r11, %c[r11](%%" _ASM_CX ") \n\t"
- "mov %%r12, %c[r12](%%" _ASM_CX ") \n\t"
- "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t"
- "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t"
- "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t"
- /*
- * Clear host registers marked as clobbered to prevent
- * speculative use.
- */
- "xor %%r8d, %%r8d \n\t"
- "xor %%r9d, %%r9d \n\t"
- "xor %%r10d, %%r10d \n\t"
- "xor %%r11d, %%r11d \n\t"
- "xor %%r12d, %%r12d \n\t"
- "xor %%r13d, %%r13d \n\t"
- "xor %%r14d, %%r14d \n\t"
- "xor %%r15d, %%r15d \n\t"
-#endif
- "mov %%cr2, %%" _ASM_AX " \n\t"
- "mov %%" _ASM_AX ", %c[cr2](%%" _ASM_CX ") \n\t"
-
- "xor %%eax, %%eax \n\t"
- "xor %%ebx, %%ebx \n\t"
- "xor %%esi, %%esi \n\t"
- "xor %%edi, %%edi \n\t"
- "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
- : ASM_CALL_CONSTRAINT
- : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
- [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
- [fail]"i"(offsetof(struct vcpu_vmx, fail)),
- [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
- [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
- [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
- [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
- [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
- [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
- [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
- [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
-#ifdef CONFIG_X86_64
- [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
- [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
- [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
- [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
- [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
- [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
- [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
- [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
-#endif
- [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
- [wordsize]"i"(sizeof(ulong))
- : "cc", "memory"
-#ifdef CONFIG_X86_64
- , "rax", "rbx", "rdi"
- , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
-#else
- , "eax", "ebx", "edi"
-#endif
- );
+ if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
+ vmx->loaded_vmcs->host_state.rsp = host_rsp;
+ vmcs_writel(HOST_RSP, host_rsp);
+ }
}
-STACK_FRAME_NON_STANDARD(__vmx_vcpu_run);
+
+bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
@@ -6572,7 +6429,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
*/
x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
- __vmx_vcpu_run(vcpu, vmx);
+ if (static_branch_unlikely(&vmx_l1d_should_flush))
+ vmx_l1d_flush(vcpu);
+
+ if (vcpu->arch.cr2 != read_cr2())
+ write_cr2(vcpu->arch.cr2);
+
+ vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+ vmx->loaded_vmcs->launched);
+
+ vcpu->arch.cr2 = read_cr2();
/*
* We do not use IBRS in the kernel. If this vCPU has used the
@@ -6657,7 +6523,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
static struct kvm *vmx_vm_alloc(void)
{
- struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx));
+ struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ PAGE_KERNEL);
return &kvm_vmx->kvm;
}
@@ -6673,7 +6541,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
if (enable_pml)
vmx_destroy_pml_buffer(vmx);
free_vpid(vmx->vpid);
- leave_guest_mode(vcpu);
nested_vmx_free_vcpu(vcpu);
free_loaded_vmcs(vmx->loaded_vmcs);
kfree(vmx->guest_msrs);
@@ -6685,14 +6552,16 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
int err;
- struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ struct vcpu_vmx *vmx;
unsigned long *msr_bitmap;
int cpu;
+ vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
if (!vmx)
return ERR_PTR(-ENOMEM);
- vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL);
+ vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
+ GFP_KERNEL_ACCOUNT);
if (!vmx->vcpu.arch.guest_fpu) {
printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
err = -ENOMEM;
@@ -6714,12 +6583,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
* for the guest, etc.
*/
if (enable_pml) {
- vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!vmx->pml_pg)
goto uninit_vcpu;
}
- vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
> PAGE_SIZE);
@@ -7527,6 +7396,11 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
return 0;
}
+static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
static __init int hardware_setup(void)
{
unsigned long host_bndcfgs;
@@ -7829,6 +7703,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.set_nested_state = NULL,
.get_vmcs12_pages = NULL,
.nested_enable_evmcs = NULL,
+ .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
};
static void vmx_cleanup_l1d_flush(void)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 0ac0a64c7790..a1e00d0a2482 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -175,7 +175,6 @@ struct nested_vmx {
struct vcpu_vmx {
struct kvm_vcpu vcpu;
- unsigned long host_rsp;
u8 fail;
u8 msr_bitmap_mode;
u32 exit_intr_info;
@@ -191,7 +190,6 @@ struct vcpu_vmx {
u64 msr_guest_kernel_gs_base;
#endif
- u64 arch_capabilities;
u64 spec_ctrl;
u32 vm_entry_controls_shadow;
@@ -209,7 +207,7 @@ struct vcpu_vmx {
struct loaded_vmcs vmcs01;
struct loaded_vmcs *loaded_vmcs;
struct loaded_vmcs *loaded_cpu_state;
- bool __launched; /* temporary, used in vmx_vcpu_run */
+
struct msr_autoload {
struct vmx_msrs guest;
struct vmx_msrs host;
@@ -339,8 +337,8 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
static inline void pi_set_sn(struct pi_desc *pi_desc)
{
- return set_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
+ set_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
}
static inline void pi_set_on(struct pi_desc *pi_desc)
@@ -445,7 +443,8 @@ static inline u32 vmx_vmentry_ctrl(void)
{
u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
if (pt_mode == PT_MODE_SYSTEM)
- vmentry_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL);
+ vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
+ VM_ENTRY_LOAD_IA32_RTIT_CTL);
/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
return vmentry_ctrl &
~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER);
@@ -455,9 +454,10 @@ static inline u32 vmx_vmexit_ctrl(void)
{
u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
if (pt_mode == PT_MODE_SYSTEM)
- vmexit_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL);
+ vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
+ VM_EXIT_CLEAR_IA32_RTIT_CTL);
/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
- return vmcs_config.vmexit_ctrl &
+ return vmexit_ctrl &
~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
}
@@ -478,7 +478,7 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
return &(to_vmx(vcpu)->pi_desc);
}
-struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu);
+struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
void free_vmcs(struct vmcs *vmcs);
int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
@@ -487,7 +487,8 @@ void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs);
static inline struct vmcs *alloc_vmcs(bool shadow)
{
- return alloc_vmcs_cpu(shadow, raw_smp_processor_id());
+ return alloc_vmcs_cpu(shadow, raw_smp_processor_id(),
+ GFP_KERNEL_ACCOUNT);
}
u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 941f932373d0..099b851dabaf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1125,7 +1125,7 @@ static u32 msrs_to_save[] = {
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
- MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_SPEC_CTRL,
MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
@@ -1158,6 +1158,7 @@ static u32 emulated_msrs[] = {
MSR_IA32_TSC_ADJUST,
MSR_IA32_TSCDEADLINE,
+ MSR_IA32_ARCH_CAPABILITIES,
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
MSR_IA32_MCG_CTL,
@@ -2443,6 +2444,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (msr_info->host_initiated)
vcpu->arch.microcode_version = data;
break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.arch_capabilities = data;
+ break;
case MSR_EFER:
return set_efer(vcpu, data);
case MSR_K7_HWCR:
@@ -2747,6 +2753,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_UCODE_REV:
msr_info->data = vcpu->arch.microcode_version;
break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ return 1;
+ msr_info->data = vcpu->arch.arch_capabilities;
+ break;
case MSR_IA32_TSC:
msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
break;
@@ -3879,7 +3891,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
if (!lapic_in_kernel(vcpu))
goto out;
- u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
+ GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!u.lapic)
@@ -4066,7 +4079,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_GET_XSAVE: {
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!u.xsave)
break;
@@ -4090,7 +4103,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_GET_XCRS: {
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!u.xcrs)
break;
@@ -6522,14 +6535,27 @@ int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
}
EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
+static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
+ return 1;
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
unsigned short port)
{
unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
size, port, &val, 1);
- /* do not return to emulator after return from userspace */
- vcpu->arch.pio.count = 0;
+
+ if (!ret) {
+ vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io = complete_fast_pio_out;
+ }
return ret;
}
@@ -6540,6 +6566,11 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
/* We should only ever be called with arch.pio.count equal to 1 */
BUG_ON(vcpu->arch.pio.count != 1);
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
/* For size less than 4 we merge, else we zero extend */
val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX)
: 0;
@@ -6552,7 +6583,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
vcpu->arch.pio.port, &val, 1);
kvm_register_write(vcpu, VCPU_REGS_RAX, val);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
@@ -6571,6 +6602,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
return ret;
}
+ vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_in;
return 0;
@@ -6578,16 +6610,13 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
{
- int ret = kvm_skip_emulated_instruction(vcpu);
+ int ret;
- /*
- * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
- * KVM_EXIT_DEBUG here.
- */
if (in)
- return kvm_fast_pio_in(vcpu, size, port) && ret;
+ ret = kvm_fast_pio_in(vcpu, size, port);
else
- return kvm_fast_pio_out(vcpu, size, port) && ret;
+ ret = kvm_fast_pio_out(vcpu, size, port);
+ return ret && kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_fast_pio);
@@ -7055,6 +7084,13 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
{
+ if (!lapic_in_kernel(vcpu)) {
+ WARN_ON_ONCE(vcpu->arch.apicv_active);
+ return;
+ }
+ if (!vcpu->arch.apicv_active)
+ return;
+
vcpu->arch.apicv_active = false;
kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
}
@@ -8725,6 +8761,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
+ vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu);
@@ -9005,7 +9042,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
struct page *page;
int r;
- vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
vcpu->arch.emulate_ctxt.ops = &emulate_ops;
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -9026,6 +9062,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
goto fail_free_pio_data;
if (irqchip_in_kernel(vcpu->kvm)) {
+ vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
r = kvm_create_lapic(vcpu);
if (r < 0)
goto fail_mmu_destroy;
@@ -9033,14 +9070,15 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
static_key_slow_inc(&kvm_no_apic_vcpu);
vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!vcpu->arch.mce_banks) {
r = -ENOMEM;
goto fail_free_lapic;
}
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
- if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
+ GFP_KERNEL_ACCOUNT)) {
r = -ENOMEM;
goto fail_free_mce_banks;
}
@@ -9104,7 +9142,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
- INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
@@ -9299,13 +9336,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
slot->arch.rmap[i] =
kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!slot->arch.rmap[i])
goto out_free;
if (i == 0)
continue;
- linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL);
+ linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
if (!linfo)
goto out_free;
@@ -9348,13 +9385,13 @@ out_free:
return -ENOMEM;
}
-void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
/*
* memslots->generation has been incremented.
* mmio generation may have reached its maximum value.
*/
- kvm_mmu_invalidate_mmio_sptes(kvm, slots);
+ kvm_mmu_invalidate_mmio_sptes(kvm, gen);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -9421,13 +9458,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- int nr_mmu_pages = 0;
-
if (!kvm->arch.n_requested_mmu_pages)
- nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
-
- if (nr_mmu_pages)
- kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+ kvm_mmu_change_mmu_pages(kvm,
+ kvm_mmu_calculate_default_mmu_pages(kvm));
/*
* Dirty logging tracks sptes in 4k granularity, meaning that large
@@ -9462,7 +9495,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
- kvm_mmu_invalidate_zap_all_pages(kvm);
+ kvm_mmu_zap_all(kvm);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 224cd0a47568..28406aa1136d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -181,6 +181,11 @@ static inline bool emul_is_noncanonical_address(u64 la,
static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
gva_t gva, gfn_t gfn, unsigned access)
{
+ u64 gen = kvm_memslots(vcpu->kvm)->generation;
+
+ if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
+ return;
+
/*
* If this is a shadow nested page table, the "GVA" is
* actually a nGPA.
@@ -188,7 +193,7 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK;
vcpu->arch.access = access;
vcpu->arch.mmio_gfn = gfn;
- vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation;
+ vcpu->arch.mmio_gen = gen;
}
static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 9baca3e054be..e7925d668b68 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -94,7 +94,7 @@ static unsigned do_csum(const unsigned char *buff, unsigned len)
: "m" (*(unsigned long *)buff),
"r" (zero), "0" (result));
--count;
- buff += 8;
+ buff += 8;
}
result = add32_with_carry(result>>32,
result&0xffffffff);
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 462fde83b515..8dc0fc0b1382 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -24,14 +24,16 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES];
static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
-static __init void *early_alloc(size_t size, int nid, bool panic)
+static __init void *early_alloc(size_t size, int nid, bool should_panic)
{
- if (panic)
- return memblock_alloc_try_nid(size, size,
- __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid);
- else
- return memblock_alloc_try_nid_nopanic(size, size,
+ void *ptr = memblock_alloc_try_nid(size, size,
__pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+
+ if (!ptr && should_panic)
+ panic("%pS: Failed to allocate page, nid=%d from=%lx\n",
+ (void *)_RET_IP_, nid, __pa(MAX_DMA_ADDRESS));
+
+ return ptr;
}
static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index db3165714521..dc726e07d8ba 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -230,7 +230,7 @@ bool mmap_address_hint_valid(unsigned long addr, unsigned long len)
/* Can we access it for direct reading/writing? Must be RAM: */
int valid_phys_addr_range(phys_addr_t addr, size_t count)
{
- return addr + count <= __pa(high_memory);
+ return addr + count - 1 <= __pa(high_memory - 1);
}
/* Can we access it through mmap? Must be a valid physical address: */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 12c1b7a83ed7..dfb6c4df639a 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -195,15 +195,11 @@ static void __init alloc_node_data(int nid)
* Allocate node data. Try node-local memory and then any node.
* Never allocate in DMA zone.
*/
- nd_pa = memblock_phys_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+ nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
if (!nd_pa) {
- nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
- MEMBLOCK_ALLOC_ACCESSIBLE);
- if (!nd_pa) {
- pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
- nd_size, nid);
- return;
- }
+ pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
+ nd_size, nid);
+ return;
}
nd = __va(nd_pa);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 14e6119838a6..4c570612e24e 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -738,7 +738,7 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
{
unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
pgprot_t old_prot, new_prot, req_prot, chk_prot;
- pte_t new_pte, old_pte, *tmp;
+ pte_t new_pte, *tmp;
enum pg_level level;
/*
@@ -781,7 +781,7 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
* Convert protection attributes to 4k-format, as cpa->mask* are set
* up accordingly.
*/
- old_pte = *kpte;
+
/* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */
req_prot = pgprot_large_2_4k(old_prot);
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 4fee5c3003ed..139b28a01ce4 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -77,7 +77,7 @@ static void __init pti_print_if_secure(const char *reason)
pr_info("%s\n", reason);
}
-enum pti_mode {
+static enum pti_mode {
PTI_AUTO = 0,
PTI_FORCE_OFF,
PTI_FORCE_ON
@@ -602,7 +602,7 @@ static void pti_clone_kernel_text(void)
set_memory_global(start, (end_global - start) >> PAGE_SHIFT);
}
-void pti_set_kernel_image_nonglobal(void)
+static void pti_set_kernel_image_nonglobal(void)
{
/*
* The identity map is created with PMDs, regardless of the
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 30a5111ae5fd..527e69b12002 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -635,6 +635,22 @@ static void quirk_no_aersid(struct pci_dev *pdev)
DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, PCI_ANY_ID,
PCI_CLASS_BRIDGE_PCI, 8, quirk_no_aersid);
+static void quirk_intel_th_dnv(struct pci_dev *dev)
+{
+ struct resource *r = &dev->resource[4];
+
+ /*
+ * Denverton reports 2k of RTIT_BAR (intel_th resource 4), which
+ * appears to be 4 MB in reality.
+ */
+ if (r->end == r->start + 0x7ff) {
+ r->start = 0;
+ r->end = 0x3fffff;
+ r->flags |= IORESOURCE_UNSET;
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x19e1, quirk_intel_th_dnv);
+
#ifdef CONFIG_PHYS_ADDR_T_64BIT
#define AMD_141b_MMIO_BASE(x) (0x80 + (x) * 0x8)
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 458a0e2bcc57..a25a9fd987a9 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -449,7 +449,7 @@ void __init efi_free_boot_services(void)
*/
rm_size = real_mode_size_needed();
if (rm_size && (start + rm_size) < (1<<20) && size >= rm_size) {
- set_real_mode_mem(start, rm_size);
+ set_real_mode_mem(start);
start += rm_size;
size -= rm_size;
}
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
index b4ab779f1d47..ac9e7bf49b66 100644
--- a/arch/x86/platform/olpc/olpc_dt.c
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -141,6 +141,9 @@ void * __init prom_early_alloc(unsigned long size)
* wasted bootmem) and hand off chunks of it to callers.
*/
res = memblock_alloc(chunk_size, SMP_CACHE_BYTES);
+ if (!res)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ chunk_size);
BUG_ON(!res);
prom_early_allocated += chunk_size;
memset(res, 0, chunk_size);
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index d10105825d57..7dce39c8c034 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -15,15 +15,6 @@ u32 *trampoline_cr4_features;
/* Hold the pgd entry used on booting additional CPUs */
pgd_t trampoline_pgd_entry;
-void __init set_real_mode_mem(phys_addr_t mem, size_t size)
-{
- void *base = __va(mem);
-
- real_mode_header = (struct real_mode_header *) base;
- printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
- base, (unsigned long long)mem, size);
-}
-
void __init reserve_real_mode(void)
{
phys_addr_t mem;
@@ -42,7 +33,7 @@ void __init reserve_real_mode(void)
}
memblock_reserve(mem, size);
- set_real_mode_mem(mem, size);
+ set_real_mode_mem(mem);
}
static void __init setup_real_mode(void)
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 96cb20de08af..f60501a384f9 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -37,8 +37,7 @@ REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y))
sed-pasyms := -n -r -e 's/^([0-9a-fA-F]+) [ABCDGRSTVW] (.+)$$/pa_\2 = \2;/p'
quiet_cmd_pasyms = PASYMS $@
- cmd_pasyms = $(NM) $(filter-out FORCE,$^) | \
- sed $(sed-pasyms) | sort | uniq > $@
+ cmd_pasyms = $(NM) $(real-prereqs) | sed $(sed-pasyms) | sort | uniq > $@
targets += pasyms.h
$(obj)/pasyms.h: $(REALMODE_OBJS) FORCE
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 856a85814f00..a21e1734fc1f 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2114,10 +2114,10 @@ void __init xen_relocate_p2m(void)
pt = early_memremap(pt_phys, PAGE_SIZE);
clear_page(pt);
for (idx_pte = 0;
- idx_pte < min(n_pte, PTRS_PER_PTE);
- idx_pte++) {
- set_pte(pt + idx_pte,
- pfn_pte(p2m_pfn, PAGE_KERNEL));
+ idx_pte < min(n_pte, PTRS_PER_PTE);
+ idx_pte++) {
+ pt[idx_pte] = pfn_pte(p2m_pfn,
+ PAGE_KERNEL);
p2m_pfn++;
}
n_pte -= PTRS_PER_PTE;
@@ -2125,8 +2125,7 @@ void __init xen_relocate_p2m(void)
make_lowmem_page_readonly(__va(pt_phys));
pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
PFN_DOWN(pt_phys));
- set_pmd(pmd + idx_pt,
- __pmd(_PAGE_TABLE | pt_phys));
+ pmd[idx_pt] = __pmd(_PAGE_TABLE | pt_phys);
pt_phys += PAGE_SIZE;
}
n_pt -= PTRS_PER_PMD;
@@ -2134,7 +2133,7 @@ void __init xen_relocate_p2m(void)
make_lowmem_page_readonly(__va(pmd_phys));
pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
PFN_DOWN(pmd_phys));
- set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
+ pud[idx_pmd] = __pud(_PAGE_TABLE | pmd_phys);
pmd_phys += PAGE_SIZE;
}
n_pmd -= PTRS_PER_PUD;
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 055e37e43541..95ce9b5be411 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -181,8 +181,15 @@ static void p2m_init_identity(unsigned long *p2m, unsigned long pfn)
static void * __ref alloc_p2m_page(void)
{
- if (unlikely(!slab_is_available()))
- return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+ if (unlikely(!slab_is_available())) {
+ void *ptr = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+
+ if (!ptr)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+ __func__, PAGE_SIZE, PAGE_SIZE);
+
+ return ptr;
+ }
return (void *)__get_free_page(GFP_KERNEL);
}
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index d5f303c0e656..548d1e0a5ba1 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -12,6 +12,7 @@
#include <linux/memblock.h>
#include <linux/cpuidle.h>
#include <linux/cpufreq.h>
+#include <linux/memory_hotplug.h>
#include <asm/elf.h>
#include <asm/vdso.h>
@@ -589,6 +590,14 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start,
if (type == E820_TYPE_RAM) {
start = PAGE_ALIGN(start);
end &= ~((phys_addr_t)PAGE_SIZE - 1);
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+ * Don't allow adding memory not in E820 map while booting the
+ * system. Once the balloon driver is up it will remove that
+ * restriction again.
+ */
+ max_mem_size = end;
+#endif
}
e820__range_add(start, end - start, type);
@@ -748,6 +757,10 @@ char * __init xen_memory_setup(void)
memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);
+#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_XEN_BALLOON)
+ xen_saved_max_mem_size = max_mem_size;
+#endif
+
op = xen_initial_domain() ?
XENMEM_machine_memory_map :
XENMEM_memory_map;
OpenPOWER on IntegriCloud