summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/Kconfig10
-rw-r--r--arch/x86/kvm/Makefile4
-rw-r--r--arch/x86/kvm/cpuid.c166
-rw-r--r--arch/x86/kvm/cpuid.h45
-rw-r--r--arch/x86/kvm/emulate.c166
-rw-r--r--arch/x86/kvm/hyperv.c43
-rw-r--r--arch/x86/kvm/i8254.c12
-rw-r--r--arch/x86/kvm/i8259.c6
-rw-r--r--arch/x86/kvm/ioapic.c218
-rw-r--r--arch/x86/kvm/ioapic.h6
-rw-r--r--arch/x86/kvm/irq.h3
-rw-r--r--arch/x86/kvm/irq_comm.c18
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h51
-rw-r--r--arch/x86/kvm/lapic.c240
-rw-r--r--arch/x86/kvm/lapic.h19
-rw-r--r--arch/x86/kvm/mmu.h19
-rw-r--r--arch/x86/kvm/mmu/mmu.c (renamed from arch/x86/kvm/mmu.c)1204
-rw-r--r--arch/x86/kvm/mmu/page_track.c (renamed from arch/x86/kvm/page_track.c)0
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h (renamed from arch/x86/kvm/paging_tmpl.h)95
-rw-r--r--arch/x86/kvm/mmutrace.h54
-rw-r--r--arch/x86/kvm/mtrr.c8
-rw-r--r--arch/x86/kvm/pmu.c128
-rw-r--r--arch/x86/kvm/pmu.h47
-rw-r--r--arch/x86/kvm/pmu_amd.c24
-rw-r--r--arch/x86/kvm/svm.c703
-rw-r--r--arch/x86/kvm/trace.h93
-rw-r--r--arch/x86/kvm/vmx/capabilities.h11
-rw-r--r--arch/x86/kvm/vmx/evmcs.c90
-rw-r--r--arch/x86/kvm/vmx/evmcs.h5
-rw-r--r--arch/x86/kvm/vmx/nested.c894
-rw-r--r--arch/x86/kvm/vmx/nested.h22
-rw-r--r--arch/x86/kvm/vmx/ops.h93
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c74
-rw-r--r--arch/x86/kvm/vmx/vmcs_shadow_fields.h4
-rw-r--r--arch/x86/kvm/vmx/vmenter.S16
-rw-r--r--arch/x86/kvm/vmx/vmx.c1102
-rw-r--r--arch/x86/kvm/vmx/vmx.h44
-rw-r--r--arch/x86/kvm/x86.c1458
-rw-r--r--arch/x86/kvm/x86.h42
39 files changed, 4610 insertions, 2627 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 840e12583b85..991019d5eee1 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -60,13 +60,11 @@ config KVM
If unsure, say N.
config KVM_INTEL
- tristate "KVM for Intel processors support"
- depends on KVM
- # for perf_guest_get_msrs():
- depends on CPU_SUP_INTEL
+ tristate "KVM for Intel (and compatible) processors support"
+ depends on KVM && IA32_FEAT_CTL
---help---
- Provides support for KVM on Intel processors equipped with the VT
- extensions.
+ Provides support for KVM on processors equipped with Intel's VT
+ extensions, a.k.a. Virtual Machine Extensions (VMX).
To compile this as a module, choose M here: the module
will be called kvm-intel.
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 31ecf7a76d5a..b19ef421084d 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -8,9 +8,9 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
-kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
+kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
- hyperv.o page_track.o debugfs.o
+ hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o
kvm-amd-y += svm.o pmu_amd.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 22c2720cd948..b1c469446b07 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -62,7 +62,7 @@ u64 kvm_supported_xcr0(void)
return xcr0;
}
-#define F(x) bit(X86_FEATURE_##x)
+#define F feature_bit
int kvm_update_cpuid(struct kvm_vcpu *vcpu)
{
@@ -281,8 +281,9 @@ out:
return r;
}
-static void cpuid_mask(u32 *word, int wordnum)
+static __always_inline void cpuid_mask(u32 *word, int wordnum)
{
+ reverse_cpuid_check(wordnum);
*word &= boot_cpu_data.x86_capability[wordnum];
}
@@ -304,7 +305,13 @@ static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function,
case 7:
case 0xb:
case 0xd:
+ case 0xf:
+ case 0x10:
+ case 0x12:
case 0x14:
+ case 0x17:
+ case 0x18:
+ case 0x1f:
case 0x8000001d:
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
break;
@@ -346,6 +353,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
unsigned f_la57;
+ unsigned f_pku = kvm_x86_ops->pku_supported() ? F(PKU) : 0;
/* cpuid 7.0.ebx */
const u32 kvm_cpuid_7_0_ebx_x86_features =
@@ -357,10 +365,10 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
/* cpuid 7.0.ecx*/
const u32 kvm_cpuid_7_0_ecx_x86_features =
- F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
+ F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
- F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
+ F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/;
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
@@ -386,12 +394,20 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
/* Set LA57 based on hardware capability. */
entry->ecx |= f_la57;
entry->ecx |= f_umip;
+ entry->ecx |= f_pku;
/* PKU is not yet implemented for shadow paging. */
if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
entry->ecx &= ~F(PKU);
entry->edx &= kvm_cpuid_7_0_edx_x86_features;
cpuid_mask(&entry->edx, CPUID_7_EDX);
+ if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS))
+ entry->edx |= F(SPEC_CTRL);
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ entry->edx |= F(INTEL_STIBP);
+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ entry->edx |= F(SPEC_CTRL_SSBD);
/*
* We emulate ARCH_CAPABILITIES in software even
* if the host doesn't support it.
@@ -473,6 +489,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 0x80000008.ebx */
const u32 kvm_cpuid_8000_0008_ebx_x86_features =
+ F(CLZERO) | F(XSAVEERPTR) |
F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON);
@@ -491,7 +508,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
r = -E2BIG;
- if (*nent >= maxnent)
+ if (WARN_ON(*nent >= maxnent))
goto out;
do_host_cpuid(entry, function, 0);
@@ -606,16 +623,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
*/
case 0x1f:
case 0xb: {
- int i, level_type;
+ int i;
- /* read more entries until level_type is zero */
- for (i = 1; ; ++i) {
+ /*
+ * We filled in entry[0] for CPUID(EAX=<function>,
+ * ECX=00H) above. If its level type (ECX[15:8]) is
+ * zero, then the leaf is unimplemented, and we're
+ * done. Otherwise, continue to populate entries
+ * until the level type (ECX[15:8]) of the previously
+ * added entry is zero.
+ */
+ for (i = 1; entry[i - 1].ecx & 0xff00; ++i) {
if (*nent >= maxnent)
goto out;
- level_type = entry[i - 1].ecx & 0xff00;
- if (!level_type)
- break;
do_host_cpuid(&entry[i], function, i);
++*nent;
}
@@ -729,18 +750,24 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
g_phys_as = phys_as;
entry->eax = g_phys_as | (virt_as << 8);
entry->edx = 0;
+ entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
+ cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
/*
- * IBRS, IBPB and VIRT_SSBD aren't necessarily present in
- * hardware cpuid
+ * AMD has separate bits for each SPEC_CTRL bit.
+ * arch/x86/kernel/cpu/bugs.c is kind enough to
+ * record that in cpufeatures so use them.
*/
- if (boot_cpu_has(X86_FEATURE_AMD_IBPB))
+ if (boot_cpu_has(X86_FEATURE_IBPB))
entry->ebx |= F(AMD_IBPB);
- if (boot_cpu_has(X86_FEATURE_AMD_IBRS))
+ if (boot_cpu_has(X86_FEATURE_IBRS))
entry->ebx |= F(AMD_IBRS);
- if (boot_cpu_has(X86_FEATURE_VIRT_SSBD))
- entry->ebx |= F(VIRT_SSBD);
- entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
- cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ entry->ebx |= F(AMD_STIBP);
+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ entry->ebx |= F(AMD_SSBD);
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ entry->ebx |= F(AMD_SSB_NO);
/*
* The preference is to use SPEC CTRL MSR instead of the
* VIRT_SPEC MSR.
@@ -756,6 +783,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
case 0x8000001a:
case 0x8000001e:
break;
+ /* Support memory encryption cpuid if host supports it */
+ case 0x8000001F:
+ if (!boot_cpu_has(X86_FEATURE_SEV))
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
/*Add support for Centaur's CPUID instruction*/
case 0xC0000000:
/*Just support up to 0xC0000004 now*/
@@ -788,14 +820,15 @@ out:
static int do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 func,
int *nent, int maxnent, unsigned int type)
{
+ if (*nent >= maxnent)
+ return -E2BIG;
+
if (type == KVM_GET_EMULATED_CPUID)
return __do_cpuid_func_emulated(entry, func, nent, maxnent);
return __do_cpuid_func(entry, func, nent, maxnent);
}
-#undef F
-
struct kvm_cpuid_param {
u32 func;
bool (*qualifier)(const struct kvm_cpuid_param *param);
@@ -952,53 +985,72 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
/*
- * If no match is found, check whether we exceed the vCPU's limit
- * and return the content of the highest valid _standard_ leaf instead.
- * This is to satisfy the CPUID specification.
+ * If the basic or extended CPUID leaf requested is higher than the
+ * maximum supported basic or extended leaf, respectively, then it is
+ * out of range.
*/
-static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
- u32 function, u32 index)
+static bool cpuid_function_in_range(struct kvm_vcpu *vcpu, u32 function)
{
- struct kvm_cpuid_entry2 *maxlevel;
-
- maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
- if (!maxlevel || maxlevel->eax >= function)
- return NULL;
- if (function & 0x80000000) {
- maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
- if (!maxlevel)
- return NULL;
- }
- return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
+ struct kvm_cpuid_entry2 *max;
+
+ max = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
+ return max && function <= max->eax;
}
bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
u32 *ecx, u32 *edx, bool check_limit)
{
u32 function = *eax, index = *ecx;
- struct kvm_cpuid_entry2 *best;
- bool entry_found = true;
-
- best = kvm_find_cpuid_entry(vcpu, function, index);
-
- if (!best) {
- entry_found = false;
- if (!check_limit)
- goto out;
+ struct kvm_cpuid_entry2 *entry;
+ struct kvm_cpuid_entry2 *max;
+ bool found;
- best = check_cpuid_limit(vcpu, function, index);
+ entry = kvm_find_cpuid_entry(vcpu, function, index);
+ found = entry;
+ /*
+ * Intel CPUID semantics treats any query for an out-of-range
+ * leaf as if the highest basic leaf (i.e. CPUID.0H:EAX) were
+ * requested. AMD CPUID semantics returns all zeroes for any
+ * undefined leaf, whether or not the leaf is in range.
+ */
+ if (!entry && check_limit && !guest_cpuid_is_amd(vcpu) &&
+ !cpuid_function_in_range(vcpu, function)) {
+ max = kvm_find_cpuid_entry(vcpu, 0, 0);
+ if (max) {
+ function = max->eax;
+ entry = kvm_find_cpuid_entry(vcpu, function, index);
+ }
}
-
-out:
- if (best) {
- *eax = best->eax;
- *ebx = best->ebx;
- *ecx = best->ecx;
- *edx = best->edx;
- } else
+ if (entry) {
+ *eax = entry->eax;
+ *ebx = entry->ebx;
+ *ecx = entry->ecx;
+ *edx = entry->edx;
+ if (function == 7 && index == 0) {
+ u64 data;
+ if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) &&
+ (data & TSX_CTRL_CPUID_CLEAR))
+ *ebx &= ~(F(RTM) | F(HLE));
+ }
+ } else {
*eax = *ebx = *ecx = *edx = 0;
- trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found);
- return entry_found;
+ /*
+ * When leaf 0BH or 1FH is defined, CL is pass-through
+ * and EDX is always the x2APIC ID, even for undefined
+ * subleaves. Index 1 will exist iff the leaf is
+ * implemented, so we pass through CL iff leaf 1
+ * exists. EDX can be copied from any existing index.
+ */
+ if (function == 0xb || function == 0x1f) {
+ entry = kvm_find_cpuid_entry(vcpu, function, 1);
+ if (entry) {
+ *ecx = index & 0xff;
+ *edx = entry->edx;
+ }
+ }
+ }
+ trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, found);
+ return found;
}
EXPORT_SYMBOL_GPL(kvm_cpuid);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index d78a61408243..7366c618aa04 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -53,15 +53,46 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_7_ECX] = { 7, 0, CPUID_ECX},
[CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
[CPUID_7_EDX] = { 7, 0, CPUID_EDX},
+ [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX},
};
-static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
+/*
+ * Reverse CPUID and its derivatives can only be used for hardware-defined
+ * feature words, i.e. words whose bits directly correspond to a CPUID leaf.
+ * Retrieving a feature bit or masking guest CPUID from a Linux-defined word
+ * is nonsensical as the bit number/mask is an arbitrary software-defined value
+ * and can't be used by KVM to query/control guest capabilities. And obviously
+ * the leaf being queried must have an entry in the lookup table.
+ */
+static __always_inline void reverse_cpuid_check(unsigned x86_leaf)
{
- unsigned x86_leaf = x86_feature / 32;
-
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_1);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_2);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_3);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_4);
BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
+}
+
+/*
+ * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain
+ * the hardware defined bit number (stored in bits 4:0) and a software defined
+ * "word" (stored in bits 31:5). The word is used to index into arrays of
+ * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has().
+ */
+static __always_inline u32 __feature_bit(int x86_feature)
+{
+ reverse_cpuid_check(x86_feature / 32);
+ return 1 << (x86_feature & 31);
+}
+#define feature_bit(name) __feature_bit(X86_FEATURE_##name)
+
+static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
+{
+ unsigned x86_leaf = x86_feature / 32;
+
+ reverse_cpuid_check(x86_leaf);
return reverse_cpuid[x86_leaf];
}
@@ -93,15 +124,11 @@ static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_
{
int *reg;
- if (x86_feature == X86_FEATURE_XSAVE &&
- !static_cpu_has(X86_FEATURE_XSAVE))
- return false;
-
reg = guest_cpuid_get_register(vcpu, x86_feature);
if (!reg)
return false;
- return *reg & bit(x86_feature);
+ return *reg & __feature_bit(x86_feature);
}
static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature)
@@ -110,7 +137,7 @@ static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x8
reg = guest_cpuid_get_register(vcpu, x86_feature);
if (reg)
- *reg &= ~bit(x86_feature);
+ *reg &= ~__feature_bit(x86_feature);
}
static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 718f7d9afedc..ddbc61984227 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -22,6 +22,7 @@
#include "kvm_cache_regs.h"
#include <asm/kvm_emulate.h>
#include <linux/stringify.h>
+#include <asm/fpu/api.h>
#include <asm/debugreg.h>
#include <asm/nospec-branch.h>
@@ -310,7 +311,9 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
#define ON64(x)
#endif
-static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
+typedef void (*fastop_t)(struct fastop *);
+
+static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
#define __FOP_FUNC(name) \
".align " __stringify(FASTOP_SIZE) " \n\t" \
@@ -1075,8 +1078,23 @@ static void fetch_register_operand(struct operand *op)
}
}
-static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
+static void emulator_get_fpu(void)
+{
+ fpregs_lock();
+
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
+}
+
+static void emulator_put_fpu(void)
+{
+ fpregs_unlock();
+}
+
+static void read_sse_reg(sse128_t *data, int reg)
{
+ emulator_get_fpu();
switch (reg) {
case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
@@ -1098,11 +1116,12 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
#endif
default: BUG();
}
+ emulator_put_fpu();
}
-static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
- int reg)
+static void write_sse_reg(sse128_t *data, int reg)
{
+ emulator_get_fpu();
switch (reg) {
case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
@@ -1124,10 +1143,12 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
#endif
default: BUG();
}
+ emulator_put_fpu();
}
-static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+static void read_mmx_reg(u64 *data, int reg)
{
+ emulator_get_fpu();
switch (reg) {
case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
@@ -1139,10 +1160,12 @@ static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
default: BUG();
}
+ emulator_put_fpu();
}
-static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+static void write_mmx_reg(u64 *data, int reg)
{
+ emulator_get_fpu();
switch (reg) {
case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
@@ -1154,6 +1177,7 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
default: BUG();
}
+ emulator_put_fpu();
}
static int em_fninit(struct x86_emulate_ctxt *ctxt)
@@ -1161,7 +1185,9 @@ static int em_fninit(struct x86_emulate_ctxt *ctxt)
if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
return emulate_nm(ctxt);
+ emulator_get_fpu();
asm volatile("fninit");
+ emulator_put_fpu();
return X86EMUL_CONTINUE;
}
@@ -1172,7 +1198,9 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
return emulate_nm(ctxt);
+ emulator_get_fpu();
asm volatile("fnstcw %0": "+m"(fcw));
+ emulator_put_fpu();
ctxt->dst.val = fcw;
@@ -1186,7 +1214,9 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
return emulate_nm(ctxt);
+ emulator_get_fpu();
asm volatile("fnstsw %0": "+m"(fsw));
+ emulator_put_fpu();
ctxt->dst.val = fsw;
@@ -1205,7 +1235,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
op->type = OP_XMM;
op->bytes = 16;
op->addr.xmm = reg;
- read_sse_reg(ctxt, &op->vec_val, reg);
+ read_sse_reg(&op->vec_val, reg);
return;
}
if (ctxt->d & Mmx) {
@@ -1256,7 +1286,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
op->type = OP_XMM;
op->bytes = 16;
op->addr.xmm = ctxt->modrm_rm;
- read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
+ read_sse_reg(&op->vec_val, ctxt->modrm_rm);
return rc;
}
if (ctxt->d & Mmx) {
@@ -1833,10 +1863,10 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
op->bytes * op->count);
break;
case OP_XMM:
- write_sse_reg(ctxt, &op->vec_val, op->addr.xmm);
+ write_sse_reg(&op->vec_val, op->addr.xmm);
break;
case OP_MM:
- write_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
+ write_mmx_reg(&op->mm_val, op->addr.mm);
break;
case OP_NONE:
/* no writeback */
@@ -2348,12 +2378,7 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
{
#ifdef CONFIG_X86_64
- u32 eax, ebx, ecx, edx;
-
- eax = 0x80000001;
- ecx = 0;
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
- return edx & bit(X86_FEATURE_LM);
+ return ctxt->ops->guest_has_long_mode(ctxt);
#else
return false;
#endif
@@ -2770,11 +2795,10 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
return emulate_ud(ctxt);
ops->get_msr(ctxt, MSR_EFER, &efer);
- setup_syscalls_segments(ctxt, &cs, &ss);
-
if (!(efer & EFER_SCE))
return emulate_ud(ctxt);
+ setup_syscalls_segments(ctxt, &cs, &ss);
ops->get_msr(ctxt, MSR_STAR, &msr_data);
msr_data >>= 32;
cs_sel = (u16)(msr_data & 0xfffc);
@@ -2838,12 +2862,11 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
if (ctxt->mode == X86EMUL_MODE_PROT64)
return X86EMUL_UNHANDLEABLE;
- setup_syscalls_segments(ctxt, &cs, &ss);
-
ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
if ((msr_data & 0xfffc) == 0x0)
return emulate_gp(ctxt, 0);
+ setup_syscalls_segments(ctxt, &cs, &ss);
ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK;
ss_sel = cs_sel + 8;
@@ -3620,18 +3643,11 @@ static int em_mov(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-#define FFL(x) bit(X86_FEATURE_##x)
-
static int em_movbe(struct x86_emulate_ctxt *ctxt)
{
- u32 ebx, ecx, edx, eax = 1;
u16 tmp;
- /*
- * Check MOVBE is set in the guest-visible CPUID leaf.
- */
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
- if (!(ecx & FFL(MOVBE)))
+ if (!ctxt->ops->guest_has_movbe(ctxt))
return emulate_ud(ctxt);
switch (ctxt->op_bytes) {
@@ -4029,10 +4045,7 @@ static int em_movsxd(struct x86_emulate_ctxt *ctxt)
static int check_fxsr(struct x86_emulate_ctxt *ctxt)
{
- u32 eax = 1, ebx, ecx = 0, edx;
-
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
- if (!(edx & FFL(FXSR)))
+ if (!ctxt->ops->guest_has_fxsr(ctxt))
return emulate_ud(ctxt);
if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
@@ -4094,8 +4107,12 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
+ emulator_get_fpu();
+
rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+ emulator_put_fpu();
+
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -4138,6 +4155,8 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
+ emulator_get_fpu();
+
if (size < __fxstate_size(16)) {
rc = fxregs_fixup(&fx_state, size);
if (rc != X86EMUL_CONTINUE)
@@ -4153,9 +4172,25 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
out:
+ emulator_put_fpu();
+
return rc;
}
+static int em_xsetbv(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ecx, edx;
+
+ eax = reg_read(ctxt, VCPU_REGS_RAX);
+ edx = reg_read(ctxt, VCPU_REGS_RDX);
+ ecx = reg_read(ctxt, VCPU_REGS_RCX);
+
+ if (ctxt->ops->set_xcr(ctxt, ecx, ((u64)edx << 32) | eax))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
static bool valid_cr(int nr)
{
switch (nr) {
@@ -4409,6 +4444,12 @@ static const struct opcode group7_rm1[] = {
N, N, N, N, N, N,
};
+static const struct opcode group7_rm2[] = {
+ N,
+ II(ImplicitOps | Priv, em_xsetbv, xsetbv),
+ N, N, N, N, N, N,
+};
+
static const struct opcode group7_rm3[] = {
DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
II(SrcNone | Prot | EmulateOnUD, em_hypercall, vmmcall),
@@ -4498,7 +4539,8 @@ static const struct group_dual group7 = { {
}, {
EXT(0, group7_rm0),
EXT(0, group7_rm1),
- N, EXT(0, group7_rm3),
+ EXT(0, group7_rm2),
+ EXT(0, group7_rm3),
II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
EXT(0, group7_rm7),
@@ -5144,7 +5186,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
else {
rc = __do_insn_fetch_bytes(ctxt, 1);
if (rc != X86EMUL_CONTINUE)
- return rc;
+ goto done;
}
switch (mode) {
@@ -5191,16 +5233,28 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
ctxt->ad_bytes = def_ad_bytes ^ 6;
break;
case 0x26: /* ES override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_ES;
+ break;
case 0x2e: /* CS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_CS;
+ break;
case 0x36: /* SS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_SS;
+ break;
case 0x3e: /* DS override */
has_seg_override = true;
- ctxt->seg_override = (ctxt->b >> 3) & 3;
+ ctxt->seg_override = VCPU_SREG_DS;
break;
case 0x64: /* FS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_FS;
+ break;
case 0x65: /* GS override */
has_seg_override = true;
- ctxt->seg_override = ctxt->b & 7;
+ ctxt->seg_override = VCPU_SREG_GS;
break;
case 0x40 ... 0x4f: /* REX */
if (mode != X86EMUL_MODE_PROT64)
@@ -5284,10 +5338,15 @@ done_prefixes:
}
break;
case Escape:
- if (ctxt->modrm > 0xbf)
- opcode = opcode.u.esc->high[ctxt->modrm - 0xc0];
- else
+ if (ctxt->modrm > 0xbf) {
+ size_t size = ARRAY_SIZE(opcode.u.esc->high);
+ u32 index = array_index_nospec(
+ ctxt->modrm - 0xc0, size);
+
+ opcode = opcode.u.esc->high[index];
+ } else {
opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
+ }
break;
case InstrDual:
if ((ctxt->modrm >> 6) == 3)
@@ -5395,6 +5454,8 @@ done_prefixes:
ctxt->memopp->addr.mem.ea + ctxt->_eip);
done:
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ ctxt->have_exception = true;
return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
}
@@ -5427,7 +5488,9 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
{
int rc;
+ emulator_get_fpu();
rc = asm_safe("fwait");
+ emulator_put_fpu();
if (unlikely(rc != X86EMUL_CONTINUE))
return emulate_exception(ctxt, MF_VECTOR, 0, false);
@@ -5435,14 +5498,13 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
- struct operand *op)
+static void fetch_possible_mmx_operand(struct operand *op)
{
if (op->type == OP_MM)
- read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
+ read_mmx_reg(&op->mm_val, op->addr.mm);
}
-static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
+static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
{
ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
@@ -5518,10 +5580,10 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
* Now that we know the fpu is exception safe, we can fetch
* operands from it.
*/
- fetch_possible_mmx_operand(ctxt, &ctxt->src);
- fetch_possible_mmx_operand(ctxt, &ctxt->src2);
+ fetch_possible_mmx_operand(&ctxt->src);
+ fetch_possible_mmx_operand(&ctxt->src2);
if (!(ctxt->d & Mov))
- fetch_possible_mmx_operand(ctxt, &ctxt->dst);
+ fetch_possible_mmx_operand(&ctxt->dst);
}
if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) {
@@ -5620,14 +5682,10 @@ special_insn:
ctxt->eflags &= ~X86_EFLAGS_RF;
if (ctxt->execute) {
- if (ctxt->d & Fastop) {
- void (*fop)(struct fastop *) = (void *)ctxt->execute;
- rc = fastop(ctxt, fop);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- goto writeback;
- }
- rc = ctxt->execute(ctxt);
+ if (ctxt->d & Fastop)
+ rc = fastop(ctxt, (fastop_t)ctxt->execute);
+ else
+ rc = ctxt->execute(ctxt);
if (rc != X86EMUL_CONTINUE)
goto done;
goto writeback;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index c10a8b10b203..a86fda7a1d03 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -23,6 +23,7 @@
#include "ioapic.h"
#include "hyperv.h"
+#include <linux/cpu.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/sched/cputime.h>
@@ -32,6 +33,7 @@
#include <trace/events/kvm.h>
#include "trace.h"
+#include "irq.h"
#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64)
@@ -645,7 +647,9 @@ static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer)
.vector = stimer->config.apic_vector
};
- return !kvm_apic_set_irq(vcpu, &irq, NULL);
+ if (lapic_in_kernel(vcpu))
+ return !kvm_apic_set_irq(vcpu, &irq, NULL);
+ return 0;
}
static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
@@ -772,9 +776,10 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
/*
* Hyper-V SynIC auto EOI SINT's are
- * not compatible with APICV, so deactivate APICV
+ * not compatible with APICV, so request
+ * to deactivate APICV permanently.
*/
- kvm_vcpu_deactivate_apicv(vcpu);
+ kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV);
synic->active = true;
synic->dont_zero_synic_pages = dont_zero_synic_pages;
return 0;
@@ -806,11 +811,12 @@ static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
u32 index, u64 *pdata)
{
struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
- if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
+ if (WARN_ON_ONCE(index >= size))
return -EINVAL;
- *pdata = hv->hv_crash_param[index];
+ *pdata = hv->hv_crash_param[array_index_nospec(index, size)];
return 0;
}
@@ -849,11 +855,12 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
u32 index, u64 data)
{
struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
- if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
+ if (WARN_ON_ONCE(index >= size))
return -EINVAL;
- hv->hv_crash_param[index] = data;
+ hv->hv_crash_param[array_index_nospec(index, size)] = data;
return 0;
}
@@ -1055,7 +1062,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
return 1;
break;
default:
- vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
+ vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
return 1;
}
@@ -1118,7 +1125,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
return 1;
/*
- * Clear apic_assist portion of f(struct hv_vp_assist_page
+ * Clear apic_assist portion of struct hv_vp_assist_page
* only, there can be valuable data in the rest which needs
* to be preserved e.g. on migration.
*/
@@ -1175,7 +1182,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
return 1;
break;
default:
- vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
+ vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
return 1;
}
@@ -1781,7 +1788,7 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args)
int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries)
{
- uint16_t evmcs_ver = kvm_x86_ops->nested_get_evmcs_version(vcpu);
+ uint16_t evmcs_ver = 0;
struct kvm_cpuid_entry2 cpuid_entries[] = {
{ .function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS },
{ .function = HYPERV_CPUID_INTERFACE },
@@ -1793,6 +1800,9 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
};
int i, nent = ARRAY_SIZE(cpuid_entries);
+ if (kvm_x86_ops->nested_get_evmcs_version)
+ evmcs_ver = kvm_x86_ops->nested_get_evmcs_version(vcpu);
+
/* Skip NESTED_FEATURES if eVMCS is not supported */
if (!evmcs_ver)
--nent;
@@ -1849,7 +1859,13 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
- ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
+
+ /*
+ * Direct Synthetic timers only make sense with in-kernel
+ * LAPIC
+ */
+ if (lapic_in_kernel(vcpu))
+ ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
break;
@@ -1861,7 +1877,8 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED;
if (evmcs_ver)
ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
-
+ if (!cpu_smt_possible())
+ ent->eax |= HV_X64_NO_NONARCH_CORESHARING;
/*
* Default number of spinlock retry attempts, matches
* HyperV 2016.
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 4a6dc54cc12b..b24c606ac04b 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -295,12 +295,24 @@ void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
if (atomic_read(&ps->reinject) == reinject)
return;
+ /*
+ * AMD SVM AVIC accelerates EOI write and does not trap.
+ * This cause in-kernel PIT re-inject mode to fail
+ * since it checks ps->irq_ack before kvm_set_irq()
+ * and relies on the ack notifier to timely queue
+ * the pt->worker work iterm and reinject the missed tick.
+ * So, deactivate APICv when PIT is in reinject mode.
+ */
if (reinject) {
+ kvm_request_apicv_update(kvm, false,
+ APICV_INHIBIT_REASON_PIT_REINJ);
/* The initial state is preserved while ps->reinject == 0. */
kvm_pit_reset_reinject(pit);
kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
} else {
+ kvm_request_apicv_update(kvm, true,
+ APICV_INHIBIT_REASON_PIT_REINJ);
kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 8b38bb4868a6..629a09ca9860 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -460,10 +460,14 @@ static int picdev_write(struct kvm_pic *s,
switch (addr) {
case 0x20:
case 0x21:
+ pic_lock(s);
+ pic_ioport_write(&s->pics[0], addr, data);
+ pic_unlock(s);
+ break;
case 0xa0:
case 0xa1:
pic_lock(s);
- pic_ioport_write(&s->pics[addr >> 7], addr, data);
+ pic_ioport_write(&s->pics[1], addr, data);
pic_unlock(s);
break;
case 0x4d0:
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index d859ae8890d0..7668fed1ce65 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -36,6 +36,7 @@
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/nospec.h>
#include <asm/processor.h>
#include <asm/page.h>
#include <asm/current.h>
@@ -48,6 +49,11 @@
static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
bool line_status);
+static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu,
+ struct kvm_ioapic *ioapic,
+ int trigger_mode,
+ int pin);
+
static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
unsigned long addr,
unsigned long length)
@@ -68,13 +74,14 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
default:
{
u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
- u64 redir_content;
+ u64 redir_content = ~0ULL;
+
+ if (redir_index < IOAPIC_NUM_PINS) {
+ u32 index = array_index_nospec(
+ redir_index, IOAPIC_NUM_PINS);
- if (redir_index < IOAPIC_NUM_PINS)
- redir_content =
- ioapic->redirtbl[redir_index].bits;
- else
- redir_content = ~0ULL;
+ redir_content = ioapic->redirtbl[index].bits;
+ }
result = (ioapic->ioregsel & 0x1) ?
(redir_content >> 32) & 0xffffffff :
@@ -108,8 +115,9 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
union kvm_ioapic_redirect_entry *e;
e = &ioapic->redirtbl[RTC_GSI];
- if (!kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id,
- e->fields.dest_mode))
+ if (!kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
+ e->fields.dest_id,
+ kvm_lapic_irq_dest_mode(!!e->fields.dest_mode)))
return;
new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
@@ -151,10 +159,16 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
__rtc_irq_eoi_tracking_restore_one(vcpu);
}
-static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu)
+static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu,
+ int vector)
{
- if (test_and_clear_bit(vcpu->vcpu_id,
- ioapic->rtc_status.dest_map.map)) {
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
+
+ /* RTC special handling */
+ if (test_bit(vcpu->vcpu_id, dest_map->map) &&
+ (vector == dest_map->vectors[vcpu->vcpu_id]) &&
+ (test_and_clear_bit(vcpu->vcpu_id,
+ ioapic->rtc_status.dest_map.map))) {
--ioapic->rtc_status.pending_eoi;
rtc_status_pending_eoi_check_valid(ioapic);
}
@@ -168,6 +182,31 @@ static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
return false;
}
+static void ioapic_lazy_update_eoi(struct kvm_ioapic *ioapic, int irq)
+{
+ int i;
+ struct kvm_vcpu *vcpu;
+ union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
+
+ kvm_for_each_vcpu(i, vcpu, ioapic->kvm) {
+ if (!kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
+ entry->fields.dest_id,
+ entry->fields.dest_mode) ||
+ kvm_apic_pending_eoi(vcpu, entry->fields.vector))
+ continue;
+
+ /*
+ * If no longer has pending EOI in LAPICs, update
+ * EOI for this vetor.
+ */
+ rtc_irq_eoi(ioapic, vcpu, entry->fields.vector);
+ kvm_ioapic_update_eoi_one(vcpu, ioapic,
+ entry->fields.trig_mode,
+ irq);
+ break;
+ }
+}
+
static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
int irq_level, bool line_status)
{
@@ -186,9 +225,18 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
}
/*
+ * AMD SVM AVIC accelerate EOI write and do not trap,
+ * in-kernel IOAPIC will not be able to receive the EOI.
+ * In this case, we do lazy update of the pending EOI when
+ * trying to set IOAPIC irq.
+ */
+ if (kvm_apicv_activated(ioapic->kvm))
+ ioapic_lazy_update_eoi(ioapic, irq);
+
+ /*
* Return 0 for coalesced interrupts; for edge-triggered interrupts,
* this only happens if a previous edge has not been delivered due
- * do masking. For level interrupts, the remote_irr field tells
+ * to masking. For level interrupts, the remote_irr field tells
* us if the interrupt is waiting for an EOI.
*
* RTC is special: it is edge-triggered, but userspace likes to know
@@ -250,8 +298,10 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
index == RTC_GSI) {
- if (kvm_apic_match_dest(vcpu, NULL, 0,
- e->fields.dest_id, e->fields.dest_mode) ||
+ u16 dm = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode);
+
+ if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
+ e->fields.dest_id, dm) ||
kvm_apic_pending_eoi(vcpu, e->fields.vector))
__set_bit(e->fields.vector,
ioapic_handled_vectors);
@@ -271,8 +321,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
{
unsigned index;
bool mask_before, mask_after;
- int old_remote_irr, old_delivery_status;
union kvm_ioapic_redirect_entry *e;
+ unsigned long vcpu_bitmap;
+ int old_remote_irr, old_delivery_status, old_dest_id, old_dest_mode;
switch (ioapic->ioregsel) {
case IOAPIC_REG_VERSION:
@@ -291,11 +342,14 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
if (index >= IOAPIC_NUM_PINS)
return;
+ index = array_index_nospec(index, IOAPIC_NUM_PINS);
e = &ioapic->redirtbl[index];
mask_before = e->fields.mask;
/* Preserve read-only fields */
old_remote_irr = e->fields.remote_irr;
old_delivery_status = e->fields.delivery_status;
+ old_dest_id = e->fields.dest_id;
+ old_dest_mode = e->fields.dest_mode;
if (ioapic->ioregsel & 1) {
e->bits &= 0xffffffff;
e->bits |= (u64) val << 32;
@@ -321,7 +375,37 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
&& ioapic->irr & (1 << index))
ioapic_service(ioapic, index, false);
- kvm_make_scan_ioapic_request(ioapic->kvm);
+ if (e->fields.delivery_mode == APIC_DM_FIXED) {
+ struct kvm_lapic_irq irq;
+
+ irq.shorthand = APIC_DEST_NOSHORT;
+ irq.vector = e->fields.vector;
+ irq.delivery_mode = e->fields.delivery_mode << 8;
+ irq.dest_id = e->fields.dest_id;
+ irq.dest_mode =
+ kvm_lapic_irq_dest_mode(!!e->fields.dest_mode);
+ bitmap_zero(&vcpu_bitmap, 16);
+ kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
+ &vcpu_bitmap);
+ if (old_dest_mode != e->fields.dest_mode ||
+ old_dest_id != e->fields.dest_id) {
+ /*
+ * Update vcpu_bitmap with vcpus specified in
+ * the previous request as well. This is done to
+ * keep ioapic_handled_vectors synchronized.
+ */
+ irq.dest_id = old_dest_id;
+ irq.dest_mode =
+ kvm_lapic_irq_dest_mode(
+ !!e->fields.dest_mode);
+ kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
+ &vcpu_bitmap);
+ }
+ kvm_make_scan_ioapic_request_mask(ioapic->kvm,
+ &vcpu_bitmap);
+ } else {
+ kvm_make_scan_ioapic_request(ioapic->kvm);
+ }
break;
}
}
@@ -339,11 +423,11 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
irqe.dest_id = entry->fields.dest_id;
irqe.vector = entry->fields.vector;
- irqe.dest_mode = entry->fields.dest_mode;
+ irqe.dest_mode = kvm_lapic_irq_dest_mode(!!entry->fields.dest_mode);
irqe.trig_mode = entry->fields.trig_mode;
irqe.delivery_mode = entry->fields.delivery_mode << 8;
irqe.level = 1;
- irqe.shorthand = 0;
+ irqe.shorthand = APIC_DEST_NOSHORT;
irqe.msi_redir_hint = false;
if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
@@ -415,72 +499,68 @@ static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
}
#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000
-
-static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
- struct kvm_ioapic *ioapic, int vector, int trigger_mode)
+static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu,
+ struct kvm_ioapic *ioapic,
+ int trigger_mode,
+ int pin)
{
- struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
struct kvm_lapic *apic = vcpu->arch.apic;
- int i;
-
- /* RTC special handling */
- if (test_bit(vcpu->vcpu_id, dest_map->map) &&
- vector == dest_map->vectors[vcpu->vcpu_id])
- rtc_irq_eoi(ioapic, vcpu);
-
- for (i = 0; i < IOAPIC_NUM_PINS; i++) {
- union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[pin];
- if (ent->fields.vector != vector)
- continue;
-
- /*
- * We are dropping lock while calling ack notifiers because ack
- * notifier callbacks for assigned devices call into IOAPIC
- * recursively. Since remote_irr is cleared only after call
- * to notifiers if the same vector will be delivered while lock
- * is dropped it will be put into irr and will be delivered
- * after ack notifier returns.
- */
- spin_unlock(&ioapic->lock);
- kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
- spin_lock(&ioapic->lock);
+ /*
+ * We are dropping lock while calling ack notifiers because ack
+ * notifier callbacks for assigned devices call into IOAPIC
+ * recursively. Since remote_irr is cleared only after call
+ * to notifiers if the same vector will be delivered while lock
+ * is dropped it will be put into irr and will be delivered
+ * after ack notifier returns.
+ */
+ spin_unlock(&ioapic->lock);
+ kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, pin);
+ spin_lock(&ioapic->lock);
- if (trigger_mode != IOAPIC_LEVEL_TRIG ||
- kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)
- continue;
+ if (trigger_mode != IOAPIC_LEVEL_TRIG ||
+ kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)
+ return;
- ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
- ent->fields.remote_irr = 0;
- if (!ent->fields.mask && (ioapic->irr & (1 << i))) {
- ++ioapic->irq_eoi[i];
- if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
- /*
- * Real hardware does not deliver the interrupt
- * immediately during eoi broadcast, and this
- * lets a buggy guest make slow progress
- * even if it does not correctly handle a
- * level-triggered interrupt. Emulate this
- * behavior if we detect an interrupt storm.
- */
- schedule_delayed_work(&ioapic->eoi_inject, HZ / 100);
- ioapic->irq_eoi[i] = 0;
- trace_kvm_ioapic_delayed_eoi_inj(ent->bits);
- } else {
- ioapic_service(ioapic, i, false);
- }
+ ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
+ ent->fields.remote_irr = 0;
+ if (!ent->fields.mask && (ioapic->irr & (1 << pin))) {
+ ++ioapic->irq_eoi[pin];
+ if (ioapic->irq_eoi[pin] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
+ /*
+ * Real hardware does not deliver the interrupt
+ * immediately during eoi broadcast, and this
+ * lets a buggy guest make slow progress
+ * even if it does not correctly handle a
+ * level-triggered interrupt. Emulate this
+ * behavior if we detect an interrupt storm.
+ */
+ schedule_delayed_work(&ioapic->eoi_inject, HZ / 100);
+ ioapic->irq_eoi[pin] = 0;
+ trace_kvm_ioapic_delayed_eoi_inj(ent->bits);
} else {
- ioapic->irq_eoi[i] = 0;
+ ioapic_service(ioapic, pin, false);
}
+ } else {
+ ioapic->irq_eoi[pin] = 0;
}
}
void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
{
+ int i;
struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
spin_lock(&ioapic->lock);
- __kvm_ioapic_update_eoi(vcpu, ioapic, vector, trigger_mode);
+ rtc_irq_eoi(ioapic, vcpu, vector);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+ if (ent->fields.vector != vector)
+ continue;
+ kvm_ioapic_update_eoi_one(vcpu, ioapic, trigger_mode, i);
+ }
spin_unlock(&ioapic->lock);
}
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index ea1a4e0297da..2fb2e3c80724 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -116,9 +116,6 @@ static inline int ioapic_in_kernel(struct kvm *kvm)
}
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
-bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
- int short_hand, unsigned int dest, int dest_mode);
-int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
int trigger_mode);
int kvm_ioapic_init(struct kvm *kvm);
@@ -126,9 +123,6 @@ void kvm_ioapic_destroy(struct kvm *kvm);
int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
int level, bool line_status);
void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
-int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
- struct kvm_lapic_irq *irq,
- struct dest_map *dest_map);
void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7c6233d37c64..f173ab6b407e 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -113,5 +113,8 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu);
int kvm_setup_default_irq_routing(struct kvm *kvm);
int kvm_setup_empty_irq_routing(struct kvm *kvm);
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map);
#endif
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 8ecd48d31800..79afa0bb5f41 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -52,15 +52,15 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
unsigned int dest_vcpus = 0;
- if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
- kvm_lowest_prio_delivery(irq)) {
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
+ return r;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL &&
+ irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
irq->delivery_mode = APIC_DM_FIXED;
}
- if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
- return r;
-
memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -114,13 +114,14 @@ void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi);
irq->vector = (e->msi.data &
MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
- irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
+ irq->dest_mode = kvm_lapic_irq_dest_mode(
+ !!((1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo));
irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
irq->delivery_mode = e->msi.data & 0x700;
irq->msi_redir_hint = ((e->msi.address_lo
& MSI_ADDR_REDIRECTION_LOWPRI) > 0);
irq->level = 1;
- irq->shorthand = 0;
+ irq->shorthand = APIC_DEST_NOSHORT;
}
EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
@@ -416,7 +417,8 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
kvm_set_msi_irq(vcpu->kvm, entry, &irq);
- if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0,
+ if (irq.level &&
+ kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
irq.dest_id, irq.dest_mode))
__set_bit(irq.vector, ioapic_handled_vectors);
}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 1cc6c47dc77e..58767020de41 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -37,22 +37,50 @@ BUILD_KVM_GPR_ACCESSORS(r14, R14)
BUILD_KVM_GPR_ACCESSORS(r15, R15)
#endif
-static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
- enum kvm_reg reg)
+static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
{
- if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
+ return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
+static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
+static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
+{
+ if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
+ return 0;
+
+ if (!kvm_register_is_available(vcpu, reg))
kvm_x86_ops->cache_reg(vcpu, reg);
return vcpu->arch.regs[reg];
}
-static inline void kvm_register_write(struct kvm_vcpu *vcpu,
- enum kvm_reg reg,
+static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg,
unsigned long val)
{
+ if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
+ return;
+
vcpu->arch.regs[reg] = val;
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ kvm_register_mark_dirty(vcpu, reg);
}
static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
@@ -79,9 +107,8 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
{
might_sleep(); /* on svm */
- if (!test_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail))
- kvm_x86_ops->cache_reg(vcpu, (enum kvm_reg)VCPU_EXREG_PDPTR);
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
+ kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
return vcpu->arch.walk_mmu->pdptrs[index];
}
@@ -109,8 +136,8 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
{
- if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
- kvm_x86_ops->decache_cr3(vcpu);
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+ kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_CR3);
return vcpu->arch.cr3;
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 685d17c11461..afcd30d44cbb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -56,17 +56,17 @@
#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
#define LAPIC_MMIO_LENGTH (1 << 12)
/* followed define is not in apicdef.h */
-#define APIC_SHORT_MASK 0xc0000
-#define APIC_DEST_NOSHORT 0x0
-#define APIC_DEST_MASK 0x800
#define MAX_APIC_VECTOR 256
#define APIC_VECTORS_PER_REG 32
#define APIC_BROADCAST 0xFF
#define X2APIC_BROADCAST 0xFFFFFFFFul
-#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100
-#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000
+static bool lapic_timer_advance_dynamic __read_mostly;
+#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */
+#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */
+#define LAPIC_TIMER_ADVANCE_NS_INIT 1000
+#define LAPIC_TIMER_ADVANCE_NS_MAX 5000
/* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
@@ -108,11 +108,6 @@ static inline int apic_enabled(struct kvm_lapic *apic)
(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
-static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
-{
- return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
-}
-
static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
{
return apic->vcpu->vcpu_id;
@@ -216,6 +211,9 @@ static void recalculate_apic_map(struct kvm *kvm)
if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
new->phys_map[xapic_id] = apic;
+ if (!kvm_apic_sw_enabled(apic))
+ continue;
+
ldr = kvm_lapic_get_reg(apic, APIC_LDR);
if (apic_x2apic_mode(apic)) {
@@ -258,6 +256,8 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
static_key_slow_dec_deferred(&apic_sw_disabled);
else
static_key_slow_inc(&apic_sw_disabled.key);
+
+ recalculate_apic_map(apic->vcpu->kvm);
}
}
@@ -554,60 +554,53 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
irq->level, irq->trig_mode, dest_map);
}
+static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map,
+ struct kvm_lapic_irq *irq, u32 min)
+{
+ int i, count = 0;
+ struct kvm_vcpu *vcpu;
+
+ if (min > map->max_apic_id)
+ return 0;
+
+ for_each_set_bit(i, ipi_bitmap,
+ min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
+ if (map->phys_map[min + i]) {
+ vcpu = map->phys_map[min + i]->vcpu;
+ count += kvm_apic_set_irq(vcpu, irq, NULL);
+ }
+ }
+
+ return count;
+}
+
int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
unsigned long ipi_bitmap_high, u32 min,
unsigned long icr, int op_64_bit)
{
- int i;
struct kvm_apic_map *map;
- struct kvm_vcpu *vcpu;
struct kvm_lapic_irq irq = {0};
int cluster_size = op_64_bit ? 64 : 32;
- int count = 0;
+ int count;
+
+ if (icr & (APIC_DEST_MASK | APIC_SHORT_MASK))
+ return -KVM_EINVAL;
irq.vector = icr & APIC_VECTOR_MASK;
irq.delivery_mode = icr & APIC_MODE_MASK;
irq.level = (icr & APIC_INT_ASSERT) != 0;
irq.trig_mode = icr & APIC_INT_LEVELTRIG;
- if (icr & APIC_DEST_MASK)
- return -KVM_EINVAL;
- if (icr & APIC_SHORT_MASK)
- return -KVM_EINVAL;
-
rcu_read_lock();
map = rcu_dereference(kvm->arch.apic_map);
- if (unlikely(!map)) {
- count = -EOPNOTSUPP;
- goto out;
+ count = -EOPNOTSUPP;
+ if (likely(map)) {
+ count = __pv_send_ipi(&ipi_bitmap_low, map, &irq, min);
+ min += cluster_size;
+ count += __pv_send_ipi(&ipi_bitmap_high, map, &irq, min);
}
- if (min > map->max_apic_id)
- goto out;
- /* Bits above cluster_size are masked in the caller. */
- for_each_set_bit(i, &ipi_bitmap_low,
- min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
- if (map->phys_map[min + i]) {
- vcpu = map->phys_map[min + i]->vcpu;
- count += kvm_apic_set_irq(vcpu, &irq, NULL);
- }
- }
-
- min += cluster_size;
-
- if (min > map->max_apic_id)
- goto out;
-
- for_each_set_bit(i, &ipi_bitmap_high,
- min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
- if (map->phys_map[min + i]) {
- vcpu = map->phys_map[min + i]->vcpu;
- count += kvm_apic_set_irq(vcpu, &irq, NULL);
- }
- }
-
-out:
rcu_read_unlock();
return count;
}
@@ -796,13 +789,13 @@ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
}
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
- int short_hand, unsigned int dest, int dest_mode)
+ int shorthand, unsigned int dest, int dest_mode)
{
struct kvm_lapic *target = vcpu->arch.apic;
u32 mda = kvm_apic_mda(vcpu, dest, source, target);
ASSERT(target);
- switch (short_hand) {
+ switch (shorthand) {
case APIC_DEST_NOSHORT:
if (dest_mode == APIC_DEST_PHYSICAL)
return kvm_apic_match_physical_addr(target, mda);
@@ -971,12 +964,12 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
}
/*
- * This routine tries to handler interrupts in posted mode, here is how
+ * This routine tries to handle interrupts in posted mode, here is how
* it deals with different cases:
* - For single-destination interrupts, handle it in posted mode
* - Else if vector hashing is enabled and it is a lowest-priority
* interrupt, handle it in posted mode and use the following mechanism
- * to find the destinaiton vCPU.
+ * to find the destination vCPU.
* 1. For lowest-priority interrupts, store all the possible
* destination vCPUs in an array.
* 2. Use "guest vector % max number of destination vCPUs" to find
@@ -1087,9 +1080,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
result = 1;
/* assumes that there are only KVM_APIC_INIT/SIPI */
apic->pending_events = (1UL << KVM_APIC_INIT);
- /* make sure pending_events is visible before sending
- * the request */
- smp_wmb();
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
}
@@ -1121,6 +1111,50 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
return result;
}
+/*
+ * This routine identifies the destination vcpus mask meant to receive the
+ * IOAPIC interrupts. It either uses kvm_apic_map_get_dest_lapic() to find
+ * out the destination vcpus array and set the bitmap or it traverses to
+ * each available vcpu to identify the same.
+ */
+void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ unsigned long *vcpu_bitmap)
+{
+ struct kvm_lapic **dest_vcpu = NULL;
+ struct kvm_lapic *src = NULL;
+ struct kvm_apic_map *map;
+ struct kvm_vcpu *vcpu;
+ unsigned long bitmap;
+ int i, vcpu_idx;
+ bool ret;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dest_vcpu,
+ &bitmap);
+ if (ret) {
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dest_vcpu[i])
+ continue;
+ vcpu_idx = dest_vcpu[i]->vcpu->vcpu_idx;
+ __set_bit(vcpu_idx, vcpu_bitmap);
+ }
+ } else {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+ if (!kvm_apic_match_dest(vcpu, NULL,
+ irq->shorthand,
+ irq->dest_id,
+ irq->dest_mode))
+ continue;
+ __set_bit(i, vcpu_bitmap);
+ }
+ }
+ rcu_read_unlock();
+}
+
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
{
return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
@@ -1193,10 +1227,8 @@ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
}
EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
-static void apic_send_ipi(struct kvm_lapic *apic)
+static void apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
{
- u32 icr_low = kvm_lapic_get_reg(apic, APIC_ICR);
- u32 icr_high = kvm_lapic_get_reg(apic, APIC_ICR2);
struct kvm_lapic_irq irq;
irq.vector = icr_low & APIC_VECTOR_MASK;
@@ -1482,26 +1514,25 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
u64 ns;
+ /* Do not adjust for tiny fluctuations or large random spikes. */
+ if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_MAX ||
+ abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_MIN)
+ return;
+
/* too early */
if (advance_expire_delta < 0) {
ns = -advance_expire_delta * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- timer_advance_ns -= min((u32)ns,
- timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
} else {
/* too late */
ns = advance_expire_delta * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- timer_advance_ns += min((u32)ns,
- timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
}
- if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
- apic->lapic_timer.timer_advance_adjust_done = true;
- if (unlikely(timer_advance_ns > 5000)) {
- timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
- apic->lapic_timer.timer_advance_adjust_done = false;
- }
+ if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX))
+ timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
}
@@ -1521,7 +1552,7 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
if (guest_tsc < tsc_deadline)
__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
- if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
+ if (lapic_timer_advance_dynamic)
adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
}
@@ -1537,9 +1568,9 @@ static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic)
struct kvm_timer *ktimer = &apic->lapic_timer;
kvm_apic_local_deliver(apic, APIC_LVTT);
- if (apic_lvtt_tscdeadline(apic))
+ if (apic_lvtt_tscdeadline(apic)) {
ktimer->tscdeadline = 0;
- if (apic_lvtt_oneshot(apic)) {
+ } else if (apic_lvtt_oneshot(apic)) {
ktimer->tscdeadline = 0;
ktimer->target_expiration = 0;
}
@@ -1593,7 +1624,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
likely(ns > apic->lapic_timer.timer_advance_ns)) {
expire = ktime_add_ns(now, ns);
expire = ktime_sub_ns(expire, ktimer->timer_advance_ns);
- hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS);
+ hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_HARD);
} else
apic_timer_expired(apic);
@@ -1909,8 +1940,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
}
case APIC_ICR:
/* No delay here, so we always clear the pending bit */
- kvm_lapic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
- apic_send_ipi(apic);
+ val &= ~(1 << 12);
+ apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2));
+ kvm_lapic_set_reg(apic, APIC_ICR, val);
break;
case APIC_ICR2:
@@ -1925,15 +1957,20 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_LVTTHMR:
case APIC_LVTPC:
case APIC_LVT1:
- case APIC_LVTERR:
+ case APIC_LVTERR: {
/* TODO: Check vector */
+ size_t size;
+ u32 index;
+
if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
-
- val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
+ size = ARRAY_SIZE(apic_lvt_mask);
+ index = array_index_nospec(
+ (reg - APIC_LVTT) >> 4, size);
+ val &= apic_lvt_mask[index];
kvm_lapic_set_reg(apic, reg, val);
-
break;
+ }
case APIC_LVTT:
if (!kvm_apic_sw_enabled(apic))
@@ -2147,6 +2184,21 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
pr_warn_once("APIC base relocation is unsupported by KVM");
}
+void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (vcpu->arch.apicv_active) {
+ /* irr_pending is always true when apicv is activated. */
+ apic->irr_pending = true;
+ apic->isr_count = 1;
+ } else {
+ apic->irr_pending = (apic_search_irr(apic) != -1);
+ apic->isr_count = count_vectors(apic->regs + APIC_ISR);
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_apicv);
+
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -2189,8 +2241,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
- apic->irr_pending = vcpu->arch.apicv_active;
- apic->isr_count = vcpu->arch.apicv_active ? 1 : 0;
+ kvm_apic_update_apicv(vcpu);
apic->highest_isr_cache = -1;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
@@ -2294,17 +2345,16 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
apic->vcpu = vcpu;
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
+ HRTIMER_MODE_ABS_HARD);
apic->lapic_timer.timer.function = apic_timer_fn;
if (timer_advance_ns == -1) {
- apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
- apic->lapic_timer.timer_advance_adjust_done = false;
+ apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
+ lapic_timer_advance_dynamic = true;
} else {
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
- apic->lapic_timer.timer_advance_adjust_done = true;
+ lapic_timer_advance_dynamic = false;
}
-
/*
* APIC is created enabled. This will prevent kvm_lapic_set_base from
* thinking that APIC state has changed.
@@ -2336,14 +2386,13 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
{
u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0);
- int r = 0;
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
- r = 1;
+ return 1;
if ((lvt0 & APIC_LVT_MASKED) == 0 &&
GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
- r = 1;
- return r;
+ return 1;
+ return 0;
}
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
@@ -2449,9 +2498,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
update_divide_count(apic);
start_apic_timer(apic);
- apic->irr_pending = true;
- apic->isr_count = vcpu->arch.apicv_active ?
- 1 : count_vectors(apic->regs + APIC_ISR);
+ kvm_apic_update_apicv(vcpu);
apic->highest_isr_cache = -1;
if (vcpu->arch.apicv_active) {
kvm_x86_ops->apicv_post_state_restore(vcpu);
@@ -2479,7 +2526,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
timer = &vcpu->arch.apic->lapic_timer.timer;
if (hrtimer_cancel(timer))
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_HARD);
}
/*
@@ -2702,11 +2749,14 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
return;
/*
- * INITs are latched while in SMM. Because an SMM CPU cannot
- * be in KVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs
- * and delay processing of INIT until the next RSM.
+ * INITs are latched while CPU is in specific states
+ * (SMM, VMX non-root mode, SVM with GIF=0).
+ * Because a CPU cannot be in these states immediately
+ * after it has processed an INIT signal (and thus in
+ * KVM_MP_STATE_INIT_RECEIVED state), just eat SIPIs
+ * and leave the INIT pending.
*/
- if (is_smm(vcpu)) {
+ if (kvm_vcpu_latch_init(vcpu)) {
WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
if (test_bit(KVM_APIC_SIPI, &apic->pending_events))
clear_bit(KVM_APIC_SIPI, &apic->pending_events);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 50053d2b8b7b..ec6fbfe325cf 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -10,8 +10,9 @@
#define KVM_APIC_SIPI 1
#define KVM_APIC_LVT_NUM 6
-#define KVM_APIC_SHORT_MASK 0xc0000
-#define KVM_APIC_DEST_MASK 0x800
+#define APIC_SHORT_MASK 0xc0000
+#define APIC_DEST_NOSHORT 0x0
+#define APIC_DEST_MASK 0x800
#define APIC_BUS_CYCLE_NS 1
#define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS)
@@ -35,7 +36,6 @@ struct kvm_timer {
s64 advance_expire_delta;
atomic_t pending; /* accumulated triggered timers */
bool hv_timer_in_use;
- bool timer_advance_adjust_done;
};
struct kvm_lapic {
@@ -83,14 +83,15 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val);
int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
void *data);
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
- int short_hand, unsigned int dest, int dest_mode);
-
+ int shorthand, unsigned int dest, int dest_mode);
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr);
bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr);
void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
+void kvm_apic_update_apicv(struct kvm_vcpu *vcpu);
bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map);
@@ -227,6 +228,9 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu);
+void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ unsigned long *vcpu_bitmap);
+
bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
@@ -243,4 +247,9 @@ static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
}
+static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
+{
+ return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
+}
+
#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 54c2a377795b..a647601c9e1c 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -51,7 +51,7 @@ static inline u64 rsvd_bits(int s, int e)
return ((1ULL << (e - s + 1)) - 1) << s;
}
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask);
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
@@ -102,6 +102,19 @@ static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu)
kvm_get_active_pcid(vcpu));
}
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
+ bool prefault);
+
+static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ u32 err, bool prefault)
+{
+#ifdef CONFIG_RETPOLINE
+ if (likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault))
+ return kvm_tdp_page_fault(vcpu, cr2_or_gpa, err, prefault);
+#endif
+ return vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, err, prefault);
+}
+
/*
* Currently, we have two sorts of write-protection, a) the first one
* write-protects guest page to sync the guest modification, b) another one is
@@ -210,4 +223,8 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn);
int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_post_init_vm(struct kvm *kvm);
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 24843cf49579..87e9ba27ada1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -37,9 +37,10 @@
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>
+#include <linux/kthread.h>
#include <asm/page.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/cmpxchg.h>
#include <asm/e820/api.h>
#include <asm/io.h>
@@ -47,6 +48,35 @@
#include <asm/kvm_page_track.h>
#include "trace.h"
+extern bool itlb_multihit_kvm_mitigation;
+
+static int __read_mostly nx_huge_pages = -1;
+#ifdef CONFIG_PREEMPT_RT
+/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
+static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
+#else
+static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
+#endif
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
+
+static struct kernel_param_ops nx_huge_pages_ops = {
+ .set = set_nx_huge_pages,
+ .get = param_get_bool,
+};
+
+static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
+ .set = set_nx_huge_pages_recovery_ratio,
+ .get = param_get_uint,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
+ &nx_huge_pages_recovery_ratio, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
+
/*
* When setting this variable to true it enables Two-Dimensional-Paging
* where the hardware walks 2 page tables:
@@ -83,7 +113,17 @@ module_param(dbg, bool, 0644);
#define PTE_PREFETCH_NUM 8
#define PT_FIRST_AVAIL_BITS_SHIFT 10
-#define PT64_SECOND_AVAIL_BITS_SHIFT 52
+#define PT64_SECOND_AVAIL_BITS_SHIFT 54
+
+/*
+ * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
+ * Access Tracking SPTEs.
+ */
+#define SPTE_SPECIAL_MASK (3ULL << 52)
+#define SPTE_AD_ENABLED_MASK (0ULL << 52)
+#define SPTE_AD_DISABLED_MASK (1ULL << 52)
+#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
+#define SPTE_MMIO_MASK (3ULL << 52)
#define PT64_LEVEL_BITS 9
@@ -214,16 +254,16 @@ static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
static u64 __read_mostly shadow_mmio_value;
+static u64 __read_mostly shadow_mmio_access_mask;
static u64 __read_mostly shadow_present_mask;
static u64 __read_mostly shadow_me_mask;
/*
- * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
- * Non-present SPTEs with shadow_acc_track_value set are in place for access
- * tracking.
+ * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
+ * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
+ * pages.
*/
static u64 __read_mostly shadow_acc_track_mask;
-static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
/*
* The mask/shift to use for saving the original R/X bits when marking the PTE
@@ -299,34 +339,63 @@ static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
kvm_flush_remote_tlbs_with_range(kvm, &range);
}
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask)
{
+ BUG_ON((u64)(unsigned)access_mask != access_mask);
BUG_ON((mmio_mask & mmio_value) != mmio_value);
- shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
+ shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
+ shadow_mmio_access_mask = access_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+static bool is_mmio_spte(u64 spte)
+{
+ return (spte & shadow_mmio_mask) == shadow_mmio_value;
+}
+
static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
{
return sp->role.ad_disabled;
}
+static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
+{
+ /*
+ * When using the EPT page-modification log, the GPAs in the log
+ * would come from L2 rather than L1. Therefore, we need to rely
+ * on write protection to record dirty pages. This also bypasses
+ * PML, since writes now result in a vmexit.
+ */
+ return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
+}
+
static inline bool spte_ad_enabled(u64 spte)
{
- MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
- return !(spte & shadow_acc_track_value);
+ MMU_WARN_ON(is_mmio_spte(spte));
+ return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
+}
+
+static inline bool spte_ad_need_write_protect(u64 spte)
+{
+ MMU_WARN_ON(is_mmio_spte(spte));
+ return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
+}
+
+static bool is_nx_huge_page_enabled(void)
+{
+ return READ_ONCE(nx_huge_pages);
}
static inline u64 spte_shadow_accessed_mask(u64 spte)
{
- MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+ MMU_WARN_ON(is_mmio_spte(spte));
return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}
static inline u64 spte_shadow_dirty_mask(u64 spte)
{
- MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+ MMU_WARN_ON(is_mmio_spte(spte));
return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}
@@ -349,22 +418,24 @@ static inline bool is_access_track_spte(u64 spte)
* requires a full MMU zap). The flag is instead explicitly queried when
* checking for MMIO spte cache hits.
*/
-#define MMIO_SPTE_GEN_MASK GENMASK_ULL(18, 0)
+#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0)
#define MMIO_SPTE_GEN_LOW_START 3
#define MMIO_SPTE_GEN_LOW_END 11
#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
MMIO_SPTE_GEN_LOW_START)
-#define MMIO_SPTE_GEN_HIGH_START 52
-#define MMIO_SPTE_GEN_HIGH_END 61
+#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT
+#define MMIO_SPTE_GEN_HIGH_END 62
#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
MMIO_SPTE_GEN_HIGH_START)
+
static u64 generation_mmio_spte_mask(u64 gen)
{
u64 mask;
WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
+ BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
@@ -375,35 +446,37 @@ static u64 get_mmio_spte_generation(u64 spte)
{
u64 gen;
- spte &= ~shadow_mmio_mask;
-
gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
return gen;
}
-static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
- unsigned access)
+static u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
{
+
u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
u64 mask = generation_mmio_spte_mask(gen);
u64 gpa = gfn << PAGE_SHIFT;
- access &= ACC_WRITE_MASK | ACC_USER_MASK;
+ access &= shadow_mmio_access_mask;
mask |= shadow_mmio_value | access;
mask |= gpa | shadow_nonpresent_or_rsvd_mask;
mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< shadow_nonpresent_or_rsvd_mask_len;
- page_header(__pa(sptep))->mmio_cached = true;
-
- trace_mark_mmio_spte(sptep, gfn, access, gen);
- mmu_spte_set(sptep, mask);
+ return mask;
}
-static bool is_mmio_spte(u64 spte)
+static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
+ unsigned int access)
{
- return (spte & shadow_mmio_mask) == shadow_mmio_value;
+ u64 mask = make_mmio_spte(vcpu, gfn, access);
+ unsigned int gen = get_mmio_spte_generation(mask);
+
+ access = mask & ACC_ALL;
+
+ trace_mark_mmio_spte(sptep, gfn, access, gen);
+ mmu_spte_set(sptep, mask);
}
static gfn_t get_mmio_spte_gfn(u64 spte)
@@ -418,12 +491,11 @@ static gfn_t get_mmio_spte_gfn(u64 spte)
static unsigned get_mmio_spte_access(u64 spte)
{
- u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask;
- return (spte & ~mask) & ~PAGE_MASK;
+ return spte & shadow_mmio_access_mask;
}
static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
- kvm_pfn_t pfn, unsigned access)
+ kvm_pfn_t pfn, unsigned int access)
{
if (unlikely(is_noslot_pfn(pfn))) {
mark_mmio_spte(vcpu, sptep, gfn, access);
@@ -461,7 +533,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
{
BUG_ON(!dirty_mask != !accessed_mask);
BUG_ON(!accessed_mask && !acc_track_mask);
- BUG_ON(acc_track_mask & shadow_acc_track_value);
+ BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
shadow_user_mask = user_mask;
shadow_accessed_mask = accessed_mask;
@@ -477,16 +549,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
static u8 kvm_get_shadow_phys_bits(void)
{
/*
- * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected
- * in CPU detection code, but MKTME treats those reduced bits as
- * 'keyID' thus they are not reserved bits. Therefore for MKTME
- * we should still return physical address bits reported by CPUID.
+ * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
+ * in CPU detection code, but the processor treats those reduced bits as
+ * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
+ * the physical address bits reported by CPUID.
*/
- if (!boot_cpu_has(X86_FEATURE_TME) ||
- WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008))
- return boot_cpu_data.x86_phys_bits;
+ if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
+ return cpuid_eax(0x80000008) & 0xff;
- return cpuid_eax(0x80000008) & 0xff;
+ /*
+ * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
+ * custom CPUID. Proceed with whatever the kernel found since these features
+ * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
+ */
+ return boot_cpu_data.x86_phys_bits;
}
static void kvm_mmu_reset_all_pte_masks(void)
@@ -1164,6 +1240,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_disallow_lpage(slot, gfn);
}
+static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ if (sp->lpage_disallowed)
+ return;
+
+ ++kvm->stat.nx_lpage_splits;
+ list_add_tail(&sp->lpage_disallowed_link,
+ &kvm->arch.lpage_disallowed_mmu_pages);
+ sp->lpage_disallowed = true;
+}
+
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct kvm_memslots *slots;
@@ -1181,54 +1268,11 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
-static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
- struct kvm_memory_slot *slot)
+static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- struct kvm_lpage_info *linfo;
-
- if (slot) {
- linfo = lpage_info_slot(gfn, slot, level);
- return !!linfo->disallow_lpage;
- }
-
- return true;
-}
-
-static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
- int level)
-{
- struct kvm_memory_slot *slot;
-
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
-}
-
-static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
-{
- unsigned long page_size;
- int i, ret = 0;
-
- page_size = kvm_host_page_size(kvm, gfn);
-
- for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
- if (page_size >= KVM_HPAGE_SIZE(i))
- ret = i;
- else
- break;
- }
-
- return ret;
-}
-
-static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
- bool no_dirty_log)
-{
- if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
- return false;
- if (no_dirty_log && slot->dirty_bitmap)
- return false;
-
- return true;
+ --kvm->stat.nx_lpage_splits;
+ sp->lpage_disallowed = false;
+ list_del(&sp->lpage_disallowed_link);
}
static struct kvm_memory_slot *
@@ -1238,40 +1282,14 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
struct kvm_memory_slot *slot;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- if (!memslot_valid_for_gpte(slot, no_dirty_log))
- slot = NULL;
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+ return NULL;
+ if (no_dirty_log && slot->dirty_bitmap)
+ return NULL;
return slot;
}
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
- bool *force_pt_level)
-{
- int host_level, level, max_level;
- struct kvm_memory_slot *slot;
-
- if (unlikely(*force_pt_level))
- return PT_PAGE_TABLE_LEVEL;
-
- slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
- *force_pt_level = !memslot_valid_for_gpte(slot, true);
- if (unlikely(*force_pt_level))
- return PT_PAGE_TABLE_LEVEL;
-
- host_level = host_mapping_level(vcpu->kvm, large_gfn);
-
- if (host_level == PT_PAGE_TABLE_LEVEL)
- return host_level;
-
- max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
-
- for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
- if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
- break;
-
- return level - 1;
-}
-
/*
* About rmap_head encoding:
*
@@ -1331,7 +1349,7 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
if (j != 0)
return;
if (!prev_desc && !desc->more)
- rmap_head->val = (unsigned long)desc->sptes[0];
+ rmap_head->val = 0;
else
if (prev_desc)
prev_desc->more = desc->more;
@@ -1446,7 +1464,7 @@ struct rmap_iterator {
/*
* Iteration must be started by this function. This should also be used after
* removing/dropping sptes from the rmap link because in such cases the
- * information in the itererator may not be valid.
+ * information in the iterator may not be valid.
*
* Returns sptep if found, NULL otherwise.
*/
@@ -1589,16 +1607,16 @@ static bool spte_clear_dirty(u64 *sptep)
rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
+ MMU_WARN_ON(!spte_ad_enabled(spte));
spte &= ~shadow_dirty_mask;
-
return mmu_spte_update(sptep, spte);
}
-static bool wrprot_ad_disabled_spte(u64 *sptep)
+static bool spte_wrprot_for_clear_dirty(u64 *sptep)
{
bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
(unsigned long *)sptep);
- if (was_writable)
+ if (was_writable && !spte_ad_enabled(*sptep))
kvm_set_pfn_dirty(spte_to_pfn(*sptep));
return was_writable;
@@ -1617,10 +1635,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
bool flush = false;
for_each_rmap_spte(rmap_head, &iter, sptep)
- if (spte_ad_enabled(*sptep))
- flush |= spte_clear_dirty(sptep);
+ if (spte_ad_need_write_protect(*sptep))
+ flush |= spte_wrprot_for_clear_dirty(sptep);
else
- flush |= wrprot_ad_disabled_spte(sptep);
+ flush |= spte_clear_dirty(sptep);
return flush;
}
@@ -1631,6 +1649,11 @@ static bool spte_set_dirty(u64 *sptep)
rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
+ /*
+ * Similar to the !kvm_x86_ops->slot_disable_log_dirty case,
+ * do not bother adding back write access to pages marked
+ * SPTE_AD_WRPROT_ONLY_MASK.
+ */
spte |= shadow_dirty_mask;
return mmu_spte_update(sptep, spte);
@@ -2095,6 +2118,13 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
if (!direct)
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+ /*
+ * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
+ * depends on valid pages being added to the head of the list. See
+ * comments in kvm_zap_obsolete_pages().
+ */
+ sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
@@ -2244,7 +2274,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
#define for_each_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
- if ((_sp)->role.invalid) { \
+ if (is_obsolete_sp((_kvm), (_sp))) { \
} else
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
@@ -2301,6 +2331,12 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
static void mmu_audit_disable(void) { }
#endif
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ return sp->role.invalid ||
+ unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
@@ -2450,7 +2486,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gva_t gaddr,
unsigned level,
int direct,
- unsigned access)
+ unsigned int access)
{
union kvm_mmu_page_role role;
unsigned quadrant;
@@ -2609,7 +2645,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
shadow_user_mask | shadow_x_mask | shadow_me_mask;
if (sp_ad_disabled(sp))
- spte |= shadow_acc_track_value;
+ spte |= SPTE_AD_DISABLED_MASK;
else
spte |= shadow_accessed_mask;
@@ -2739,10 +2775,18 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
} else {
list_move(&sp->link, &kvm->arch.active_mmu_pages);
- if (!sp->role.invalid)
+ /*
+ * Obsolete pages cannot be used on any vCPUs, see the comment
+ * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also
+ * treats invalid shadow pages as being obsolete.
+ */
+ if (!is_obsolete_sp(kvm, sp))
kvm_reload_remote_mmus(kvm);
}
+ if (sp->lpage_disallowed)
+ unaccount_huge_nx_page(kvm, sp);
+
sp->role.invalid = 1;
return list_unstable;
}
@@ -2794,6 +2838,26 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
}
+static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
+{
+ LIST_HEAD(invalid_list);
+
+ if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
+ return 0;
+
+ while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
+ if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
+ break;
+
+ ++vcpu->kvm->stat.mmu_recycled;
+ }
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
+ if (!kvm_mmu_available_pages(vcpu->kvm))
+ return -ENOSPC;
+ return 0;
+}
+
/*
* Changing the number of mmu pages allocated to the vm
* Note: if goal_nr_mmu_pages is too small, you will get dead lock
@@ -2937,7 +3001,7 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned pte_access, int level,
+ unsigned int pte_access, int level,
gfn_t gfn, kvm_pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
@@ -2950,7 +3014,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
sp = page_header(__pa(sptep));
if (sp_ad_disabled(sp))
- spte |= shadow_acc_track_value;
+ spte |= SPTE_AD_DISABLED_MASK;
+ else if (kvm_vcpu_ad_need_write_protect(vcpu))
+ spte |= SPTE_AD_WRPROT_ONLY_MASK;
/*
* For the EPT case, shadow_present_mask is 0 if hardware
@@ -2962,6 +3028,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (!speculative)
spte |= spte_shadow_accessed_mask(spte);
+ if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+ is_nx_huge_page_enabled()) {
+ pte_access &= ~ACC_EXEC_MASK;
+ }
+
if (pte_access & ACC_EXEC_MASK)
spte |= shadow_x_mask;
else
@@ -2987,17 +3058,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= (u64)pfn << PAGE_SHIFT;
if (pte_access & ACC_WRITE_MASK) {
-
- /*
- * Other vcpu creates new sp in the window between
- * mapping_level() and acquiring mmu-lock. We can
- * allow guest to retry the access, the mapping can
- * be fixed if guest refault.
- */
- if (level > PT_PAGE_TABLE_LEVEL &&
- mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
- goto done;
-
spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
/*
@@ -3029,13 +3089,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
set_pte:
if (mmu_spte_update(sptep, spte))
ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
-done:
return ret;
}
-static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
- int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
- bool speculative, bool host_writable)
+static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned int pte_access, int write_fault, int level,
+ gfn_t gfn, kvm_pfn_t pfn, bool speculative,
+ bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
@@ -3117,7 +3177,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
{
struct page *pages[PTE_PREFETCH_NUM];
struct kvm_memory_slot *slot;
- unsigned access = sp->role.access;
+ unsigned int access = sp->role.access;
int i, ret;
gfn_t gfn;
@@ -3182,21 +3242,129 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
__direct_pte_prefetch(vcpu, sp, sptep);
}
+static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
+ kvm_pfn_t pfn, struct kvm_memory_slot *slot)
+{
+ unsigned long hva;
+ pte_t *pte;
+ int level;
+
+ BUILD_BUG_ON(PT_PAGE_TABLE_LEVEL != (int)PG_LEVEL_4K ||
+ PT_DIRECTORY_LEVEL != (int)PG_LEVEL_2M ||
+ PT_PDPE_LEVEL != (int)PG_LEVEL_1G);
+
+ if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
+ return PT_PAGE_TABLE_LEVEL;
+
+ /*
+ * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
+ * is not solely for performance, it's also necessary to avoid the
+ * "writable" check in __gfn_to_hva_many(), which will always fail on
+ * read-only memslots due to gfn_to_hva() assuming writes. Earlier
+ * page fault steps have already verified the guest isn't writing a
+ * read-only memslot.
+ */
+ hva = __gfn_to_hva_memslot(slot, gfn);
+
+ pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level);
+ if (unlikely(!pte))
+ return PT_PAGE_TABLE_LEVEL;
+
+ return level;
+}
+
+static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int max_level, kvm_pfn_t *pfnp)
+{
+ struct kvm_memory_slot *slot;
+ struct kvm_lpage_info *linfo;
+ kvm_pfn_t pfn = *pfnp;
+ kvm_pfn_t mask;
+ int level;
+
+ if (unlikely(max_level == PT_PAGE_TABLE_LEVEL))
+ return PT_PAGE_TABLE_LEVEL;
+
+ if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn))
+ return PT_PAGE_TABLE_LEVEL;
+
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true);
+ if (!slot)
+ return PT_PAGE_TABLE_LEVEL;
+
+ max_level = min(max_level, kvm_x86_ops->get_lpage_level());
+ for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) {
+ linfo = lpage_info_slot(gfn, slot, max_level);
+ if (!linfo->disallow_lpage)
+ break;
+ }
+
+ if (max_level == PT_PAGE_TABLE_LEVEL)
+ return PT_PAGE_TABLE_LEVEL;
+
+ level = host_pfn_mapping_level(vcpu, gfn, pfn, slot);
+ if (level == PT_PAGE_TABLE_LEVEL)
+ return level;
+
+ level = min(level, max_level);
+
+ /*
+ * mmu_notifier_retry() was successful and mmu_lock is held, so
+ * the pmd can't be split from under us.
+ */
+ mask = KVM_PAGES_PER_HPAGE(level) - 1;
+ VM_BUG_ON((gfn & mask) != (pfn & mask));
+ *pfnp = pfn & ~mask;
+
+ return level;
+}
+
+static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
+ gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
+{
+ int level = *levelp;
+ u64 spte = *it.sptep;
+
+ if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+ is_nx_huge_page_enabled() &&
+ is_shadow_present_pte(spte) &&
+ !is_large_pte(spte)) {
+ /*
+ * A small SPTE exists for this pfn, but FNAME(fetch)
+ * and __direct_map would like to create a large PTE
+ * instead: just force them to go down another level,
+ * patching back for them into pfn the next 9 bits of
+ * the address.
+ */
+ u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
+ *pfnp |= gfn & page_mask;
+ (*levelp)--;
+ }
+}
+
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
- int map_writable, int level, kvm_pfn_t pfn,
- bool prefault)
+ int map_writable, int max_level, kvm_pfn_t pfn,
+ bool prefault, bool account_disallowed_nx_lpage)
{
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
- int ret;
+ int level, ret;
gfn_t gfn = gpa >> PAGE_SHIFT;
gfn_t base_gfn = gfn;
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
+ if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
return RET_PF_RETRY;
+ level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn);
+
trace_kvm_mmu_spte_requested(gpa, level, pfn);
for_each_shadow_entry(vcpu, gpa, it) {
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gfn, &pfn, &level);
+
base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
if (it.level == level)
break;
@@ -3207,6 +3375,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
it.level - 1, true, ACC_ALL);
link_shadow_page(vcpu, it.sptep, sp);
+ if (account_disallowed_nx_lpage)
+ account_huge_nx_page(vcpu->kvm, sp);
}
}
@@ -3241,47 +3411,9 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
return -EFAULT;
}
-static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
- gfn_t gfn, kvm_pfn_t *pfnp,
- int *levelp)
-{
- kvm_pfn_t pfn = *pfnp;
- int level = *levelp;
-
- /*
- * Check if it's a transparent hugepage. If this would be an
- * hugetlbfs page, level wouldn't be set to
- * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
- * here.
- */
- if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
- level == PT_PAGE_TABLE_LEVEL &&
- PageTransCompoundMap(pfn_to_page(pfn)) &&
- !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
- unsigned long mask;
- /*
- * mmu_notifier_retry was successful and we hold the
- * mmu_lock here, so the pmd can't become splitting
- * from under us, and in turn
- * __split_huge_page_refcount() can't run from under
- * us and we can safely transfer the refcount from
- * PG_tail to PG_head as we switch the pfn to tail to
- * head.
- */
- *levelp = level = PT_DIRECTORY_LEVEL;
- mask = KVM_PAGES_PER_HPAGE(level) - 1;
- VM_BUG_ON((gfn & mask) != (pfn & mask));
- if (pfn & mask) {
- kvm_release_pfn_clean(pfn);
- pfn &= ~mask;
- kvm_get_pfn(pfn);
- *pfnp = pfn;
- }
- }
-}
-
static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
- kvm_pfn_t pfn, unsigned access, int *ret_val)
+ kvm_pfn_t pfn, unsigned int access,
+ int *ret_val)
{
/* The pfn is invalid, report the error! */
if (unlikely(is_error_pfn(pfn))) {
@@ -3290,7 +3422,8 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
}
if (unlikely(is_noslot_pfn(pfn)))
- vcpu_cache_mmio_info(vcpu, gva, gfn, access);
+ vcpu_cache_mmio_info(vcpu, gva, gfn,
+ access & shadow_mmio_access_mask);
return false;
}
@@ -3384,7 +3517,7 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte)
* - true: let the vcpu to access on the same address again.
* - false: let the real page fault path to fix it.
*/
-static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
u32 error_code)
{
struct kvm_shadow_walk_iterator iterator;
@@ -3393,9 +3526,6 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
u64 spte = 0ull;
uint retry_count = 0;
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
- return false;
-
if (!page_fault_can_be_fast(error_code))
return false;
@@ -3404,9 +3534,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
do {
u64 new_spte;
- for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
- if (!is_shadow_present_pte(spte) ||
- iterator.level < level)
+ for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
+ if (!is_shadow_present_pte(spte))
break;
sp = page_header(__pa(iterator.sptep));
@@ -3482,67 +3611,13 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
} while (true);
- trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
+ trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
spte, fault_handled);
walk_shadow_page_lockless_end(vcpu);
return fault_handled;
}
-static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
-static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
-
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
- gfn_t gfn, bool prefault)
-{
- int r;
- int level;
- bool force_pt_level = false;
- kvm_pfn_t pfn;
- unsigned long mmu_seq;
- bool map_writable, write = error_code & PFERR_WRITE_MASK;
-
- level = mapping_level(vcpu, gfn, &force_pt_level);
- if (likely(!force_pt_level)) {
- /*
- * This path builds a PAE pagetable - so we can map
- * 2mb pages at maximum. Therefore check if the level
- * is larger than that.
- */
- if (level > PT_DIRECTORY_LEVEL)
- level = PT_DIRECTORY_LEVEL;
-
- gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
- }
-
- if (fast_page_fault(vcpu, v, level, error_code))
- return RET_PF_RETRY;
-
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
-
- if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
- return RET_PF_RETRY;
-
- if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
- return r;
-
- r = RET_PF_RETRY;
- spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
- goto out_unlock;
- if (make_mmu_pages_available(vcpu) < 0)
- goto out_unlock;
- if (likely(!force_pt_level))
- transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
- r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
-out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
- return r;
-}
-
static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
struct list_head *invalid_list)
{
@@ -3833,7 +3908,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
u32 access, struct x86_exception *exception)
{
if (exception)
@@ -3841,7 +3916,7 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
return vaddr;
}
-static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
+static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
u32 access,
struct x86_exception *exception)
{
@@ -3853,20 +3928,14 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
static bool
__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
{
- int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
-
- return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
- ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
-}
+ int bit7 = (pte >> 7) & 1;
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
- return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
+ return pte & rsvd_check->rsvd_bits_mask[bit7][level-1];
}
-static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
+static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte)
{
- return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
+ return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
}
static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
@@ -3890,11 +3959,11 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
{
struct kvm_shadow_walk_iterator iterator;
u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
+ struct rsvd_bits_validate *rsvd_check;
int root, leaf;
bool reserved = false;
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
- goto exit;
+ rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
walk_shadow_page_lockless_begin(vcpu);
@@ -3910,8 +3979,13 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
if (!is_shadow_present_pte(spte))
break;
- reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
- iterator.level);
+ /*
+ * Use a bitwise-OR instead of a logical-OR to aggregate the
+ * reserved bit and EPT's invalid memtype/XWR checks to avoid
+ * adding a Jcc in the loop.
+ */
+ reserved |= __is_bad_mt_xwr(rsvd_check, spte) |
+ __is_rsvd_bits_set(rsvd_check, spte, iterator.level);
}
walk_shadow_page_lockless_end(vcpu);
@@ -3925,7 +3999,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
root--;
}
}
-exit:
+
*sptep = spte;
return reserved;
}
@@ -3944,7 +4018,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
if (is_mmio_spte(spte)) {
gfn_t gfn = get_mmio_spte_gfn(spte);
- unsigned access = get_mmio_spte_access(spte);
+ unsigned int access = get_mmio_spte_access(spte);
if (!check_mmio_spte(vcpu, spte))
return RET_PF_INVALID;
@@ -3989,9 +4063,6 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
struct kvm_shadow_walk_iterator iterator;
u64 spte;
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
- return;
-
walk_shadow_page_lockless_begin(vcpu);
for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
clear_sp_write_flooding_count(iterator.sptep);
@@ -4001,29 +4072,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
walk_shadow_page_lockless_end(vcpu);
}
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
- u32 error_code, bool prefault)
-{
- gfn_t gfn = gva >> PAGE_SHIFT;
- int r;
-
- pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
-
- if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return RET_PF_EMULATE;
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
-
-
- return nonpaging_map(vcpu, gva & PAGE_MASK,
- error_code, gfn, prefault);
-}
-
-static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
+static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ gfn_t gfn)
{
struct kvm_arch_async_pf arch;
@@ -4032,11 +4082,13 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
arch.direct_map = vcpu->arch.mmu->direct_map;
arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
- return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
+ return kvm_setup_async_pf(vcpu, cr2_or_gpa,
+ kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
+ gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
+ bool *writable)
{
struct kvm_memory_slot *slot;
bool async;
@@ -4056,12 +4108,12 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
return false; /* *pfn has correct page already */
if (!prefault && kvm_can_do_async_pf(vcpu)) {
- trace_kvm_try_async_get_page(gva, gfn);
+ trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
- trace_kvm_async_pf_doublefault(gva, gfn);
+ trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
return true;
- } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
+ } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
return true;
}
@@ -4069,11 +4121,77 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
return false;
}
+static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
+ bool prefault, int max_level, bool is_tdp)
+{
+ bool write = error_code & PFERR_WRITE_MASK;
+ bool exec = error_code & PFERR_FETCH_MASK;
+ bool lpage_disallowed = exec && is_nx_huge_page_enabled();
+ bool map_writable;
+
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ unsigned long mmu_seq;
+ kvm_pfn_t pfn;
+ int r;
+
+ if (page_fault_handle_page_track(vcpu, error_code, gfn))
+ return RET_PF_EMULATE;
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ if (lpage_disallowed)
+ max_level = PT_PAGE_TABLE_LEVEL;
+
+ if (fast_page_fault(vcpu, gpa, error_code))
+ return RET_PF_RETRY;
+
+ mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ smp_rmb();
+
+ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+ return RET_PF_RETRY;
+
+ if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
+ return r;
+
+ r = RET_PF_RETRY;
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+ goto out_unlock;
+ if (make_mmu_pages_available(vcpu) < 0)
+ goto out_unlock;
+ r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn,
+ prefault, is_tdp && lpage_disallowed);
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return r;
+}
+
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
+ u32 error_code, bool prefault)
+{
+ pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
+
+ /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
+ return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault,
+ PT_DIRECTORY_LEVEL, false);
+}
+
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len)
{
int r = 1;
+#ifndef CONFIG_X86_64
+ /* A 64-bit CR2 should be impossible on 32-bit KVM. */
+ if (WARN_ON_ONCE(fault_address >> 32))
+ return -EFAULT;
+#endif
+
vcpu->arch.l1tf_flush_l1d = true;
switch (vcpu->arch.apf.host_apf_reason) {
default:
@@ -4101,72 +4219,23 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
}
EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
-static bool
-check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
-{
- int page_num = KVM_PAGES_PER_HPAGE(level);
-
- gfn &= ~(page_num - 1);
-
- return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
-}
-
-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
- bool prefault)
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
+ bool prefault)
{
- kvm_pfn_t pfn;
- int r;
- int level;
- bool force_pt_level;
- gfn_t gfn = gpa >> PAGE_SHIFT;
- unsigned long mmu_seq;
- int write = error_code & PFERR_WRITE_MASK;
- bool map_writable;
+ int max_level;
- MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
+ for (max_level = PT_MAX_HUGEPAGE_LEVEL;
+ max_level > PT_PAGE_TABLE_LEVEL;
+ max_level--) {
+ int page_num = KVM_PAGES_PER_HPAGE(max_level);
+ gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1);
- if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return RET_PF_EMULATE;
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
- PT_DIRECTORY_LEVEL);
- level = mapping_level(vcpu, gfn, &force_pt_level);
- if (likely(!force_pt_level)) {
- if (level > PT_DIRECTORY_LEVEL &&
- !check_hugepage_cache_consistency(vcpu, gfn, level))
- level = PT_DIRECTORY_LEVEL;
- gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
+ break;
}
- if (fast_page_fault(vcpu, gpa, level, error_code))
- return RET_PF_RETRY;
-
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
-
- if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
- return RET_PF_RETRY;
-
- if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
- return r;
-
- r = RET_PF_RETRY;
- spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
- goto out_unlock;
- if (make_mmu_pages_available(vcpu) < 0)
- goto out_unlock;
- if (likely(!force_pt_level))
- transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
- r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
-out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
- return r;
+ return direct_page_fault(vcpu, gpa, error_code, prefault,
+ max_level, true);
}
static void nonpaging_init_context(struct kvm_vcpu *vcpu,
@@ -4233,10 +4302,17 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
return false;
if (cached_root_available(vcpu, new_cr3, new_role)) {
+ /*
+ * It is possible that the cached previous root page is
+ * obsolete because of a change in the MMU generation
+ * number. However, changing the generation number is
+ * accompanied by KVM_REQ_MMU_RELOAD, which will free
+ * the root set here and allocate a new one.
+ */
kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
if (!skip_tlb_flush) {
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
- kvm_x86_ops->tlb_flush(vcpu, true);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
/*
@@ -4286,7 +4362,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
}
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
- unsigned access, int *nr_present)
+ unsigned int access, int *nr_present)
{
if (unlikely(is_mmio_spte(*sptep))) {
if (gfn != get_mmio_spte_gfn(*sptep)) {
@@ -4849,7 +4925,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
return;
context->mmu_role.as_u64 = new_role.as_u64;
- context->page_fault = tdp_page_fault;
+ context->page_fault = kvm_tdp_page_fault;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
context->update_pte = nonpaging_update_pte;
@@ -5337,50 +5413,31 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
-static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
-{
- LIST_HEAD(invalid_list);
-
- if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
- return 0;
-
- while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
- if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
- break;
-
- ++vcpu->kvm->stat.mmu_recycled;
- }
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-
- if (!kvm_mmu_available_pages(vcpu->kvm))
- return -ENOSPC;
- return 0;
-}
-
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
void *insn, int insn_len)
{
int r, emulation_type = 0;
- enum emulation_result er;
bool direct = vcpu->arch.mmu->direct_map;
+ if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
+ return RET_PF_RETRY;
+
/* With shadow page tables, fault_address contains a GVA or nGPA. */
if (vcpu->arch.mmu->direct_map) {
vcpu->arch.gpa_available = true;
- vcpu->arch.gpa_val = cr2;
+ vcpu->arch.gpa_val = cr2_or_gpa;
}
r = RET_PF_INVALID;
if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, cr2, direct);
+ r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
if (r == RET_PF_EMULATE)
goto emulate;
}
if (r == RET_PF_INVALID) {
- r = vcpu->arch.mmu->page_fault(vcpu, cr2,
- lower_32_bits(error_code),
- false);
+ r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
+ lower_32_bits(error_code), false);
WARN_ON(r == RET_PF_INVALID);
}
@@ -5398,7 +5455,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
*/
if (vcpu->arch.mmu->direct_map &&
(error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
return 1;
}
@@ -5413,7 +5470,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
* explicitly shadowing L1's page tables, i.e. unprotecting something
* for L1 isn't going to magically fix whatever issue cause L2 to fail.
*/
- if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
+ if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
emulation_type = EMULTYPE_ALLOW_RETRY;
emulate:
/*
@@ -5428,19 +5485,8 @@ emulate:
return 1;
}
- er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
-
- switch (er) {
- case EMULATE_DONE:
- return 1;
- case EMULATE_USER_EXIT:
- ++vcpu->stat.mmio_exits;
- /* fall through */
- case EMULATE_FAIL:
- return 0;
- default:
- BUG();
- }
+ return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
+ insn_len);
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
@@ -5592,13 +5638,13 @@ slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
}
-static void free_mmu_pages(struct kvm_vcpu *vcpu)
+static void free_mmu_pages(struct kvm_mmu *mmu)
{
- free_page((unsigned long)vcpu->arch.mmu->pae_root);
- free_page((unsigned long)vcpu->arch.mmu->lm_root);
+ free_page((unsigned long)mmu->pae_root);
+ free_page((unsigned long)mmu->lm_root);
}
-static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
+static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
{
struct page *page;
int i;
@@ -5619,9 +5665,9 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
if (!page)
return -ENOMEM;
- vcpu->arch.mmu->pae_root = page_address(page);
+ mmu->pae_root = page_address(page);
for (i = 0; i < 4; ++i)
- vcpu->arch.mmu->pae_root[i] = INVALID_PAGE;
+ mmu->pae_root[i] = INVALID_PAGE;
return 0;
}
@@ -5629,6 +5675,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
uint i;
+ int ret;
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
@@ -5646,47 +5693,124 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
- return alloc_mmu_pages(vcpu);
-}
-static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- struct kvm_page_track_notifier_node *node)
-{
- struct kvm_mmu_page *sp;
- LIST_HEAD(invalid_list);
- unsigned long i;
- bool flush;
- gfn_t gfn;
+ ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu);
+ if (ret)
+ return ret;
- spin_lock(&kvm->mmu_lock);
+ ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu);
+ if (ret)
+ goto fail_allocate_root;
- if (list_empty(&kvm->arch.active_mmu_pages))
- goto out_unlock;
+ return ret;
+ fail_allocate_root:
+ free_mmu_pages(&vcpu->arch.guest_mmu);
+ return ret;
+}
- flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false);
+#define BATCH_ZAP_PAGES 10
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
+{
+ struct kvm_mmu_page *sp, *node;
+ int nr_zapped, batch = 0;
- for (i = 0; i < slot->npages; i++) {
- gfn = slot->base_gfn + i;
+restart:
+ list_for_each_entry_safe_reverse(sp, node,
+ &kvm->arch.active_mmu_pages, link) {
+ /*
+ * No obsolete valid page exists before a newly created page
+ * since active_mmu_pages is a FIFO list.
+ */
+ if (!is_obsolete_sp(kvm, sp))
+ break;
- for_each_valid_sp(kvm, sp, gfn) {
- if (sp->gfn != gfn)
- continue;
+ /*
+ * Skip invalid pages with a non-zero root count, zapping pages
+ * with a non-zero root count will never succeed, i.e. the page
+ * will get thrown back on active_mmu_pages and we'll get stuck
+ * in an infinite loop.
+ */
+ if (sp->role.invalid && sp->root_count)
+ continue;
- kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ /*
+ * No need to flush the TLB since we're only zapping shadow
+ * pages with an obsolete generation number and all vCPUS have
+ * loaded a new root, i.e. the shadow pages being zapped cannot
+ * be in active use by the guest.
+ */
+ if (batch >= BATCH_ZAP_PAGES &&
+ cond_resched_lock(&kvm->mmu_lock)) {
+ batch = 0;
+ goto restart;
}
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
- kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
- flush = false;
- cond_resched_lock(&kvm->mmu_lock);
+
+ if (__kvm_mmu_prepare_zap_page(kvm, sp,
+ &kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
+ batch += nr_zapped;
+ goto restart;
}
}
- kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
-out_unlock:
+ /*
+ * Trigger a remote TLB flush before freeing the page tables to ensure
+ * KVM is not in the middle of a lockless shadow page table walk, which
+ * may reference the pages.
+ */
+ kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
+}
+
+/*
+ * Fast invalidate all shadow pages and use lock-break technique
+ * to zap obsolete pages.
+ *
+ * It's required when memslot is being deleted or VM is being
+ * destroyed, in these cases, we should ensure that KVM MMU does
+ * not use any resource of the being-deleted slot or all slots
+ * after calling the function.
+ */
+static void kvm_mmu_zap_all_fast(struct kvm *kvm)
+{
+ lockdep_assert_held(&kvm->slots_lock);
+
+ spin_lock(&kvm->mmu_lock);
+ trace_kvm_mmu_zap_all_fast(kvm);
+
+ /*
+ * Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is
+ * held for the entire duration of zapping obsolete pages, it's
+ * impossible for there to be multiple invalid generations associated
+ * with *valid* shadow pages at any given time, i.e. there is exactly
+ * one valid generation and (at most) one invalid generation.
+ */
+ kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
+
+ /*
+ * Notify all vcpus to reload its shadow page table and flush TLB.
+ * Then all vcpus will switch to new shadow page table with the new
+ * mmu_valid_gen.
+ *
+ * Note: we need to do this under the protection of mmu_lock,
+ * otherwise, vcpu would purge shadow page but miss tlb flush.
+ */
+ kvm_reload_remote_mmus(kvm);
+
+ kvm_zap_obsolete_pages(kvm);
spin_unlock(&kvm->mmu_lock);
}
+static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
+{
+ return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+}
+
+static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ struct kvm_page_track_notifier_node *node)
+{
+ kvm_mmu_zap_all_fast(kvm);
+}
+
void kvm_mmu_init_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
@@ -5789,9 +5913,9 @@ restart:
* the guest, and the guest page table is using 4K page size
* mapping if the indirect sp has level = 1.
*/
- if (sp->role.direct &&
- !kvm_is_reserved_pfn(pfn) &&
- PageTransCompoundMap(pfn_to_page(pfn))) {
+ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
+ (kvm_is_zone_device_pfn(pfn) ||
+ PageCompound(pfn_to_page(pfn)))) {
pte_list_remove(rmap_head, sptep);
if (kvm_available_flush_tlb_with_range())
@@ -5877,7 +6001,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
-static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
+void kvm_mmu_zap_all(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list);
@@ -5886,14 +6010,10 @@ static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
spin_lock(&kvm->mmu_lock);
restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
- if (mmio_only && !sp->mmio_cached)
- continue;
if (sp->role.invalid && sp->root_count)
continue;
- if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) {
- WARN_ON_ONCE(mmio_only);
+ if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
goto restart;
- }
if (cond_resched_lock(&kvm->mmu_lock))
goto restart;
}
@@ -5902,11 +6022,6 @@ restart:
spin_unlock(&kvm->mmu_lock);
}
-void kvm_mmu_zap_all(struct kvm *kvm)
-{
- return __kvm_mmu_zap_all(kvm, false);
-}
-
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
{
WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
@@ -5928,7 +6043,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
*/
if (unlikely(gen == 0)) {
kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
- __kvm_mmu_zap_all(kvm, true);
+ kvm_mmu_zap_all_fast(kvm);
}
}
@@ -5959,16 +6074,24 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
* want to shrink a VM that only started to populate its MMU
* anyway.
*/
- if (!kvm->arch.n_used_mmu_pages)
+ if (!kvm->arch.n_used_mmu_pages &&
+ !kvm_has_zapped_obsolete_pages(kvm))
continue;
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
+ if (kvm_has_zapped_obsolete_pages(kvm)) {
+ kvm_mmu_commit_zap_page(kvm,
+ &kvm->arch.zapped_obsolete_pages);
+ goto unlock;
+ }
+
if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
freed++;
kvm_mmu_commit_zap_page(kvm, &invalid_list);
+unlock:
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
@@ -6025,16 +6148,65 @@ static void kvm_set_mmio_spte_mask(void)
* If reserved bit is not supported, clear the present bit to disable
* mmio page fault.
*/
- if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52)
+ if (shadow_phys_bits == 52)
mask &= ~1ull;
- kvm_mmu_set_mmio_spte_mask(mask, mask);
+ kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
+}
+
+static bool get_nx_auto_mode(void)
+{
+ /* Return true when CPU has the bug, and mitigations are ON */
+ return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
+}
+
+static void __set_nx_huge_pages(bool val)
+{
+ nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+ bool old_val = nx_huge_pages;
+ bool new_val;
+
+ /* In "auto" mode deploy workaround only if CPU has the bug. */
+ if (sysfs_streq(val, "off"))
+ new_val = 0;
+ else if (sysfs_streq(val, "force"))
+ new_val = 1;
+ else if (sysfs_streq(val, "auto"))
+ new_val = get_nx_auto_mode();
+ else if (strtobool(val, &new_val) < 0)
+ return -EINVAL;
+
+ __set_nx_huge_pages(new_val);
+
+ if (new_val != old_val) {
+ struct kvm *kvm;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ mutex_lock(&kvm->slots_lock);
+ kvm_mmu_zap_all_fast(kvm);
+ mutex_unlock(&kvm->slots_lock);
+
+ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+ }
+ mutex_unlock(&kvm_lock);
+ }
+
+ return 0;
}
int kvm_mmu_module_init(void)
{
int ret = -ENOMEM;
+ if (nx_huge_pages == -1)
+ __set_nx_huge_pages(get_nx_auto_mode());
+
/*
* MMU roles use union aliasing which is, generally speaking, an
* undefined behavior. However, we supposedly know how compilers behave
@@ -6102,7 +6274,8 @@ unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
kvm_mmu_unload(vcpu);
- free_mmu_pages(vcpu);
+ free_mmu_pages(&vcpu->arch.root_mmu);
+ free_mmu_pages(&vcpu->arch.guest_mmu);
mmu_free_memory_caches(vcpu);
}
@@ -6113,3 +6286,116 @@ void kvm_mmu_module_exit(void)
unregister_shrinker(&mmu_shrinker);
mmu_audit_disable();
}
+
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
+{
+ unsigned int old_val;
+ int err;
+
+ old_val = nx_huge_pages_recovery_ratio;
+ err = param_set_uint(val, kp);
+ if (err)
+ return err;
+
+ if (READ_ONCE(nx_huge_pages) &&
+ !old_val && nx_huge_pages_recovery_ratio) {
+ struct kvm *kvm;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+
+ mutex_unlock(&kvm_lock);
+ }
+
+ return err;
+}
+
+static void kvm_recover_nx_lpages(struct kvm *kvm)
+{
+ int rcu_idx;
+ struct kvm_mmu_page *sp;
+ unsigned int ratio;
+ LIST_HEAD(invalid_list);
+ ulong to_zap;
+
+ rcu_idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+
+ ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+ to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
+ while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
+ /*
+ * We use a separate list instead of just using active_mmu_pages
+ * because the number of lpage_disallowed pages is expected to
+ * be relatively small compared to the total.
+ */
+ sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
+ struct kvm_mmu_page,
+ lpage_disallowed_link);
+ WARN_ON_ONCE(!sp->lpage_disallowed);
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ WARN_ON_ONCE(sp->lpage_disallowed);
+
+ if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ if (to_zap)
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, rcu_idx);
+}
+
+static long get_nx_lpage_recovery_timeout(u64 start_time)
+{
+ return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
+ ? start_time + 60 * HZ - get_jiffies_64()
+ : MAX_SCHEDULE_TIMEOUT;
+}
+
+static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
+{
+ u64 start_time;
+ long remaining_time;
+
+ while (true) {
+ start_time = get_jiffies_64();
+ remaining_time = get_nx_lpage_recovery_timeout(start_time);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop() && remaining_time > 0) {
+ schedule_timeout(remaining_time);
+ remaining_time = get_nx_lpage_recovery_timeout(start_time);
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+
+ set_current_state(TASK_RUNNING);
+
+ if (kthread_should_stop())
+ return 0;
+
+ kvm_recover_nx_lpages(kvm);
+ }
+}
+
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+ int err;
+
+ err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
+ "kvm-nx-lpage-recovery",
+ &kvm->arch.nx_lpage_recovery_thread);
+ if (!err)
+ kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
+
+ return err;
+}
+
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
+{
+ if (kvm->arch.nx_lpage_recovery_thread)
+ kthread_stop(kvm->arch.nx_lpage_recovery_thread);
+}
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 3521e2d176f2..3521e2d176f2 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 7d5cdb3af594..e4c8a4cbf407 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -33,7 +33,7 @@
#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#define PT_HAVE_ACCESSED_DIRTY(mmu) true
#ifdef CONFIG_X86_64
- #define PT_MAX_FULL_LEVELS 4
+ #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
#define CMPXCHG cmpxchg
#else
#define CMPXCHG cmpxchg64
@@ -128,6 +128,21 @@ static inline int FNAME(is_present_gpte)(unsigned long pte)
#endif
}
+static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte)
+{
+#if PTTYPE != PTTYPE_EPT
+ return false;
+#else
+ return __is_bad_mt_xwr(rsvd_check, gpte);
+#endif
+}
+
+static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+ return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) ||
+ FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte);
+}
+
static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
pt_element_t __user *ptep_user, unsigned index,
pt_element_t orig_pte, pt_element_t new_pte)
@@ -175,9 +190,6 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
u64 gpte)
{
- if (is_rsvd_bits_set(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
- goto no_present;
-
if (!FNAME(is_present_gpte)(gpte))
goto no_present;
@@ -186,6 +198,9 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
!(gpte & PT_GUEST_ACCESSED_MASK))
goto no_present;
+ if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ goto no_present;
+
return false;
no_present:
@@ -291,11 +306,11 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
}
/*
- * Fetch a guest pte for a guest virtual address
+ * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
*/
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gva_t addr, u32 access)
+ gpa_t addr, u32 access)
{
int ret;
pt_element_t pte;
@@ -400,7 +415,7 @@ retry_walk:
if (unlikely(!FNAME(is_present_gpte)(pte)))
goto error;
- if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) {
+ if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) {
errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
goto error;
}
@@ -496,7 +511,7 @@ error:
}
static int FNAME(walk_addr)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr, u32 access)
+ struct kvm_vcpu *vcpu, gpa_t addr, u32 access)
{
return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
access);
@@ -611,16 +626,17 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
* If the guest tries to write a write-protected page, we need to
* emulate this operation, return 1 to indicate this case.
*/
-static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
struct guest_walker *gw,
- int write_fault, int hlevel,
- kvm_pfn_t pfn, bool map_writable, bool prefault)
+ int write_fault, int max_level,
+ kvm_pfn_t pfn, bool map_writable, bool prefault,
+ bool lpage_disallowed)
{
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
- int top_level, ret;
- gfn_t base_gfn;
+ int top_level, hlevel, ret;
+ gfn_t base_gfn = gw->gfn;
direct_access = gw->pte_access;
@@ -636,7 +652,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
if (FNAME(gpte_changed)(vcpu, gw, top_level))
goto out_gpte_changed;
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
+ if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
goto out_gpte_changed;
for (shadow_walk_init(&it, vcpu, addr);
@@ -665,12 +681,19 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
link_shadow_page(vcpu, it.sptep, sp);
}
- base_gfn = gw->gfn;
+ hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn);
trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
clear_sp_write_flooding_count(it.sptep);
+
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gw->gfn, &pfn, &hlevel);
+
base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
if (it.level == hlevel)
break;
@@ -683,6 +706,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
it.level - 1, true, direct_access);
link_shadow_page(vcpu, it.sptep, sp);
+ if (lpage_disallowed)
+ account_huge_nx_page(vcpu->kvm, sp);
}
}
@@ -750,7 +775,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
* Returns: 1 if we need to emulate the instruction, 0 otherwise, or
* a negative value on error.
*/
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
bool prefault)
{
int write_fault = error_code & PFERR_WRITE_MASK;
@@ -758,10 +783,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
struct guest_walker walker;
int r;
kvm_pfn_t pfn;
- int level = PT_PAGE_TABLE_LEVEL;
- bool force_pt_level = false;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
+ int max_level;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -801,14 +827,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
&walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
- if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) {
- level = mapping_level(vcpu, walker.gfn, &force_pt_level);
- if (likely(!force_pt_level)) {
- level = min(walker.level, level);
- walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
- }
- } else
- force_pt_level = true;
+ if (lpage_disallowed || is_self_change_mapping)
+ max_level = PT_PAGE_TABLE_LEVEL;
+ else
+ max_level = walker.level;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -848,10 +870,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
if (make_mmu_pages_available(vcpu) < 0)
goto out_unlock;
- if (!force_pt_level)
- transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
- r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
- level, pfn, map_writable, prefault);
+ r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn,
+ map_writable, prefault, lpage_disallowed);
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
out_unlock:
@@ -928,18 +948,19 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
spin_unlock(&vcpu->kvm->mmu_lock);
}
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
+/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access,
struct x86_exception *exception)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
int r;
- r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
+ r = FNAME(walk_addr)(&walker, vcpu, addr, access);
if (r) {
gpa = gfn_to_gpa(walker.gfn);
- gpa |= vaddr & ~PAGE_MASK;
+ gpa |= addr & ~PAGE_MASK;
} else if (exception)
*exception = walker.fault;
@@ -947,7 +968,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
}
#if PTTYPE != PTTYPE_EPT
-static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
+/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */
+static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
u32 access,
struct x86_exception *exception)
{
@@ -955,6 +977,11 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
gpa_t gpa = UNMAPPED_GVA;
int r;
+#ifndef CONFIG_X86_64
+ /* A 64-bit GVA should be impossible on 32-bit KVM. */
+ WARN_ON_ONCE(vaddr >> 32);
+#endif
+
r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
if (r) {
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index d8001b4bca05..3c6522b84ff1 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -8,16 +8,18 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvmmmu
-#define KVM_MMU_PAGE_FIELDS \
- __field(__u64, gfn) \
- __field(__u32, role) \
- __field(__u32, root_count) \
+#define KVM_MMU_PAGE_FIELDS \
+ __field(__u8, mmu_valid_gen) \
+ __field(__u64, gfn) \
+ __field(__u32, role) \
+ __field(__u32, root_count) \
__field(bool, unsync)
-#define KVM_MMU_PAGE_ASSIGN(sp) \
- __entry->gfn = sp->gfn; \
- __entry->role = sp->role.word; \
- __entry->root_count = sp->root_count; \
+#define KVM_MMU_PAGE_ASSIGN(sp) \
+ __entry->mmu_valid_gen = sp->mmu_valid_gen; \
+ __entry->gfn = sp->gfn; \
+ __entry->role = sp->role.word; \
+ __entry->root_count = sp->root_count; \
__entry->unsync = sp->unsync;
#define KVM_MMU_PAGE_PRINTK() ({ \
@@ -29,8 +31,9 @@
\
role.word = __entry->role; \
\
- trace_seq_printf(p, "sp gfn %llx l%u %u-byte q%u%s %s%s" \
+ trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" \
" %snxe %sad root %u %s%c", \
+ __entry->mmu_valid_gen, \
__entry->gfn, role.level, \
role.gpte_is_8_bytes ? 8 : 4, \
role.quadrant, \
@@ -246,13 +249,13 @@ TRACE_EVENT(
TRACE_EVENT(
fast_page_fault,
- TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+ TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code,
u64 *sptep, u64 old_spte, bool retry),
- TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry),
+ TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry),
TP_STRUCT__entry(
__field(int, vcpu_id)
- __field(gva_t, gva)
+ __field(gpa_t, cr2_or_gpa)
__field(u32, error_code)
__field(u64 *, sptep)
__field(u64, old_spte)
@@ -262,7 +265,7 @@ TRACE_EVENT(
TP_fast_assign(
__entry->vcpu_id = vcpu->vcpu_id;
- __entry->gva = gva;
+ __entry->cr2_or_gpa = cr2_or_gpa;
__entry->error_code = error_code;
__entry->sptep = sptep;
__entry->old_spte = old_spte;
@@ -270,9 +273,9 @@ TRACE_EVENT(
__entry->retry = retry;
),
- TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx"
+ TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx"
" new %llx spurious %d fixed %d", __entry->vcpu_id,
- __entry->gva, __print_flags(__entry->error_code, "|",
+ __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|",
kvm_mmu_trace_pferr_flags), __entry->sptep,
__entry->old_spte, __entry->new_spte,
__spte_satisfied(old_spte), __spte_satisfied(new_spte)
@@ -280,6 +283,27 @@ TRACE_EVENT(
);
TRACE_EVENT(
+ kvm_mmu_zap_all_fast,
+ TP_PROTO(struct kvm *kvm),
+ TP_ARGS(kvm),
+
+ TP_STRUCT__entry(
+ __field(__u8, mmu_valid_gen)
+ __field(unsigned int, mmu_used_pages)
+ ),
+
+ TP_fast_assign(
+ __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+ __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
+ ),
+
+ TP_printk("kvm-mmu-valid-gen %u used_pages %x",
+ __entry->mmu_valid_gen, __entry->mmu_used_pages
+ )
+);
+
+
+TRACE_EVENT(
check_mmio_spte,
TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
TP_ARGS(spte, kvm_gen, spte_gen),
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 25ce3edd1872..7f0059aa30e1 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -192,11 +192,15 @@ static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
break;
case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
*seg = 1;
- *unit = msr - MSR_MTRRfix16K_80000;
+ *unit = array_index_nospec(
+ msr - MSR_MTRRfix16K_80000,
+ MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
break;
case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
*seg = 2;
- *unit = msr - MSR_MTRRfix4K_C0000;
+ *unit = array_index_nospec(
+ msr - MSR_MTRRfix4K_C0000,
+ MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
break;
default:
return false;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 46875bbd0419..bcc6a73d6628 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -62,8 +62,7 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
- if (!test_and_set_bit(pmc->idx,
- (unsigned long *)&pmu->reprogram_pmi)) {
+ if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}
@@ -76,8 +75,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event,
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
- if (!test_and_set_bit(pmc->idx,
- (unsigned long *)&pmu->reprogram_pmi)) {
+ if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
@@ -137,7 +135,37 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
}
pmc->perf_event = event;
- clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi);
+ pmc_to_pmu(pmc)->event_count++;
+ clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
+}
+
+static void pmc_pause_counter(struct kvm_pmc *pmc)
+{
+ u64 counter = pmc->counter;
+
+ if (!pmc->perf_event)
+ return;
+
+ /* update counter, reset event value to avoid redundant accumulation */
+ counter += perf_event_pause(pmc->perf_event, true);
+ pmc->counter = counter & pmc_bitmask(pmc);
+}
+
+static bool pmc_resume_counter(struct kvm_pmc *pmc)
+{
+ if (!pmc->perf_event)
+ return false;
+
+ /* recalibrate sample period and check if it's accepted by perf core */
+ if (perf_event_period(pmc->perf_event,
+ (-pmc->counter) & pmc_bitmask(pmc)))
+ return false;
+
+ /* reuse perf_event to serve as pmc_reprogram_counter() does*/
+ perf_event_enable(pmc->perf_event);
+
+ clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
+ return true;
}
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
@@ -154,7 +182,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
pmc->eventsel = eventsel;
- pmc_stop_counter(pmc);
+ pmc_pause_counter(pmc);
if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
return;
@@ -193,6 +221,12 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
if (type == PERF_TYPE_RAW)
config = eventsel & X86_RAW_EVENT_MASK;
+ if (pmc->current_config == eventsel && pmc_resume_counter(pmc))
+ return;
+
+ pmc_release_perf_event(pmc);
+
+ pmc->current_config = eventsel;
pmc_reprogram_counter(pmc, type, config,
!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
@@ -209,7 +243,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
struct kvm_pmu_event_filter *filter;
struct kvm *kvm = pmc->vcpu->kvm;
- pmc_stop_counter(pmc);
+ pmc_pause_counter(pmc);
if (!en_field || !pmc_is_enabled(pmc))
return;
@@ -224,6 +258,12 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
return;
}
+ if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc))
+ return;
+
+ pmc_release_perf_event(pmc);
+
+ pmc->current_config = (u64)ctrl;
pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
kvm_x86_ops->pmu_ops->find_fixed_event(idx),
!(en_field & 0x2), /* exclude user */
@@ -253,27 +293,32 @@ EXPORT_SYMBOL_GPL(reprogram_counter);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- u64 bitmask;
int bit;
- bitmask = pmu->reprogram_pmi;
-
- for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
+ for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit);
if (unlikely(!pmc || !pmc->perf_event)) {
- clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
+ clear_bit(bit, pmu->reprogram_pmi);
continue;
}
reprogram_counter(pmu, bit);
}
+
+ /*
+ * Unused perf_events are only released if the corresponding MSRs
+ * weren't accessed during the last vCPU time slice. kvm_arch_sched_in
+ * triggers KVM_REQ_PMU if cleanup is needed.
+ */
+ if (unlikely(pmu->need_cleanup))
+ kvm_pmu_cleanup(vcpu);
}
/* check if idx is a valid index to access PMU */
-int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
- return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx);
+ return kvm_x86_ops->pmu_ops->is_valid_rdpmc_ecx(vcpu, idx);
}
bool is_vmware_backdoor_pmc(u32 pmc_idx)
@@ -323,7 +368,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
if (is_vmware_backdoor_pmc(idx))
return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
- pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx, &mask);
+ pmc = kvm_x86_ops->pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask);
if (!pmc)
return 1;
@@ -339,7 +384,17 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
- return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr);
+ return kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, msr) ||
+ kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr);
+}
+
+static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, msr);
+
+ if (pmc)
+ __set_bit(pmc->idx, pmu->pmc_in_use);
}
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
@@ -349,6 +404,7 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
+ kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info);
}
@@ -376,9 +432,45 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
memset(pmu, 0, sizeof(*pmu));
kvm_x86_ops->pmu_ops->init(vcpu);
init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
+ pmu->event_count = 0;
+ pmu->need_cleanup = false;
kvm_pmu_refresh(vcpu);
}
+static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (pmc_is_fixed(pmc))
+ return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
+ pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
+
+ return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
+}
+
+/* Release perf_events for vPMCs that have been unused for a full time slice. */
+void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc = NULL;
+ DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
+ int i;
+
+ pmu->need_cleanup = false;
+
+ bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
+ pmu->pmc_in_use, X86_PMC_IDX_MAX);
+
+ for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
+ pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, i);
+
+ if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
+ pmc_stop_counter(pmc);
+ }
+
+ bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
+}
+
void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
kvm_pmu_reset(vcpu);
@@ -416,8 +508,8 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
*filter = tmp;
mutex_lock(&kvm->lock);
- rcu_swap_protected(kvm->arch.pmu_event_filter, filter,
- mutex_is_locked(&kvm->lock));
+ filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
+ mutex_is_locked(&kvm->lock));
mutex_unlock(&kvm->lock);
synchronize_srcu_expedited(&kvm->srcu);
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 58265f761c3b..13332984b6d5 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -2,6 +2,8 @@
#ifndef __KVM_X86_PMU_H
#define __KVM_X86_PMU_H
+#include <linux/nospec.h>
+
#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
@@ -25,9 +27,10 @@ struct kvm_pmu_ops {
unsigned (*find_fixed_event)(int idx);
bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
- struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx,
- u64 *mask);
- int (*is_valid_msr_idx)(struct kvm_vcpu *vcpu, unsigned idx);
+ struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu,
+ unsigned int idx, u64 *mask);
+ struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr);
+ int (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx);
bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
@@ -55,12 +58,21 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
return counter & pmc_bitmask(pmc);
}
-static inline void pmc_stop_counter(struct kvm_pmc *pmc)
+static inline void pmc_release_perf_event(struct kvm_pmc *pmc)
{
if (pmc->perf_event) {
- pmc->counter = pmc_read_counter(pmc);
perf_event_release_kernel(pmc->perf_event);
pmc->perf_event = NULL;
+ pmc->current_config = 0;
+ pmc_to_pmu(pmc)->event_count--;
+ }
+}
+
+static inline void pmc_stop_counter(struct kvm_pmc *pmc)
+{
+ if (pmc->perf_event) {
+ pmc->counter = pmc_read_counter(pmc);
+ pmc_release_perf_event(pmc);
}
}
@@ -79,6 +91,12 @@ static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
return kvm_x86_ops->pmu_ops->pmc_is_enabled(pmc);
}
+static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu,
+ u64 data)
+{
+ return !(pmu->global_ctrl_mask & data);
+}
+
/* returns general purpose PMC with the specified MSR. Note that it can be
* used for both PERFCTRn and EVNTSELn; that is why it accepts base as a
* paramenter to tell them apart.
@@ -86,8 +104,12 @@ static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
u32 base)
{
- if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
- return &pmu->gp_counters[msr - base];
+ if (msr >= base && msr < base + pmu->nr_arch_gp_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_gp_counters);
+
+ return &pmu->gp_counters[index];
+ }
return NULL;
}
@@ -97,8 +119,12 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
{
int base = MSR_CORE_PERF_FIXED_CTR0;
- if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
- return &pmu->fixed_counters[msr - base];
+ if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_fixed_counters);
+
+ return &pmu->fixed_counters[index];
+ }
return NULL;
}
@@ -110,13 +136,14 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
-int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx);
+int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx);
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
void kvm_pmu_refresh(struct kvm_vcpu *vcpu);
void kvm_pmu_reset(struct kvm_vcpu *vcpu);
void kvm_pmu_init(struct kvm_vcpu *vcpu);
+void kvm_pmu_cleanup(struct kvm_vcpu *vcpu);
void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
index c8388389a3b0..ce0b10fe5e2b 100644
--- a/arch/x86/kvm/pmu_amd.c
+++ b/arch/x86/kvm/pmu_amd.c
@@ -174,7 +174,7 @@ static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
}
/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
-static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+static int amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -184,7 +184,8 @@ static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
}
/* idx is the ECX register of RDPMC instruction */
-static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *mask)
+static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
+ unsigned int idx, u64 *mask)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *counters;
@@ -199,13 +200,19 @@ static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx, u
static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
+ /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */
+ return false;
+}
+
+static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
+{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- int ret = false;
+ struct kvm_pmc *pmc;
- ret = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER) ||
- get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ pmc = pmc ? pmc : get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
- return ret;
+ return pmc;
}
static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
@@ -272,6 +279,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->nr_arch_fixed_counters = 0;
pmu->global_status = 0;
+ bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters);
}
static void amd_pmu_init(struct kvm_vcpu *vcpu)
@@ -285,6 +293,7 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu)
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
+ pmu->gp_counters[i].current_config = 0;
}
}
@@ -306,8 +315,9 @@ struct kvm_pmu_ops amd_pmu_ops = {
.find_fixed_event = amd_find_fixed_event,
.pmc_is_enabled = amd_pmc_is_enabled,
.pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
+ .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,
.msr_idx_to_pmc = amd_msr_idx_to_pmc,
- .is_valid_msr_idx = amd_is_valid_msr_idx,
+ .is_valid_rdpmc_ecx = amd_is_valid_rdpmc_ecx,
.is_valid_msr = amd_is_valid_msr,
.get_msr = amd_pmu_get_msr,
.set_msr = amd_pmu_set_msr,
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d685491fce4d..bef0ba35f121 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -38,6 +38,7 @@
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
+#include <linux/rwsem.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
@@ -68,10 +69,8 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3
-#define SVM_FEATURE_NPT (1 << 0)
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_FEATURE_SVML (1 << 2)
-#define SVM_FEATURE_NRIP (1 << 3)
#define SVM_FEATURE_TSC_RATE (1 << 4)
#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
#define SVM_FEATURE_FLUSH_ASID (1 << 6)
@@ -388,6 +387,8 @@ static u8 rsm_ins_bytes[] = "\x0f\xaa";
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
static void svm_complete_interrupts(struct vcpu_svm *svm);
+static void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate);
+static inline void avic_post_state_restore(struct kvm_vcpu *vcpu);
static int nested_svm_exit_handled(struct vcpu_svm *svm);
static int nested_svm_intercept(struct vcpu_svm *svm);
@@ -420,9 +421,13 @@ enum {
#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+static int sev_flush_asids(void);
+static DECLARE_RWSEM(sev_deactivate_lock);
+static DEFINE_MUTEX(sev_bitmap_lock);
static unsigned int max_sev_asid;
static unsigned int min_sev_asid;
static unsigned long *sev_asid_bitmap;
+static unsigned long *sev_reclaim_asid_bitmap;
#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
struct enc_region {
@@ -736,8 +741,14 @@ static int get_npt_level(struct kvm_vcpu *vcpu)
static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
vcpu->arch.efer = efer;
- if (!npt_enabled && !(efer & EFER_LMA))
- efer &= ~EFER_LME;
+
+ if (!npt_enabled) {
+ /* Shadow paging assumes NX to be available. */
+ efer |= EFER_NX;
+
+ if (!(efer & EFER_LMA))
+ efer &= ~EFER_LME;
+ }
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
@@ -770,7 +781,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
}
-static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
+static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -780,17 +791,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
}
if (!svm->next_rip) {
- if (kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) !=
- EMULATE_DONE)
- printk(KERN_DEBUG "%s: NOP\n", __func__);
- return;
+ if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
+ return 0;
+ } else {
+ if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
+ pr_err("%s: ip 0x%lx next 0x%llx\n",
+ __func__, kvm_rip_read(vcpu), svm->next_rip);
+ kvm_rip_write(vcpu, svm->next_rip);
}
- if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
- printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
- __func__, kvm_rip_read(vcpu), svm->next_rip);
-
- kvm_rip_write(vcpu, svm->next_rip);
svm_set_interrupt_shadow(vcpu, 0);
+
+ return 1;
}
static void svm_queue_exception(struct kvm_vcpu *vcpu)
@@ -821,7 +832,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
* raises a fault that is not intercepted. Still better than
* failing in all cases.
*/
- skip_emulated_instruction(&svm->vcpu);
+ (void)skip_emulated_instruction(&svm->vcpu);
rip = kvm_rip_read(&svm->vcpu);
svm->int3_rip = rip + svm->vmcb->save.cs.base;
svm->int3_injected = rip - old_rip;
@@ -1231,11 +1242,15 @@ static __init int sev_hardware_setup(void)
/* Minimum ASID value that should be used for SEV guest */
min_sev_asid = cpuid_edx(0x8000001F);
- /* Initialize SEV ASID bitmap */
+ /* Initialize SEV ASID bitmaps */
sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
if (!sev_asid_bitmap)
return 1;
+ sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
+ if (!sev_reclaim_asid_bitmap)
+ return 1;
+
status = kmalloc(sizeof(*status), GFP_KERNEL);
if (!status)
return 1;
@@ -1269,11 +1284,11 @@ static void grow_ple_window(struct kvm_vcpu *vcpu)
pause_filter_count_grow,
pause_filter_count_max);
- if (control->pause_filter_count != old)
+ if (control->pause_filter_count != old) {
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
-
- trace_kvm_ple_window_grow(vcpu->vcpu_id,
- control->pause_filter_count, old);
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+ }
}
static void shrink_ple_window(struct kvm_vcpu *vcpu)
@@ -1287,11 +1302,52 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
pause_filter_count,
pause_filter_count_shrink,
pause_filter_count);
- if (control->pause_filter_count != old)
+ if (control->pause_filter_count != old) {
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+ }
+}
- trace_kvm_ple_window_shrink(vcpu->vcpu_id,
- control->pause_filter_count, old);
+/*
+ * The default MMIO mask is a single bit (excluding the present bit),
+ * which could conflict with the memory encryption bit. Check for
+ * memory encryption support and override the default MMIO mask if
+ * memory encryption is enabled.
+ */
+static __init void svm_adjust_mmio_mask(void)
+{
+ unsigned int enc_bit, mask_bit;
+ u64 msr, mask;
+
+ /* If there is no memory encryption support, use existing mask */
+ if (cpuid_eax(0x80000000) < 0x8000001f)
+ return;
+
+ /* If memory encryption is not enabled, use existing mask */
+ rdmsrl(MSR_K8_SYSCFG, msr);
+ if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ return;
+
+ enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
+ mask_bit = boot_cpu_data.x86_phys_bits;
+
+ /* Increment the mask bit if it is the same as the encryption bit */
+ if (enc_bit == mask_bit)
+ mask_bit++;
+
+ /*
+ * If the mask bit location is below 52, then some bits above the
+ * physical addressing limit will always be reserved, so use the
+ * rsvd_bits() function to generate the mask. This mask, along with
+ * the present bit, will be used to generate a page fault with
+ * PFER.RSV = 1.
+ *
+ * If the mask bit location is 52 (or above), then clear the mask.
+ */
+ mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
}
static __init int svm_hardware_setup(void)
@@ -1348,6 +1404,8 @@ static __init int svm_hardware_setup(void)
}
}
+ svm_adjust_mmio_mask();
+
for_each_possible_cpu(cpu) {
r = svm_cpu_init(cpu);
if (r)
@@ -1414,8 +1472,12 @@ static __exit void svm_hardware_unsetup(void)
{
int cpu;
- if (svm_sev_enabled())
+ if (svm_sev_enabled()) {
bitmap_free(sev_asid_bitmap);
+ bitmap_free(sev_reclaim_asid_bitmap);
+
+ sev_flush_asids();
+ }
for_each_possible_cpu(cpu)
svm_cpu_uninit(cpu);
@@ -1485,7 +1547,10 @@ static void avic_init_vmcb(struct vcpu_svm *svm)
vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
- vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+ if (kvm_apicv_activated(svm->vcpu.kvm))
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+ else
+ vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
}
static void init_vmcb(struct vcpu_svm *svm)
@@ -1542,6 +1607,7 @@ static void init_vmcb(struct vcpu_svm *svm)
set_intercept(svm, INTERCEPT_SKINIT);
set_intercept(svm, INTERCEPT_WBINVD);
set_intercept(svm, INTERCEPT_XSETBV);
+ set_intercept(svm, INTERCEPT_RDPRU);
set_intercept(svm, INTERCEPT_RSM);
if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
@@ -1668,23 +1734,28 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
* field of the VMCB. Therefore, we set up the
* APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
*/
-static int avic_init_access_page(struct kvm_vcpu *vcpu)
+static int avic_update_access_page(struct kvm *kvm, bool activate)
{
- struct kvm *kvm = vcpu->kvm;
int ret = 0;
mutex_lock(&kvm->slots_lock);
- if (kvm->arch.apic_access_page_done)
+ /*
+ * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger
+ * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT
+ * memory region. So, we need to ensure that kvm->mm == current->mm.
+ */
+ if ((kvm->arch.apic_access_page_done == activate) ||
+ (kvm->mm != current->mm))
goto out;
ret = __x86_set_memory_region(kvm,
APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
APIC_DEFAULT_PHYS_BASE,
- PAGE_SIZE);
+ activate ? PAGE_SIZE : 0);
if (ret)
goto out;
- kvm->arch.apic_access_page_done = true;
+ kvm->arch.apic_access_page_done = activate;
out:
mutex_unlock(&kvm->slots_lock);
return ret;
@@ -1692,21 +1763,24 @@ out:
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
- int ret;
u64 *entry, new_entry;
int id = vcpu->vcpu_id;
struct vcpu_svm *svm = to_svm(vcpu);
- ret = avic_init_access_page(vcpu);
- if (ret)
- return ret;
-
if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
return -EINVAL;
if (!svm->vcpu.arch.apic->regs)
return -EINVAL;
+ if (kvm_apicv_activated(vcpu->kvm)) {
+ int ret;
+
+ ret = avic_update_access_page(vcpu->kvm, true);
+ if (ret)
+ return ret;
+ }
+
svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
/* Setting AVIC backing page address in the phy APIC ID table */
@@ -1714,7 +1788,6 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
if (!entry)
return -EINVAL;
- new_entry = READ_ONCE(*entry);
new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
@@ -1725,25 +1798,22 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return 0;
}
-static void __sev_asid_free(int asid)
+static void sev_asid_free(int asid)
{
struct svm_cpu_data *sd;
int cpu, pos;
+ mutex_lock(&sev_bitmap_lock);
+
pos = asid - 1;
- clear_bit(pos, sev_asid_bitmap);
+ __set_bit(pos, sev_reclaim_asid_bitmap);
for_each_possible_cpu(cpu) {
sd = per_cpu(svm_data, cpu);
sd->sev_vmcbs[pos] = NULL;
}
-}
-
-static void sev_asid_free(struct kvm *kvm)
-{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- __sev_asid_free(sev->asid);
+ mutex_unlock(&sev_bitmap_lock);
}
static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
@@ -1760,10 +1830,12 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
/* deactivate handle */
data->handle = handle;
+
+ /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
+ down_read(&sev_deactivate_lock);
sev_guest_deactivate(data, NULL);
+ up_read(&sev_deactivate_lock);
- wbinvd_on_all_cpus();
- sev_guest_df_flush(NULL);
kfree(data);
decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
@@ -1912,7 +1984,7 @@ static void sev_vm_destroy(struct kvm *kvm)
mutex_unlock(&kvm->lock);
sev_unbind_asid(kvm, sev->handle);
- sev_asid_free(kvm);
+ sev_asid_free(sev->asid);
}
static void avic_vm_destroy(struct kvm *kvm)
@@ -1993,6 +2065,18 @@ free_avic:
return err;
}
+static int svm_vm_init(struct kvm *kvm)
+{
+ if (avic) {
+ int ret = avic_vm_init(kvm);
+ if (ret)
+ return ret;
+ }
+
+ kvm_apicv_init(kvm, avic);
+ return 0;
+}
+
static inline int
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
{
@@ -2091,7 +2175,6 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
u32 dummy;
u32 eax = 1;
- vcpu->arch.microcode_version = 0x01000065;
svm->spec_ctrl = 0;
svm->virt_spec_ctrl = 0;
@@ -2128,7 +2211,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm)
return ret;
}
-static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
+static int svm_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
struct page *page;
@@ -2137,36 +2220,13 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
struct page *nested_msrpm_pages;
int err;
- svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
- if (!svm) {
- err = -ENOMEM;
- goto out;
- }
-
- svm->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
- GFP_KERNEL_ACCOUNT);
- if (!svm->vcpu.arch.user_fpu) {
- printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n");
- err = -ENOMEM;
- goto free_partial_svm;
- }
-
- svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
- GFP_KERNEL_ACCOUNT);
- if (!svm->vcpu.arch.guest_fpu) {
- printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
- err = -ENOMEM;
- goto free_user_fpu;
- }
-
- err = kvm_vcpu_init(&svm->vcpu, kvm, id);
- if (err)
- goto free_svm;
+ BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
+ svm = to_svm(vcpu);
err = -ENOMEM;
page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!page)
- goto uninit;
+ goto out;
msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
if (!msrpm_pages)
@@ -2187,7 +2247,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
/* We initialize this flag to true to make sure that the is_running
* bit would be set the first time the vcpu is loaded.
*/
- svm->avic_is_running = true;
+ if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
+ svm->avic_is_running = true;
svm->nested.hsave = page_address(hsave_page);
@@ -2203,9 +2264,10 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
svm->asid_generation = 0;
init_vmcb(svm);
- svm_init_osvw(&svm->vcpu);
+ svm_init_osvw(vcpu);
+ vcpu->arch.microcode_version = 0x01000065;
- return &svm->vcpu;
+ return 0;
free_page4:
__free_page(hsave_page);
@@ -2215,16 +2277,8 @@ free_page2:
__free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
free_page1:
__free_page(page);
-uninit:
- kvm_vcpu_uninit(&svm->vcpu);
-free_svm:
- kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu);
-free_user_fpu:
- kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu);
-free_partial_svm:
- kmem_cache_free(kvm_vcpu_cache, svm);
out:
- return ERR_PTR(err);
+ return err;
}
static void svm_clear_current_vmcb(struct vmcb *vmcb)
@@ -2250,10 +2304,6 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
__free_page(virt_to_page(svm->nested.hsave));
__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
- kvm_vcpu_uninit(vcpu);
- kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu);
- kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu);
- kmem_cache_free(kvm_vcpu_cache, svm);
}
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -2324,6 +2374,8 @@ static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
+ if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
+ kvm_vcpu_update_apicv(vcpu);
avic_set_running(vcpu, true);
}
@@ -2363,7 +2415,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
break;
default:
- BUG();
+ WARN_ON_ONCE(1);
}
}
@@ -2516,10 +2568,6 @@ static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
}
-static void svm_decache_cr3(struct kvm_vcpu *vcpu)
-{
-}
-
static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
}
@@ -2769,17 +2817,18 @@ static int gp_interception(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
u32 error_code = svm->vmcb->control.exit_info_1;
- int er;
WARN_ON_ONCE(!enable_vmware_backdoor);
- er = kvm_emulate_instruction(vcpu,
- EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
- if (er == EMULATE_USER_EXIT)
- return 0;
- else if (er != EMULATE_DONE)
+ /*
+ * VMware backdoor emulation on #GP interception only handles IN{S},
+ * OUT{S}, and RDPMC, none of which generate a non-zero error code.
+ */
+ if (error_code) {
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
- return 1;
+ return 1;
+ }
+ return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
}
static bool is_erratum_383(void)
@@ -2877,7 +2926,7 @@ static int io_interception(struct vcpu_svm *svm)
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
if (string)
- return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, 0);
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -2904,13 +2953,11 @@ static int nop_on_interception(struct vcpu_svm *svm)
static int halt_interception(struct vcpu_svm *svm)
{
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
return kvm_emulate_halt(&svm->vcpu);
}
static int vmmcall_interception(struct vcpu_svm *svm)
{
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
return kvm_emulate_hypercall(&svm->vcpu);
}
@@ -3589,9 +3636,9 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
mark_all_dirty(svm->vmcb);
}
-static bool nested_svm_vmrun(struct vcpu_svm *svm)
+static int nested_svm_vmrun(struct vcpu_svm *svm)
{
- int rc;
+ int ret;
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
@@ -3600,13 +3647,16 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
vmcb_gpa = svm->vmcb->save.rax;
- rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
- if (rc) {
- if (rc == -EINVAL)
- kvm_inject_gp(&svm->vcpu, 0);
- return false;
+ ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
+ if (ret == -EINVAL) {
+ kvm_inject_gp(&svm->vcpu, 0);
+ return 1;
+ } else if (ret) {
+ return kvm_skip_emulated_instruction(&svm->vcpu);
}
+ ret = kvm_skip_emulated_instruction(&svm->vcpu);
+
nested_vmcb = map.hva;
if (!nested_vmcb_checks(nested_vmcb)) {
@@ -3617,7 +3667,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
kvm_vcpu_unmap(&svm->vcpu, &map, true);
- return false;
+ return ret;
}
trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
@@ -3661,7 +3711,16 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map);
- return true;
+ if (!nested_svm_vmrun_msrpm(svm)) {
+ svm->vmcb->control.exit_code = SVM_EXIT_ERR;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ nested_svm_vmexit(svm);
+ }
+
+ return ret;
}
static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
@@ -3698,7 +3757,6 @@ static int vmload_interception(struct vcpu_svm *svm)
nested_vmcb = map.hva;
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
ret = kvm_skip_emulated_instruction(&svm->vcpu);
nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
@@ -3725,7 +3783,6 @@ static int vmsave_interception(struct vcpu_svm *svm)
nested_vmcb = map.hva;
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
ret = kvm_skip_emulated_instruction(&svm->vcpu);
nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
@@ -3739,27 +3796,7 @@ static int vmrun_interception(struct vcpu_svm *svm)
if (nested_svm_check_permissions(svm))
return 1;
- /* Save rip after vmrun instruction */
- kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
-
- if (!nested_svm_vmrun(svm))
- return 1;
-
- if (!nested_svm_vmrun_msrpm(svm))
- goto failed;
-
- return 1;
-
-failed:
-
- svm->vmcb->control.exit_code = SVM_EXIT_ERR;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-
- return 1;
+ return nested_svm_vmrun(svm);
}
static int stgi_interception(struct vcpu_svm *svm)
@@ -3776,7 +3813,6 @@ static int stgi_interception(struct vcpu_svm *svm)
if (vgif_enabled(svm))
clr_intercept(svm, INTERCEPT_STGI);
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
ret = kvm_skip_emulated_instruction(&svm->vcpu);
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
@@ -3792,7 +3828,6 @@ static int clgi_interception(struct vcpu_svm *svm)
if (nested_svm_check_permissions(svm))
return 1;
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
ret = kvm_skip_emulated_instruction(&svm->vcpu);
disable_gif(svm);
@@ -3817,7 +3852,6 @@ static int invlpga_interception(struct vcpu_svm *svm)
/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
return kvm_skip_emulated_instruction(&svm->vcpu);
}
@@ -3840,13 +3874,18 @@ static int xsetbv_interception(struct vcpu_svm *svm)
u32 index = kvm_rcx_read(&svm->vcpu);
if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
return kvm_skip_emulated_instruction(&svm->vcpu);
}
return 1;
}
+static int rdpru_interception(struct vcpu_svm *svm)
+{
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+}
+
static int task_switch_interception(struct vcpu_svm *svm)
{
u16 tss_selector;
@@ -3899,25 +3938,20 @@ static int task_switch_interception(struct vcpu_svm *svm)
if (reason != TASK_SWITCH_GATE ||
int_type == SVM_EXITINTINFO_TYPE_SOFT ||
(int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
- (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
- skip_emulated_instruction(&svm->vcpu);
+ (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
+ if (!skip_emulated_instruction(&svm->vcpu))
+ return 0;
+ }
if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
int_vec = -1;
- if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
- has_error_code, error_code) == EMULATE_FAIL) {
- svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- svm->vcpu.run->internal.ndata = 0;
- return 0;
- }
- return 1;
+ return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
+ has_error_code, error_code);
}
static int cpuid_interception(struct vcpu_svm *svm)
{
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
return kvm_emulate_cpuid(&svm->vcpu);
}
@@ -3934,7 +3968,7 @@ static int iret_interception(struct vcpu_svm *svm)
static int invlpg_interception(struct vcpu_svm *svm)
{
if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
- return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(&svm->vcpu, 0);
kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
return kvm_skip_emulated_instruction(&svm->vcpu);
@@ -3942,13 +3976,12 @@ static int invlpg_interception(struct vcpu_svm *svm)
static int emulate_on_interception(struct vcpu_svm *svm)
{
- return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(&svm->vcpu, 0);
}
static int rsm_interception(struct vcpu_svm *svm)
{
- return kvm_emulate_instruction_from_buffer(&svm->vcpu,
- rsm_ins_bytes, 2) == EMULATE_DONE;
+ return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
}
static int rdpmc_interception(struct vcpu_svm *svm)
@@ -4192,6 +4225,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
!guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
!guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
return 1;
@@ -4233,23 +4268,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
static int rdmsr_interception(struct vcpu_svm *svm)
{
- u32 ecx = kvm_rcx_read(&svm->vcpu);
- struct msr_data msr_info;
-
- msr_info.index = ecx;
- msr_info.host_initiated = false;
- if (svm_get_msr(&svm->vcpu, &msr_info)) {
- trace_kvm_msr_read_ex(ecx);
- kvm_inject_gp(&svm->vcpu, 0);
- return 1;
- } else {
- trace_kvm_msr_read(ecx, msr_info.data);
-
- kvm_rax_write(&svm->vcpu, msr_info.data & 0xffffffff);
- kvm_rdx_write(&svm->vcpu, msr_info.data >> 32);
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- return kvm_skip_emulated_instruction(&svm->vcpu);
- }
+ return kvm_emulate_rdmsr(&svm->vcpu);
}
static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
@@ -4293,16 +4312,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
case MSR_IA32_SPEC_CTRL:
if (!msr->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
!guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
!guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
return 1;
- /* The STIBP bit doesn't fault even if it's not advertised */
- if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
+ if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
return 1;
svm->spec_ctrl = data;
-
if (!data)
break;
@@ -4326,13 +4345,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (data & ~PRED_CMD_IBPB)
return 1;
-
+ if (!boot_cpu_has(X86_FEATURE_AMD_IBPB))
+ return 1;
if (!data)
break;
wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
- if (is_guest_mode(vcpu))
- break;
set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
break;
case MSR_AMD64_VIRT_SPEC_CTRL:
@@ -4439,23 +4457,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
static int wrmsr_interception(struct vcpu_svm *svm)
{
- struct msr_data msr;
- u32 ecx = kvm_rcx_read(&svm->vcpu);
- u64 data = kvm_read_edx_eax(&svm->vcpu);
-
- msr.data = data;
- msr.index = ecx;
- msr.host_initiated = false;
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- if (kvm_set_msr(&svm->vcpu, &msr)) {
- trace_kvm_msr_write_ex(ecx, data);
- kvm_inject_gp(&svm->vcpu, 0);
- return 1;
- } else {
- trace_kvm_msr_write(ecx, data);
- return kvm_skip_emulated_instruction(&svm->vcpu);
- }
+ return kvm_emulate_wrmsr(&svm->vcpu);
}
static int msr_interception(struct vcpu_svm *svm)
@@ -4470,6 +4472,14 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
{
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
svm_clear_vintr(svm);
+
+ /*
+ * For AVIC, the only reason to end up here is ExtINTs.
+ * In this case AVIC was temporarily disabled for
+ * requesting the IRQ window and we have to re-enable it.
+ */
+ svm_toggle_avic_for_irq_window(&svm->vcpu, true);
+
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
mark_dirty(svm->vmcb, VMCB_INTR);
++svm->vcpu.stat.irq_window_exits;
@@ -4551,9 +4561,9 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
*/
kvm_for_each_vcpu(i, vcpu, kvm) {
bool m = kvm_apic_match_dest(vcpu, apic,
- icrl & KVM_APIC_SHORT_MASK,
+ icrl & APIC_SHORT_MASK,
GET_APIC_DEST_FIELD(icrh),
- icrl & KVM_APIC_DEST_MASK);
+ icrl & APIC_DEST_MASK);
if (m && !avic_vcpu_is_running(vcpu))
kvm_vcpu_wake_up(vcpu);
@@ -4637,6 +4647,7 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
int ret = 0;
struct vcpu_svm *svm = to_svm(vcpu);
u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
+ u32 id = kvm_xapic_id(vcpu->arch.apic);
if (ldr == svm->ldr_reg)
return 0;
@@ -4644,7 +4655,7 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
avic_invalidate_logical_id_entry(vcpu);
if (ldr)
- ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr);
+ ret = avic_ldr_write(vcpu, id, ldr);
if (!ret)
svm->ldr_reg = ldr;
@@ -4656,8 +4667,7 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
{
u64 *old, *new;
struct vcpu_svm *svm = to_svm(vcpu);
- u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID);
- u32 id = (apic_id_reg >> 24) & 0xff;
+ u32 id = kvm_xapic_id(vcpu->arch.apic);
if (vcpu->vcpu_id == id)
return 0;
@@ -4769,7 +4779,7 @@ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
ret = avic_unaccel_trap_write(svm);
} else {
/* Handling Fault */
- ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE);
+ ret = kvm_emulate_instruction(&svm->vcpu, 0);
}
return ret;
@@ -4836,6 +4846,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_MONITOR] = monitor_interception,
[SVM_EXIT_MWAIT] = mwait_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
+ [SVM_EXIT_RDPRU] = rdpru_interception,
[SVM_EXIT_NPF] = npf_interception,
[SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
@@ -4966,7 +4977,8 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
*info2 = control->exit_info_2;
}
-static int handle_exit(struct kvm_vcpu *vcpu)
+static int handle_exit(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion exit_fastpath)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_run *kvm_run = vcpu->run;
@@ -5024,13 +5036,33 @@ static int handle_exit(struct kvm_vcpu *vcpu)
__func__, svm->vmcb->control.exit_int_info,
exit_code);
- if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
- || !svm_exit_handlers[exit_code]) {
- WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
- kvm_queue_exception(vcpu, UD_VECTOR);
+ if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
+ kvm_skip_emulated_instruction(vcpu);
return 1;
+ } else if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
+ || !svm_exit_handlers[exit_code]) {
+ vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code);
+ dump_vmcb(vcpu);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+ vcpu->run->internal.ndata = 1;
+ vcpu->run->internal.data[0] = exit_code;
+ return 0;
}
+#ifdef CONFIG_RETPOLINE
+ if (exit_code == SVM_EXIT_MSR)
+ return msr_interception(svm);
+ else if (exit_code == SVM_EXIT_VINTR)
+ return interrupt_window_interception(svm);
+ else if (exit_code == SVM_EXIT_INTR)
+ return intr_interception(svm);
+ else if (exit_code == SVM_EXIT_HLT)
+ return halt_interception(svm);
+ else if (exit_code == SVM_EXIT_NPF)
+ return npf_interception(svm);
+#endif
return svm_exit_handlers[exit_code](svm);
}
@@ -5126,8 +5158,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (svm_nested_virtualize_tpr(vcpu) ||
- kvm_vcpu_apicv_active(vcpu))
+ if (svm_nested_virtualize_tpr(vcpu))
return;
clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
@@ -5144,30 +5175,79 @@ static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
return;
}
-static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu)
+static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
- return avic && irqchip_split(vcpu->kvm);
}
-static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
}
-static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+static void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate)
{
+ if (!avic || !lapic_in_kernel(vcpu))
+ return;
+
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ kvm_request_apicv_update(vcpu->kvm, activate,
+ APICV_INHIBIT_REASON_IRQWIN);
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+}
+
+static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm))
+ return 0;
+
+ /*
+ * Here, we go through the per-vcpu ir_list to update all existing
+ * interrupt remapping table entry targeting this vcpu.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ if (list_empty(&svm->ir_list))
+ goto out;
+
+ list_for_each_entry(ir, &svm->ir_list, node) {
+ if (activate)
+ ret = amd_iommu_activate_guest_mode(ir->data);
+ else
+ ret = amd_iommu_deactivate_guest_mode(ir->data);
+ if (ret)
+ break;
+ }
+out:
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ return ret;
}
-/* Note: Currently only used by Hyper-V. */
static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
+ bool activated = kvm_vcpu_apicv_active(vcpu);
- if (kvm_vcpu_apicv_active(vcpu))
+ if (activated) {
+ /**
+ * During AVIC temporary deactivation, guest could update
+ * APIC ID, DFR and LDR registers, which would not be trapped
+ * by avic_unaccelerated_access_interception(). In this case,
+ * we need to check and update the AVIC logical APIC ID table
+ * accordingly before re-activating.
+ */
+ avic_post_state_restore(vcpu);
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
- else
+ } else {
vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+ }
mark_dirty(vmcb, VMCB_AVIC);
+
+ svm_set_pi_irte_mode(vcpu, activated);
}
static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -5275,7 +5355,8 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
kvm_set_msi_irq(kvm, e, &irq);
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+ !kvm_irq_is_postable(&irq)) {
pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
__func__, irq.vector);
return -1;
@@ -5329,6 +5410,7 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
* 1. When cannot target interrupt to a specific vcpu.
* 2. Unsetting posted interrupt.
* 3. APIC virtialization is disabled for the vcpu.
+ * 4. IRQ has incompatible delivery mode (SMI, INIT, etc)
*/
if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
kvm_vcpu_apicv_active(&svm->vcpu)) {
@@ -5452,9 +5534,6 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (kvm_vcpu_apicv_active(vcpu))
- return;
-
/*
* In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
* 1, because that's a separate STGI/VMRUN intercept. The next time we
@@ -5464,6 +5543,13 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
* window under the assumption that the hardware will set the GIF.
*/
if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) {
+ /*
+ * IRQ window is not needed when AVIC is enabled,
+ * unless we have pending ExtINT since it cannot be injected
+ * via AVIC. In such case, we need to temporarily disable AVIC,
+ * and fallback to injecting IRQ via V_IRQ.
+ */
+ svm_toggle_avic_for_irq_window(vcpu, false);
svm_set_vintr(svm);
svm_inject_irq(svm, 0x0);
}
@@ -5666,7 +5752,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
svm->vmcb->save.cr2 = vcpu->arch.cr2;
clgi();
- kvm_load_guest_xcr0(vcpu);
+ kvm_load_guest_xsave_state(vcpu);
if (lapic_in_kernel(vcpu) &&
vcpu->arch.apic->lapic_timer.timer_advance_ns)
@@ -5816,7 +5902,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_interrupt(&svm->vcpu);
- kvm_put_guest_xcr0(vcpu);
+ kvm_load_host_xsave_state(vcpu);
stgi();
/* Any pending NMI will happen here */
@@ -5925,6 +6011,10 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+ boot_cpu_has(X86_FEATURE_XSAVE) &&
+ boot_cpu_has(X86_FEATURE_XSAVES);
+
/* Update nrips enabled cache */
svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
@@ -5932,19 +6022,34 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
return;
guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC);
+
+ /*
+ * Currently, AVIC does not work with nested virtualization.
+ * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
+ */
+ if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ kvm_request_apicv_update(vcpu->kvm, false,
+ APICV_INHIBIT_REASON_NESTED);
}
+#define F feature_bit
+
static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
switch (func) {
case 0x1:
if (avic)
- entry->ecx &= ~bit(X86_FEATURE_X2APIC);
+ entry->ecx &= ~F(X2APIC);
break;
case 0x80000001:
if (nested)
entry->ecx |= (1 << 2); /* Set SVM bit */
break;
+ case 0x80000008:
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
+ boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ entry->ebx |= F(VIRT_SSBD);
+ break;
case 0x8000000A:
entry->eax = 1; /* SVM revision 1 */
entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
@@ -5955,18 +6060,11 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
/* Support next_rip if host supports it */
if (boot_cpu_has(X86_FEATURE_NRIPS))
- entry->edx |= SVM_FEATURE_NRIP;
+ entry->edx |= F(NRIPS);
/* Support NPT for the guest if enabled */
if (npt_enabled)
- entry->edx |= SVM_FEATURE_NPT;
-
- break;
- case 0x8000001F:
- /* Support memory encryption cpuid if host supports it */
- if (boot_cpu_has(X86_FEATURE_SEV))
- cpuid(0x8000001f, &entry->eax, &entry->ebx,
- &entry->ecx, &entry->edx);
+ entry->edx |= F(NPT);
}
}
@@ -5993,7 +6091,7 @@ static bool svm_mpx_supported(void)
static bool svm_xsaves_supported(void)
{
- return false;
+ return boot_cpu_has(X86_FEATURE_XSAVES);
}
static bool svm_umip_emulated(void)
@@ -6011,6 +6109,11 @@ static bool svm_has_wbinvd_exit(void)
return true;
}
+static bool svm_pku_supported(void)
+{
+ return false;
+}
+
#define PRE_EX(exit) { .exit_code = (exit), \
.stage = X86_ICPT_PRE_EXCEPT, }
#define POST_EX(exit) { .exit_code = (exit), \
@@ -6068,6 +6171,7 @@ static const struct __x86_intercept {
[x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
[x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
[x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV),
};
#undef PRE_EX
@@ -6195,9 +6299,12 @@ out:
return ret;
}
-static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion *exit_fastpath)
{
-
+ if (!is_guest_mode(vcpu) &&
+ to_svm(vcpu)->vmcb->control.exit_code == EXIT_REASON_MSR_WRITE)
+ *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
}
static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
@@ -6294,18 +6401,73 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
return 0;
}
+static int sev_flush_asids(void)
+{
+ int ret, error;
+
+ /*
+ * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
+ * so it must be guarded.
+ */
+ down_write(&sev_deactivate_lock);
+
+ wbinvd_on_all_cpus();
+ ret = sev_guest_df_flush(&error);
+
+ up_write(&sev_deactivate_lock);
+
+ if (ret)
+ pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error);
+
+ return ret;
+}
+
+/* Must be called with the sev_bitmap_lock held */
+static bool __sev_recycle_asids(void)
+{
+ int pos;
+
+ /* Check if there are any ASIDs to reclaim before performing a flush */
+ pos = find_next_bit(sev_reclaim_asid_bitmap,
+ max_sev_asid, min_sev_asid - 1);
+ if (pos >= max_sev_asid)
+ return false;
+
+ if (sev_flush_asids())
+ return false;
+
+ bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
+ max_sev_asid);
+ bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid);
+
+ return true;
+}
+
static int sev_asid_new(void)
{
+ bool retry = true;
int pos;
+ mutex_lock(&sev_bitmap_lock);
+
/*
* SEV-enabled guest must use asid from min_sev_asid to max_sev_asid.
*/
+again:
pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1);
- if (pos >= max_sev_asid)
+ if (pos >= max_sev_asid) {
+ if (retry && __sev_recycle_asids()) {
+ retry = false;
+ goto again;
+ }
+ mutex_unlock(&sev_bitmap_lock);
return -EBUSY;
+ }
+
+ __set_bit(pos, sev_asid_bitmap);
+
+ mutex_unlock(&sev_bitmap_lock);
- set_bit(pos, sev_asid_bitmap);
return pos + 1;
}
@@ -6333,7 +6495,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
return 0;
e_free:
- __sev_asid_free(asid);
+ sev_asid_free(asid);
return ret;
}
@@ -6343,12 +6505,6 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
int asid = sev_get_asid(kvm);
int ret;
- wbinvd_on_all_cpus();
-
- ret = sev_guest_df_flush(error);
- if (ret)
- return ret;
-
data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -7129,19 +7285,6 @@ failed:
return ret;
}
-static uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
-{
- /* Not supported */
- return 0;
-}
-
-static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
- uint16_t *vmcs_version)
-{
- /* Intel-only feature */
- return -ENODEV;
-}
-
static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
{
unsigned long cr4 = kvm_read_cr4(vcpu);
@@ -7200,6 +7343,37 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
return false;
}
+static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * TODO: Last condition latch INIT signals on vCPU when
+ * vCPU is in guest-mode and vmcb12 defines intercept on INIT.
+ * To properly emulate the INIT intercept, SVM should implement
+ * kvm_x86_ops->check_nested_events() and call nested_svm_vmexit()
+ * there if an INIT signal is pending.
+ */
+ return !gif_set(svm) ||
+ (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
+}
+
+static bool svm_check_apicv_inhibit_reasons(ulong bit)
+{
+ ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+ BIT(APICV_INHIBIT_REASON_HYPERV) |
+ BIT(APICV_INHIBIT_REASON_NESTED) |
+ BIT(APICV_INHIBIT_REASON_IRQWIN) |
+ BIT(APICV_INHIBIT_REASON_PIT_REINJ);
+
+ return supported & BIT(bit);
+}
+
+static void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate)
+{
+ avic_update_access_page(kvm, activate);
+}
+
static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -7217,7 +7391,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.vm_alloc = svm_vm_alloc,
.vm_free = svm_vm_free,
- .vm_init = avic_vm_init,
+ .vm_init = svm_vm_init,
.vm_destroy = svm_vm_destroy,
.prepare_guest_switch = svm_prepare_guest_switch,
@@ -7236,7 +7410,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
.decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
- .decache_cr3 = svm_decache_cr3,
.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
.set_cr0 = svm_set_cr0,
.set_cr3 = svm_set_cr3,
@@ -7275,8 +7448,9 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_apic_mode = svm_set_virtual_apic_mode,
- .get_enable_apicv = svm_get_enable_apicv,
.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
+ .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
+ .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl,
.load_eoi_exitmap = svm_load_eoi_exitmap,
.hwapic_irr_update = svm_hwapic_irr_update,
.hwapic_isr_update = svm_hwapic_isr_update,
@@ -7300,6 +7474,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.xsaves_supported = svm_xsaves_supported,
.umip_emulated = svm_umip_emulated,
.pt_supported = svm_pt_supported,
+ .pku_supported = svm_pku_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
@@ -7332,10 +7507,12 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.mem_enc_reg_region = svm_register_enc_region,
.mem_enc_unreg_region = svm_unregister_enc_region,
- .nested_enable_evmcs = nested_enable_evmcs,
- .nested_get_evmcs_version = nested_get_evmcs_version,
+ .nested_enable_evmcs = NULL,
+ .nested_get_evmcs_version = NULL,
.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
+
+ .apic_init_signal_blocked = svm_apic_init_signal_blocked,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index b5c831e79094..f194dd058470 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -232,17 +232,20 @@ TRACE_EVENT(kvm_exit,
__field( u32, isa )
__field( u64, info1 )
__field( u64, info2 )
+ __field( unsigned int, vcpu_id )
),
TP_fast_assign(
__entry->exit_reason = exit_reason;
__entry->guest_rip = kvm_rip_read(vcpu);
__entry->isa = isa;
+ __entry->vcpu_id = vcpu->vcpu_id;
kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
&__entry->info2);
),
- TP_printk("reason %s rip 0x%lx info %llx %llx",
+ TP_printk("vcpu %u reason %s rip 0x%lx info %llx %llx",
+ __entry->vcpu_id,
(__entry->isa == KVM_ISA_VMX) ?
__print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) :
__print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS),
@@ -887,36 +890,27 @@ TRACE_EVENT(kvm_pml_full,
TP_printk("vcpu %d: PML full", __entry->vcpu_id)
);
-TRACE_EVENT(kvm_ple_window,
- TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
- TP_ARGS(grow, vcpu_id, new, old),
+TRACE_EVENT(kvm_ple_window_update,
+ TP_PROTO(unsigned int vcpu_id, unsigned int new, unsigned int old),
+ TP_ARGS(vcpu_id, new, old),
TP_STRUCT__entry(
- __field( bool, grow )
__field( unsigned int, vcpu_id )
- __field( int, new )
- __field( int, old )
+ __field( unsigned int, new )
+ __field( unsigned int, old )
),
TP_fast_assign(
- __entry->grow = grow;
__entry->vcpu_id = vcpu_id;
__entry->new = new;
__entry->old = old;
),
- TP_printk("vcpu %u: ple_window %d (%s %d)",
- __entry->vcpu_id,
- __entry->new,
- __entry->grow ? "grow" : "shrink",
- __entry->old)
+ TP_printk("vcpu %u old %u new %u (%s)",
+ __entry->vcpu_id, __entry->old, __entry->new,
+ __entry->old < __entry->new ? "growed" : "shrinked")
);
-#define trace_kvm_ple_window_grow(vcpu_id, new, old) \
- trace_kvm_ple_window(true, vcpu_id, new, old)
-#define trace_kvm_ple_window_shrink(vcpu_id, new, old) \
- trace_kvm_ple_window(false, vcpu_id, new, old)
-
TRACE_EVENT(kvm_pvclock_update,
TP_PROTO(unsigned int vcpu_id, struct pvclock_vcpu_time_info *pvclock),
TP_ARGS(vcpu_id, pvclock),
@@ -1297,6 +1291,25 @@ TRACE_EVENT(kvm_hv_stimer_cleanup,
__entry->vcpu_id, __entry->timer_index)
);
+TRACE_EVENT(kvm_apicv_update_request,
+ TP_PROTO(bool activate, unsigned long bit),
+ TP_ARGS(activate, bit),
+
+ TP_STRUCT__entry(
+ __field(bool, activate)
+ __field(unsigned long, bit)
+ ),
+
+ TP_fast_assign(
+ __entry->activate = activate;
+ __entry->bit = bit;
+ ),
+
+ TP_printk("%s bit=%lu",
+ __entry->activate ? "activate" : "deactivate",
+ __entry->bit)
+);
+
/*
* Tracepoint for AMD AVIC
*/
@@ -1320,7 +1333,7 @@ TRACE_EVENT(kvm_avic_incomplete_ipi,
__entry->index = index;
),
- TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u\n",
+ TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u",
__entry->vcpu, __entry->icrh, __entry->icrl,
__entry->id, __entry->index)
);
@@ -1345,7 +1358,7 @@ TRACE_EVENT(kvm_avic_unaccelerated_access,
__entry->vec = vec;
),
- TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x\n",
+ TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x",
__entry->vcpu,
__entry->offset,
__print_symbolic(__entry->offset, kvm_trace_symbol_apic),
@@ -1462,6 +1475,46 @@ TRACE_EVENT(kvm_hv_send_ipi_ex,
__entry->vector, __entry->format,
__entry->valid_bank_mask)
);
+
+TRACE_EVENT(kvm_pv_tlb_flush,
+ TP_PROTO(unsigned int vcpu_id, bool need_flush_tlb),
+ TP_ARGS(vcpu_id, need_flush_tlb),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( bool, need_flush_tlb )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->need_flush_tlb = need_flush_tlb;
+ ),
+
+ TP_printk("vcpu %u need_flush_tlb %s", __entry->vcpu_id,
+ __entry->need_flush_tlb ? "true" : "false")
+);
+
+/*
+ * Tracepoint for failed nested VMX VM-Enter.
+ */
+TRACE_EVENT(kvm_nested_vmenter_failed,
+ TP_PROTO(const char *msg, u32 err),
+ TP_ARGS(msg, err),
+
+ TP_STRUCT__entry(
+ __field(const char *, msg)
+ __field(u32, err)
+ ),
+
+ TP_fast_assign(
+ __entry->msg = msg;
+ __entry->err = err;
+ ),
+
+ TP_printk("%s%s", __entry->msg, !__entry->err ? "" :
+ __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS))
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index d6664ee3d127..283bdb7071af 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -145,6 +145,11 @@ static inline bool vmx_umip_emulated(void)
SECONDARY_EXEC_DESC;
}
+static inline bool vmx_pku_supported(void)
+{
+ return boot_cpu_has(X86_FEATURE_PKU);
+}
+
static inline bool cpu_has_vmx_rdtscp(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -247,6 +252,12 @@ static inline bool vmx_xsaves_supported(void)
SECONDARY_EXEC_XSAVES;
}
+static inline bool vmx_waitpkg_supported(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+}
+
static inline bool cpu_has_vmx_tsc_scaling(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 72359709cdc1..303813423c3e 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -7,6 +7,7 @@
#include "evmcs.h"
#include "vmcs.h"
#include "vmx.h"
+#include "trace.h"
DEFINE_STATIC_KEY_FALSE(enable_evmcs);
@@ -346,26 +347,93 @@ uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
return 0;
}
+void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata)
+{
+ u32 ctl_low = (u32)*pdata;
+ u32 ctl_high = (u32)(*pdata >> 32);
+
+ /*
+ * Hyper-V 2016 and 2019 try using these features even when eVMCS
+ * is enabled but there are no corresponding fields.
+ */
+ switch (msr_index) {
+ case MSR_IA32_VMX_EXIT_CTLS:
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ ctl_high &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ break;
+ case MSR_IA32_VMX_ENTRY_CTLS:
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ ctl_high &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ ctl_high &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ break;
+ }
+
+ *pdata = ctl_low | ((u64)ctl_high << 32);
+}
+
+int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
+{
+ int ret = 0;
+ u32 unsupp_ctl;
+
+ unsupp_ctl = vmcs12->pin_based_vm_exec_control &
+ EVMCS1_UNSUPPORTED_PINCTRL;
+ if (unsupp_ctl) {
+ trace_kvm_nested_vmenter_failed(
+ "eVMCS: unsupported pin-based VM-execution controls",
+ unsupp_ctl);
+ ret = -EINVAL;
+ }
+
+ unsupp_ctl = vmcs12->secondary_vm_exec_control &
+ EVMCS1_UNSUPPORTED_2NDEXEC;
+ if (unsupp_ctl) {
+ trace_kvm_nested_vmenter_failed(
+ "eVMCS: unsupported secondary VM-execution controls",
+ unsupp_ctl);
+ ret = -EINVAL;
+ }
+
+ unsupp_ctl = vmcs12->vm_exit_controls &
+ EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
+ if (unsupp_ctl) {
+ trace_kvm_nested_vmenter_failed(
+ "eVMCS: unsupported VM-exit controls",
+ unsupp_ctl);
+ ret = -EINVAL;
+ }
+
+ unsupp_ctl = vmcs12->vm_entry_controls &
+ EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
+ if (unsupp_ctl) {
+ trace_kvm_nested_vmenter_failed(
+ "eVMCS: unsupported VM-entry controls",
+ unsupp_ctl);
+ ret = -EINVAL;
+ }
+
+ unsupp_ctl = vmcs12->vm_function_control & EVMCS1_UNSUPPORTED_VMFUNC;
+ if (unsupp_ctl) {
+ trace_kvm_nested_vmenter_failed(
+ "eVMCS: unsupported VM-function controls",
+ unsupp_ctl);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- bool evmcs_already_enabled = vmx->nested.enlightened_vmcs_enabled;
vmx->nested.enlightened_vmcs_enabled = true;
if (vmcs_version)
*vmcs_version = nested_get_evmcs_version(vcpu);
- /* We don't support disabling the feature for simplicity. */
- if (evmcs_already_enabled)
- return 0;
-
- vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
- vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
- vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
- vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
- vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC;
-
return 0;
}
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index 39a24eec8884..6de47f2569c9 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -10,6 +10,7 @@
#include "capabilities.h"
#include "vmcs.h"
+#include "vmcs12.h"
struct vmcs_config;
@@ -178,6 +179,8 @@ static inline void evmcs_load(u64 phys_addr)
struct hv_vp_assist_page *vp_ap =
hv_get_vp_assist_page(smp_processor_id());
+ if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall)
+ vp_ap->nested_control.features.directhypercall = 1;
vp_ap->current_nested_vmcs = phys_addr;
vp_ap->enlighten_vmentry = 1;
}
@@ -199,5 +202,7 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa);
uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version);
+void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata);
+int nested_evmcs_check_controls(struct vmcs12 *vmcs12);
#endif /* __KVM_X86_VMX_EVMCS_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index ced9fba32598..3589cd3c0fcc 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -10,6 +10,7 @@
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
+#include "pmu.h"
#include "trace.h"
#include "x86.h"
@@ -19,6 +20,14 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);
+#define CC(consistency_check) \
+({ \
+ bool failed = (consistency_check); \
+ if (failed) \
+ trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
+ failed; \
+})
+
/*
* Hyper-V requires all of these, so mark them as supported even though
* they are just treated the same as all-context.
@@ -190,6 +199,16 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}
+static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
+{
+ return fixed_bits_valid(control, low, high);
+}
+
+static inline u64 vmx_control_msr(u32 low, u32 high)
+{
+ return low | ((u64)high << 32);
+}
+
static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
@@ -239,7 +258,7 @@ static void free_nested(struct kvm_vcpu *vcpu)
vmx->nested.cached_shadow_vmcs12 = NULL;
/* Unpin physical memory we referred to in the vmcs02 */
if (vmx->nested.apic_access_page) {
- kvm_release_page_dirty(vmx->nested.apic_access_page);
+ kvm_release_page_clean(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
@@ -430,8 +449,8 @@ static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
return 0;
- if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) ||
- !page_address_valid(vcpu, vmcs12->io_bitmap_b))
+ if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
+ CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
return -EINVAL;
return 0;
@@ -443,7 +462,7 @@ static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
return 0;
- if (!page_address_valid(vcpu, vmcs12->msr_bitmap))
+ if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
return -EINVAL;
return 0;
@@ -455,7 +474,7 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
return 0;
- if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
+ if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
return -EINVAL;
return 0;
@@ -525,7 +544,8 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
}
}
-static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
+static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
+{
int msr;
for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
@@ -688,7 +708,7 @@ static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
- !page_address_valid(vcpu, vmcs12->apic_access_addr))
+ CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
return -EINVAL;
else
return 0;
@@ -707,16 +727,15 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
* If virtualize x2apic mode is enabled,
* virtualize apic access must be disabled.
*/
- if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
- nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
return -EINVAL;
/*
* If virtual interrupt delivery is enabled,
* we must exit on external interrupts.
*/
- if (nested_cpu_has_vid(vmcs12) &&
- !nested_exit_on_intr(vcpu))
+ if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
return -EINVAL;
/*
@@ -727,15 +746,15 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
* bits 5:0 of posted_intr_desc_addr should be zero.
*/
if (nested_cpu_has_posted_intr(vmcs12) &&
- (!nested_cpu_has_vid(vmcs12) ||
- !nested_exit_intr_ack_set(vcpu) ||
- (vmcs12->posted_intr_nv & 0xff00) ||
- (vmcs12->posted_intr_desc_addr & 0x3f) ||
- (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))
+ (CC(!nested_cpu_has_vid(vmcs12)) ||
+ CC(!nested_exit_intr_ack_set(vcpu)) ||
+ CC((vmcs12->posted_intr_nv & 0xff00)) ||
+ CC((vmcs12->posted_intr_desc_addr & 0x3f)) ||
+ CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))))
return -EINVAL;
/* tpr shadow is needed by all apicv features. */
- if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
return -EINVAL;
return 0;
@@ -759,10 +778,12 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_load_count,
- vmcs12->vm_exit_msr_load_addr) ||
- nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_store_count,
- vmcs12->vm_exit_msr_store_addr))
+ if (CC(nested_vmx_check_msr_switch(vcpu,
+ vmcs12->vm_exit_msr_load_count,
+ vmcs12->vm_exit_msr_load_addr)) ||
+ CC(nested_vmx_check_msr_switch(vcpu,
+ vmcs12->vm_exit_msr_store_count,
+ vmcs12->vm_exit_msr_store_addr)))
return -EINVAL;
return 0;
@@ -771,8 +792,9 @@ static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count,
- vmcs12->vm_entry_msr_load_addr))
+ if (CC(nested_vmx_check_msr_switch(vcpu,
+ vmcs12->vm_entry_msr_load_count,
+ vmcs12->vm_entry_msr_load_addr)))
return -EINVAL;
return 0;
@@ -784,8 +806,8 @@ static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
if (!nested_cpu_has_pml(vmcs12))
return 0;
- if (!nested_cpu_has_ept(vmcs12) ||
- !page_address_valid(vcpu, vmcs12->pml_address))
+ if (CC(!nested_cpu_has_ept(vmcs12)) ||
+ CC(!page_address_valid(vcpu, vmcs12->pml_address)))
return -EINVAL;
return 0;
@@ -794,8 +816,8 @@ static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
- !nested_cpu_has_ept(vmcs12))
+ if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
+ !nested_cpu_has_ept(vmcs12)))
return -EINVAL;
return 0;
}
@@ -803,8 +825,8 @@ static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
- !nested_cpu_has_ept(vmcs12))
+ if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
+ !nested_cpu_has_ept(vmcs12)))
return -EINVAL;
return 0;
}
@@ -815,8 +837,8 @@ static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
if (!nested_cpu_has_shadow_vmcs(vmcs12))
return 0;
- if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) ||
- !page_address_valid(vcpu, vmcs12->vmwrite_bitmap))
+ if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
+ CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
return -EINVAL;
return 0;
@@ -826,12 +848,12 @@ static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
struct vmx_msr_entry *e)
{
/* x2APIC MSR accesses are not allowed */
- if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
+ if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
return -EINVAL;
- if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
- e->index == MSR_IA32_UCODE_REV)
+ if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
+ CC(e->index == MSR_IA32_UCODE_REV))
return -EINVAL;
- if (e->reserved != 0)
+ if (CC(e->reserved != 0))
return -EINVAL;
return 0;
}
@@ -839,9 +861,9 @@ static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
struct vmx_msr_entry *e)
{
- if (e->index == MSR_FS_BASE ||
- e->index == MSR_GS_BASE ||
- e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
+ if (CC(e->index == MSR_FS_BASE) ||
+ CC(e->index == MSR_GS_BASE) ||
+ CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
nested_vmx_msr_check_common(vcpu, e))
return -EINVAL;
return 0;
@@ -850,24 +872,40 @@ static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
struct vmx_msr_entry *e)
{
- if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
+ if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
nested_vmx_msr_check_common(vcpu, e))
return -EINVAL;
return 0;
}
+static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
+ vmx->nested.msrs.misc_high);
+
+ return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
+}
+
/*
* Load guest's/host's msr at nested entry/exit.
* return 0 for success, entry index for failure.
+ *
+ * One of the failure modes for MSR load/store is when a list exceeds the
+ * virtual hardware's capacity. To maintain compatibility with hardware inasmuch
+ * as possible, process all valid entries before failing rather than precheck
+ * for a capacity violation.
*/
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
u32 i;
struct vmx_msr_entry e;
- struct msr_data msr;
+ u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
- msr.host_initiated = false;
for (i = 0; i < count; i++) {
+ if (unlikely(i >= max_msr_list_size))
+ goto fail;
+
if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
&e, sizeof(e))) {
pr_debug_ratelimited(
@@ -881,9 +919,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
__func__, i, e.index, e.reserved);
goto fail;
}
- msr.index = e.index;
- msr.data = e.value;
- if (kvm_set_msr(vcpu, &msr)) {
+ if (kvm_set_msr(vcpu, e.index, e.value)) {
pr_debug_ratelimited(
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
__func__, i, e.index, e.value);
@@ -895,48 +931,141 @@ fail:
return i + 1;
}
+static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
+ u32 msr_index,
+ u64 *data)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * If the L0 hypervisor stored a more accurate value for the TSC that
+ * does not include the time taken for emulation of the L2->L1
+ * VM-exit in L0, use the more accurate value.
+ */
+ if (msr_index == MSR_IA32_TSC) {
+ int index = vmx_find_msr_index(&vmx->msr_autostore.guest,
+ MSR_IA32_TSC);
+
+ if (index >= 0) {
+ u64 val = vmx->msr_autostore.guest.val[index].value;
+
+ *data = kvm_read_l1_tsc(vcpu, val);
+ return true;
+ }
+ }
+
+ if (kvm_get_msr(vcpu, msr_index, data)) {
+ pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
+ msr_index);
+ return false;
+ }
+ return true;
+}
+
+static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
+ struct vmx_msr_entry *e)
+{
+ if (kvm_vcpu_read_guest(vcpu,
+ gpa + i * sizeof(*e),
+ e, 2 * sizeof(u32))) {
+ pr_debug_ratelimited(
+ "%s cannot read MSR entry (%u, 0x%08llx)\n",
+ __func__, i, gpa + i * sizeof(*e));
+ return false;
+ }
+ if (nested_vmx_store_msr_check(vcpu, e)) {
+ pr_debug_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, i, e->index, e->reserved);
+ return false;
+ }
+ return true;
+}
+
static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
+ u64 data;
u32 i;
struct vmx_msr_entry e;
+ u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
for (i = 0; i < count; i++) {
- struct msr_data msr_info;
- if (kvm_vcpu_read_guest(vcpu,
- gpa + i * sizeof(e),
- &e, 2 * sizeof(u32))) {
- pr_debug_ratelimited(
- "%s cannot read MSR entry (%u, 0x%08llx)\n",
- __func__, i, gpa + i * sizeof(e));
+ if (unlikely(i >= max_msr_list_size))
return -EINVAL;
- }
- if (nested_vmx_store_msr_check(vcpu, &e)) {
- pr_debug_ratelimited(
- "%s check failed (%u, 0x%x, 0x%x)\n",
- __func__, i, e.index, e.reserved);
+
+ if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
return -EINVAL;
- }
- msr_info.host_initiated = false;
- msr_info.index = e.index;
- if (kvm_get_msr(vcpu, &msr_info)) {
- pr_debug_ratelimited(
- "%s cannot read MSR (%u, 0x%x)\n",
- __func__, i, e.index);
+
+ if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
return -EINVAL;
- }
+
if (kvm_vcpu_write_guest(vcpu,
gpa + i * sizeof(e) +
offsetof(struct vmx_msr_entry, value),
- &msr_info.data, sizeof(msr_info.data))) {
+ &data, sizeof(data))) {
pr_debug_ratelimited(
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
- __func__, i, e.index, msr_info.data);
+ __func__, i, e.index, data);
return -EINVAL;
}
}
return 0;
}
+static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ u32 count = vmcs12->vm_exit_msr_store_count;
+ u64 gpa = vmcs12->vm_exit_msr_store_addr;
+ struct vmx_msr_entry e;
+ u32 i;
+
+ for (i = 0; i < count; i++) {
+ if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
+ return false;
+
+ if (e.index == msr_index)
+ return true;
+ }
+ return false;
+}
+
+static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
+ u32 msr_index)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
+ bool in_vmcs12_store_list;
+ int msr_autostore_index;
+ bool in_autostore_list;
+ int last;
+
+ msr_autostore_index = vmx_find_msr_index(autostore, msr_index);
+ in_autostore_list = msr_autostore_index >= 0;
+ in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
+
+ if (in_vmcs12_store_list && !in_autostore_list) {
+ if (autostore->nr == NR_LOADSTORE_MSRS) {
+ /*
+ * Emulated VMEntry does not fail here. Instead a less
+ * accurate value will be returned by
+ * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
+ * instead of reading the value from the vmcs02 VMExit
+ * MSR-store area.
+ */
+ pr_warn_ratelimited(
+ "Not enough msr entries in msr_autostore. Can't add msr %x\n",
+ msr_index);
+ return;
+ }
+ last = autostore->nr++;
+ autostore->val[last].index = msr_index;
+ } else if (!in_vmcs12_store_list && in_autostore_list) {
+ last = --autostore->nr;
+ autostore->val[msr_autostore_index] = autostore->val[last];
+ }
+}
+
static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
unsigned long invalid_mask;
@@ -946,16 +1075,16 @@ static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
}
/*
- * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
- * emulating VM entry into a guest with EPT enabled.
- * Returns 0 on success, 1 on failure. Invalid state exit qualification code
- * is assigned to entry_failure_code on failure.
+ * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
+ * emulating VM-Entry into a guest with EPT enabled. On failure, the expected
+ * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
+ * @entry_failure_code.
*/
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
u32 *entry_failure_code)
{
if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
- if (!nested_cr3_valid(vcpu, cr3)) {
+ if (CC(!nested_cr3_valid(vcpu, cr3))) {
*entry_failure_code = ENTRY_FAIL_DEFAULT;
return -EINVAL;
}
@@ -965,7 +1094,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
* must not be dereferenced.
*/
if (is_pae_paging(vcpu) && !nested_ept) {
- if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
*entry_failure_code = ENTRY_FAIL_PDPTE;
return -EINVAL;
}
@@ -976,7 +1105,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
kvm_mmu_new_cr3(vcpu, cr3, false);
vcpu->arch.cr3 = cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
kvm_init_mmu(vcpu, false);
@@ -988,7 +1117,9 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
* populated by L2 differently than TLB entries populated
* by L1.
*
- * If L1 uses EPT, then TLB entries are tagged with different EPTP.
+ * If L0 uses EPT, L1 and L2 run with different EPTP because
+ * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
+ * are tagged with different EPTP.
*
* If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
* with different VPID (L1 entries are tagged with vmx->vpid
@@ -998,7 +1129,7 @@ static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- return nested_cpu_has_ept(vmcs12) ||
+ return enable_ept ||
(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}
@@ -1009,17 +1140,6 @@ static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
}
-
-static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
-{
- return fixed_bits_valid(control, low, high);
-}
-
-static inline u64 vmx_control_msr(u32 low, u32 high)
-{
- return low | ((u64)high << 32);
-}
-
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
superset &= mask;
@@ -1862,7 +1982,7 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
}
/*
- * Clean fields data can't de used on VMLAUNCH and when we switch
+ * Clean fields data can't be used on VMLAUNCH and when we switch
* between different L2 guests as KVM keeps a single VMCS12 per L1.
*/
if (from_launch || evmcs_gpa_changed)
@@ -1993,7 +2113,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
* addresses are constant (for vmcs02), the counts can change based
* on L2's behavior, e.g. switching to/from long mode.
*/
- vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
@@ -2043,11 +2163,12 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
* EXEC CONTROLS
*/
exec_control = vmx_exec_control(vmx); /* L0's desires */
- exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
- exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+ exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
+ exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
exec_control &= ~CPU_BASED_TPR_SHADOW;
exec_control |= vmcs12->cpu_based_vm_exec_control;
+ vmx->nested.l1_tpr_threshold = -1;
if (exec_control & CPU_BASED_TPR_SHADOW)
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
#ifdef CONFIG_X86_64
@@ -2085,6 +2206,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_ENABLE_VMFUNC);
@@ -2259,6 +2381,13 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
}
+ /*
+ * Make sure the msr_autostore list is up to date before we set the
+ * count in the vmcs02.
+ */
+ prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
+
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
@@ -2355,9 +2484,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (nested_cpu_has_ept(vmcs12))
nested_ept_init_mmu_context(vcpu);
- else if (nested_cpu_has2(vmcs12,
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
- vmx_flush_tlb(vcpu, true);
/*
* This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
@@ -2392,6 +2518,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
entry_failure_code))
return -EINVAL;
+ /*
+ * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
+ * on nested VM-Exit, which can occur without actually running L2 and
+ * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with
+ * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
+ * transition to HLT instead of running L2.
+ */
+ if (enable_ept)
+ vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
+
/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
is_pae_paging(vcpu)) {
@@ -2404,6 +2540,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (!enable_ept)
vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
+ vmcs12->guest_ia32_perf_global_ctrl)))
+ return -EINVAL;
+
kvm_rsp_write(vcpu, vmcs12->guest_rsp);
kvm_rip_write(vcpu, vmcs12->guest_rip);
return 0;
@@ -2411,12 +2552,12 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
{
- if (!nested_cpu_has_nmi_exiting(vmcs12) &&
- nested_cpu_has_virtual_nmis(vmcs12))
+ if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
+ nested_cpu_has_virtual_nmis(vmcs12)))
return -EINVAL;
- if (!nested_cpu_has_virtual_nmis(vmcs12) &&
- nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))
+ if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
+ nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
return -EINVAL;
return 0;
@@ -2430,11 +2571,11 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
/* Check for memory type validity */
switch (address & VMX_EPTP_MT_MASK) {
case VMX_EPTP_MT_UC:
- if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))
+ if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
return false;
break;
case VMX_EPTP_MT_WB:
- if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))
+ if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
return false;
break;
default:
@@ -2442,16 +2583,16 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
}
/* only 4 levels page-walk length are valid */
- if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)
+ if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4))
return false;
/* Reserved bits should not be set */
- if (address >> maxphyaddr || ((address >> 7) & 0x1f))
+ if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f)))
return false;
/* AD, if set, should be supported */
if (address & VMX_EPTP_AD_ENABLE_BIT) {
- if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))
+ if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
return false;
}
@@ -2466,21 +2607,21 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
- vmx->nested.msrs.pinbased_ctls_low,
- vmx->nested.msrs.pinbased_ctls_high) ||
- !vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
- vmx->nested.msrs.procbased_ctls_low,
- vmx->nested.msrs.procbased_ctls_high))
+ if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
+ vmx->nested.msrs.pinbased_ctls_low,
+ vmx->nested.msrs.pinbased_ctls_high)) ||
+ CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
+ vmx->nested.msrs.procbased_ctls_low,
+ vmx->nested.msrs.procbased_ctls_high)))
return -EINVAL;
if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
- !vmx_control_verify(vmcs12->secondary_vm_exec_control,
- vmx->nested.msrs.secondary_ctls_low,
- vmx->nested.msrs.secondary_ctls_high))
+ CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
+ vmx->nested.msrs.secondary_ctls_low,
+ vmx->nested.msrs.secondary_ctls_high)))
return -EINVAL;
- if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu) ||
+ if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
@@ -2491,7 +2632,7 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
- (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
+ CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
return -EINVAL;
if (!nested_cpu_has_preemption_timer(vmcs12) &&
@@ -2499,17 +2640,17 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
return -EINVAL;
if (nested_cpu_has_ept(vmcs12) &&
- !valid_ept_address(vcpu, vmcs12->ept_pointer))
+ CC(!valid_ept_address(vcpu, vmcs12->ept_pointer)))
return -EINVAL;
if (nested_cpu_has_vmfunc(vmcs12)) {
- if (vmcs12->vm_function_control &
- ~vmx->nested.msrs.vmfunc_controls)
+ if (CC(vmcs12->vm_function_control &
+ ~vmx->nested.msrs.vmfunc_controls))
return -EINVAL;
if (nested_cpu_has_eptp_switching(vmcs12)) {
- if (!nested_cpu_has_ept(vmcs12) ||
- !page_address_valid(vcpu, vmcs12->eptp_list_address))
+ if (CC(!nested_cpu_has_ept(vmcs12)) ||
+ CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
return -EINVAL;
}
}
@@ -2525,10 +2666,10 @@ static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!vmx_control_verify(vmcs12->vm_exit_controls,
- vmx->nested.msrs.exit_ctls_low,
- vmx->nested.msrs.exit_ctls_high) ||
- nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))
+ if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
+ vmx->nested.msrs.exit_ctls_low,
+ vmx->nested.msrs.exit_ctls_high)) ||
+ CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
return -EINVAL;
return 0;
@@ -2542,9 +2683,9 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!vmx_control_verify(vmcs12->vm_entry_controls,
- vmx->nested.msrs.entry_ctls_low,
- vmx->nested.msrs.entry_ctls_high))
+ if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
+ vmx->nested.msrs.entry_ctls_low,
+ vmx->nested.msrs.entry_ctls_high)))
return -EINVAL;
/*
@@ -2564,31 +2705,31 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
/* VM-entry interruption-info field: interruption type */
- if (intr_type == INTR_TYPE_RESERVED ||
- (intr_type == INTR_TYPE_OTHER_EVENT &&
- !nested_cpu_supports_monitor_trap_flag(vcpu)))
+ if (CC(intr_type == INTR_TYPE_RESERVED) ||
+ CC(intr_type == INTR_TYPE_OTHER_EVENT &&
+ !nested_cpu_supports_monitor_trap_flag(vcpu)))
return -EINVAL;
/* VM-entry interruption-info field: vector */
- if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
- (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
- (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
+ if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
+ CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
+ CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
return -EINVAL;
/* VM-entry interruption-info field: deliver error code */
should_have_error_code =
intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
x86_exception_has_error_code(vector);
- if (has_error_code != should_have_error_code)
+ if (CC(has_error_code != should_have_error_code))
return -EINVAL;
/* VM-entry exception error code */
- if (has_error_code &&
- vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))
+ if (CC(has_error_code &&
+ vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
return -EINVAL;
/* VM-entry interruption-info field: reserved bits */
- if (intr_info & INTR_INFO_RESVD_BITS_MASK)
+ if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
return -EINVAL;
/* VM-entry instruction length */
@@ -2596,9 +2737,9 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
case INTR_TYPE_SOFT_EXCEPTION:
case INTR_TYPE_SOFT_INTR:
case INTR_TYPE_PRIV_SW_EXCEPTION:
- if ((vmcs12->vm_entry_instruction_len > 15) ||
- (vmcs12->vm_entry_instruction_len == 0 &&
- !nested_cpu_has_zero_length_injection(vcpu)))
+ if (CC(vmcs12->vm_entry_instruction_len > 15) ||
+ CC(vmcs12->vm_entry_instruction_len == 0 &&
+ CC(!nested_cpu_has_zero_length_injection(vcpu))))
return -EINVAL;
}
}
@@ -2617,6 +2758,9 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
nested_check_vm_entry_controls(vcpu, vmcs12))
return -EINVAL;
+ if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled)
+ return nested_evmcs_check_controls(vmcs12);
+
return 0;
}
@@ -2625,43 +2769,62 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
{
bool ia32e;
- if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
- !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
- !nested_cr3_valid(vcpu, vmcs12->host_cr3))
+ if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
+ CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
+ CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3)))
return -EINVAL;
- if (is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu) ||
- is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))
+ if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
+ CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
return -EINVAL;
if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
- !kvm_pat_valid(vmcs12->host_ia32_pat))
+ CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
return -EINVAL;
- ia32e = (vmcs12->vm_exit_controls &
- VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
-
- if (vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
- vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
- vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
- vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
- vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
- vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
- vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
- vmcs12->host_cs_selector == 0 ||
- vmcs12->host_tr_selector == 0 ||
- (vmcs12->host_ss_selector == 0 && !ia32e))
+ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
+ vmcs12->host_ia32_perf_global_ctrl)))
return -EINVAL;
#ifdef CONFIG_X86_64
- if (is_noncanonical_address(vmcs12->host_fs_base, vcpu) ||
- is_noncanonical_address(vmcs12->host_gs_base, vcpu) ||
- is_noncanonical_address(vmcs12->host_gdtr_base, vcpu) ||
- is_noncanonical_address(vmcs12->host_idtr_base, vcpu) ||
- is_noncanonical_address(vmcs12->host_tr_base, vcpu))
- return -EINVAL;
+ ia32e = !!(vcpu->arch.efer & EFER_LMA);
+#else
+ ia32e = false;
#endif
+ if (ia32e) {
+ if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) ||
+ CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
+ return -EINVAL;
+ } else {
+ if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) ||
+ CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
+ CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
+ CC((vmcs12->host_rip) >> 32))
+ return -EINVAL;
+ }
+
+ if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_cs_selector == 0) ||
+ CC(vmcs12->host_tr_selector == 0) ||
+ CC(vmcs12->host_ss_selector == 0 && !ia32e))
+ return -EINVAL;
+
+ if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
+ CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
+ CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
+ CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
+ CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
+ CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
+ return -EINVAL;
+
/*
* If the load IA32_EFER VM-exit control is 1, bits reserved in the
* IA32_EFER MSR must be 0 in the field for that register. In addition,
@@ -2669,9 +2832,9 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
* the host address-space size VM-exit control.
*/
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
- if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
- ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
- ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
+ if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
+ CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
+ CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
return -EINVAL;
}
@@ -2688,16 +2851,16 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
if (vmcs12->vmcs_link_pointer == -1ull)
return 0;
- if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
+ if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
return -EINVAL;
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
+ if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)))
return -EINVAL;
shadow = map.hva;
- if (shadow->hdr.revision_id != VMCS12_REVISION ||
- shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
+ if (CC(shadow->hdr.revision_id != VMCS12_REVISION) ||
+ CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
r = -EINVAL;
kvm_vcpu_unmap(vcpu, &map, false);
@@ -2709,8 +2872,8 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
*/
static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
{
- if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
- vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
+ if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
+ vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT))
return -EINVAL;
return 0;
@@ -2724,12 +2887,16 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
*exit_qual = ENTRY_FAIL_DEFAULT;
- if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
- !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
+ if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
+ CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
+ return -EINVAL;
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
+ CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
return -EINVAL;
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
- !kvm_pat_valid(vmcs12->guest_ia32_pat))
+ CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
return -EINVAL;
if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
@@ -2737,6 +2904,11 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
return -EINVAL;
}
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
+ vmcs12->guest_ia32_perf_global_ctrl)))
+ return -EINVAL;
+
/*
* If the load IA32_EFER VM-entry control is 1, the following checks
* are performed on the field for the IA32_EFER MSR:
@@ -2749,16 +2921,16 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
if (to_vmx(vcpu)->nested.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
- if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
- ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
- ((vmcs12->guest_cr0 & X86_CR0_PG) &&
- ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
+ if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
+ CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
+ CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
+ ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
return -EINVAL;
}
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
- (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
- (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
+ (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
+ CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
return -EINVAL;
if (nested_check_guest_non_reg_state(vmcs12))
@@ -2841,9 +3013,13 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
if (vm_fail) {
+ u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
+
preempt_enable();
- WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
- VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+
+ trace_kvm_nested_vmenter_failed(
+ "early hardware check VM-instruction error: ", error);
+ WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
}
@@ -2868,10 +3044,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
return 0;
}
-static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12);
-
-static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
+static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2887,23 +3060,22 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
* to it so we can release it later.
*/
if (vmx->nested.apic_access_page) { /* shouldn't happen */
- kvm_release_page_dirty(vmx->nested.apic_access_page);
+ kvm_release_page_clean(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
- /*
- * If translation failed, no matter: This feature asks
- * to exit when accessing the given address, and if it
- * can never be accessed, this feature won't do
- * anything anyway.
- */
if (!is_error_page(page)) {
vmx->nested.apic_access_page = page;
hpa = page_to_phys(vmx->nested.apic_access_page);
vmcs_write64(APIC_ACCESS_ADDR, hpa);
} else {
- secondary_exec_controls_clearbit(vmx,
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n",
+ __func__);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return false;
}
}
@@ -2948,6 +3120,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
else
exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
+ return true;
}
/*
@@ -2986,13 +3159,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
/*
* If from_vmentry is false, this is being called from state restore (either RSM
* or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
-+ *
-+ * Returns:
-+ * 0 - success, i.e. proceed with actual VMEnter
-+ * 1 - consistency check VMExit
-+ * -1 - consistency check VMFail
+ *
+ * Returns:
+ * NVMX_ENTRY_SUCCESS: Entered VMX non-root mode
+ * NVMX_ENTRY_VMFAIL: Consistency check VMFail
+ * NVMX_ENTRY_VMEXIT: Consistency check VMExit
+ * NVMX_ENTRY_KVM_INTERNAL_ERROR: KVM internal error
*/
-int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
+enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+ bool from_vmentry)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -3001,7 +3176,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
u32 exit_qual;
evaluate_pending_interrupts = exec_controls_get(vmx) &
- (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
+ (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
@@ -3035,11 +3210,12 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
prepare_vmcs02_early(vmx, vmcs12);
if (from_vmentry) {
- nested_get_vmcs12_pages(vcpu);
+ if (unlikely(!nested_get_vmcs12_pages(vcpu)))
+ return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
if (nested_vmx_check_vmentry_hw(vcpu)) {
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- return -1;
+ return NVMX_VMENTRY_VMFAIL;
}
if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
@@ -3047,7 +3223,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
}
enter_guest_mode(vcpu);
- if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
vcpu->arch.tsc_offset += vmcs12->tsc_offset;
if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
@@ -3103,7 +3279,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
* returned as far as L1 is concerned. It will only return (and set
* the success flag) when L2 exits (see nested_vmx_vmexit()).
*/
- return 0;
+ return NVMX_VMENTRY_SUCCESS;
/*
* A failed consistency check that leads to a VMExit during L1's
@@ -3111,7 +3287,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
* 26.7 "VM-entry failures during or after loading guest state".
*/
vmentry_fail_vmexit_guest_mode:
- if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
leave_guest_mode(vcpu);
@@ -3119,14 +3295,14 @@ vmentry_fail_vmexit:
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
if (!from_vmentry)
- return 1;
+ return NVMX_VMENTRY_VMEXIT;
load_vmcs12_host_state(vcpu, vmcs12);
vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
vmcs12->exit_qualification = exit_qual;
if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
vmx->nested.need_vmcs12_to_shadow_sync = true;
- return 1;
+ return NVMX_VMENTRY_VMEXIT;
}
/*
@@ -3136,9 +3312,9 @@ vmentry_fail_vmexit:
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
{
struct vmcs12 *vmcs12;
+ enum nvmx_vmentry_status status;
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
- int ret;
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -3198,13 +3374,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
* the nested entry.
*/
vmx->nested.nested_run_pending = 1;
- ret = nested_vmx_enter_non_root_mode(vcpu, true);
- vmx->nested.nested_run_pending = !ret;
- if (ret > 0)
- return 1;
- else if (ret)
- return nested_vmx_failValid(vcpu,
- VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ status = nested_vmx_enter_non_root_mode(vcpu, true);
+ if (unlikely(status != NVMX_VMENTRY_SUCCESS))
+ goto vmentry_failed;
/* Hide L1D cache contents from the nested guest. */
vmx->vcpu.arch.l1tf_flush_l1d = true;
@@ -3228,18 +3400,27 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
*/
if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
- !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
- !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
+ !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_NMI_WINDOW_EXITING) &&
+ !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) &&
(vmcs12->guest_rflags & X86_EFLAGS_IF))) {
vmx->nested.nested_run_pending = 0;
return kvm_vcpu_halt(vcpu);
}
return 1;
+
+vmentry_failed:
+ vmx->nested.nested_run_pending = 0;
+ if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
+ return 0;
+ if (status == NVMX_VMENTRY_VMEXIT)
+ return 1;
+ WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
+ return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
}
/*
* On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
- * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK).
+ * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
* This function returns the new value we should put in vmcs12.guest_cr0.
* It's not enough to just return the vmcs02 GUEST_CR0. Rather,
* 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
@@ -3395,12 +3576,50 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
}
+/*
+ * Returns true if a debug trap is pending delivery.
+ *
+ * In KVM, debug traps bear an exception payload. As such, the class of a #DB
+ * exception may be inferred from the presence of an exception payload.
+ */
+static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.exception.pending &&
+ vcpu->arch.exception.nr == DB_VECTOR &&
+ vcpu->arch.exception.payload;
+}
+
+/*
+ * Certain VM-exits set the 'pending debug exceptions' field to indicate a
+ * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
+ * represents these debug traps with a payload that is said to be compatible
+ * with the 'pending debug exceptions' field, write the payload to the VMCS
+ * field if a VM-exit is delivered before the debug trap.
+ */
+static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
+{
+ if (vmx_pending_dbg_trap(vcpu))
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vcpu->arch.exception.payload);
+}
+
static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qual;
bool block_nested_events =
vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (lapic_in_kernel(vcpu) &&
+ test_bit(KVM_APIC_INIT, &apic->pending_events)) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_update_pending_dbg(vcpu);
+ clear_bit(KVM_APIC_INIT, &apic->pending_events);
+ nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
+ return 0;
+ }
if (vcpu->arch.exception.pending &&
nested_vmx_check_exception(vcpu, &exit_qual)) {
@@ -3801,8 +4020,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vcpu->arch.pat = vmcs12->host_ia32_pat;
}
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
- vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
- vmcs12->host_ia32_perf_global_ctrl);
+ WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
+ vmcs12->host_ia32_perf_global_ctrl));
/* Set L1 segment info according to Intel SDM
27.5.2 Loading Host Segment and Descriptor-Table Registers */
@@ -3889,7 +4108,6 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_msr_entry g, h;
- struct msr_data msr;
gpa_t gpa;
u32 i, j;
@@ -3922,7 +4140,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
nested_ept_uninit_mmu_context(vcpu);
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
/*
* Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
@@ -3949,7 +4167,6 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
* from the guest value. The intent is to stuff host state as
* silently as possible, not to fully process the exit load list.
*/
- msr.host_initiated = false;
for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
@@ -3979,9 +4196,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
goto vmabort;
}
- msr.index = h.index;
- msr.data = h.value;
- if (kvm_set_msr(vcpu, &msr)) {
+ if (kvm_set_msr(vcpu, h.index, h.value)) {
pr_debug_ratelimited(
"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
__func__, j, h.index, h.value);
@@ -4015,7 +4230,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
if (nested_cpu_has_preemption_timer(vmcs12))
hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
- if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
if (likely(!vmx->fail)) {
@@ -4053,6 +4268,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+ if (vmx->nested.l1_tpr_threshold != -1)
+ vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
if (kvm_has_tsc_control)
decache_tsc_multiplier(vmx);
@@ -4060,15 +4277,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
if (vmx->nested.change_vmcs01_virtual_apic_mode) {
vmx->nested.change_vmcs01_virtual_apic_mode = false;
vmx_set_virtual_apic_mode(vcpu);
- } else if (!nested_cpu_has_ept(vmcs12) &&
- nested_cpu_has2(vmcs12,
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
- vmx_flush_tlb(vcpu, true);
}
/* Unpin physical memory we referred to in vmcs02 */
if (vmx->nested.apic_access_page) {
- kvm_release_page_dirty(vmx->nested.apic_access_page);
+ kvm_release_page_clean(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
@@ -4268,6 +4481,27 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
return 0;
}
+void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx;
+
+ if (!nested_vmx_allowed(vcpu))
+ return;
+
+ vmx = to_vmx(vcpu);
+ if (kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
+ vmx->nested.msrs.entry_ctls_high |=
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ vmx->nested.msrs.exit_ctls_high |=
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ } else {
+ vmx->nested.msrs.entry_ctls_high &=
+ ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ vmx->nested.msrs.exit_ctls_high &=
+ ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ }
+}
+
static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
{
gva_t gva;
@@ -4375,8 +4609,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
gpa_t vmptr;
uint32_t revision;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
- | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
+ | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
/*
* The Intel VMX Instruction Reference lists a bunch of bits that are
@@ -4466,7 +4700,12 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
{
if (!nested_vmx_check_permission(vcpu))
return 1;
+
free_nested(vcpu);
+
+ /* Process a latched INIT during time CPU was in VMX operation */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
return nested_vmx_succeed(vcpu);
}
@@ -4516,8 +4755,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
return nested_vmx_succeed(vcpu);
}
-static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
-
/* Emulate the VMLAUNCH instruction */
static int handle_vmlaunch(struct kvm_vcpu *vcpu)
{
@@ -4533,35 +4770,32 @@ static int handle_vmresume(struct kvm_vcpu *vcpu)
static int handle_vmread(struct kvm_vcpu *vcpu)
{
- unsigned long field;
- u64 field_value;
+ struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
+ : get_vmcs12(vcpu);
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- int len;
+ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct x86_exception e;
+ unsigned long field;
+ u64 value;
gva_t gva = 0;
- struct vmcs12 *vmcs12;
short offset;
+ int len;
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
+ /*
+ * In VMX non-root operation, when the VMCS-link pointer is -1ull,
+ * any VMREAD sets the ALU flags for VMfailInvalid.
+ */
+ if (vmx->nested.current_vmptr == -1ull ||
+ (is_guest_mode(vcpu) &&
+ get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
return nested_vmx_failInvalid(vcpu);
- if (!is_guest_mode(vcpu))
- vmcs12 = get_vmcs12(vcpu);
- else {
- /*
- * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
- * to shadowed-field sets the ALU flags for VMfailInvalid.
- */
- if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
- return nested_vmx_failInvalid(vcpu);
- vmcs12 = get_shadow_vmcs12(vcpu);
- }
-
/* Decode instruction info and find the field to read */
- field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+ field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
offset = vmcs_field_to_offset(field);
if (offset < 0)
@@ -4571,24 +4805,26 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
- /* Read the field, zero-extended to a u64 field_value */
- field_value = vmcs12_read_any(vmcs12, field, offset);
+ /* Read the field, zero-extended to a u64 value */
+ value = vmcs12_read_any(vmcs12, field, offset);
/*
* Now copy part of this value to register or memory, as requested.
* Note that the number of bits actually copied is 32 or 64 depending
* on the guest's mode (32 or 64 bit), not on the given field's length.
*/
- if (vmx_instruction_info & (1u << 10)) {
- kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
- field_value);
+ if (instr_info & BIT(10)) {
+ kvm_register_writel(vcpu, (((instr_info) >> 3) & 0xf), value);
} else {
len = is_64_bit_mode(vcpu) ? 8 : 4;
if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, true, len, &gva))
+ instr_info, true, len, &gva))
return 1;
/* _system ok, nested_vmx_check_permission has verified cpl=0 */
- kvm_write_guest_virt_system(vcpu, gva, &field_value, len, NULL);
+ if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
}
return nested_vmx_succeed(vcpu);
@@ -4620,46 +4856,58 @@ static bool is_shadow_field_ro(unsigned long field)
static int handle_vmwrite(struct kvm_vcpu *vcpu)
{
+ struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
+ : get_vmcs12(vcpu);
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct x86_exception e;
unsigned long field;
- int len;
+ short offset;
gva_t gva;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ int len;
- /* The value to write might be 32 or 64 bits, depending on L1's long
+ /*
+ * The value to write might be 32 or 64 bits, depending on L1's long
* mode, and eventually we need to write that into a field of several
* possible lengths. The code below first zero-extends the value to 64
- * bit (field_value), and then copies only the appropriate number of
+ * bit (value), and then copies only the appropriate number of
* bits into the vmcs12 field.
*/
- u64 field_value = 0;
- struct x86_exception e;
- struct vmcs12 *vmcs12;
- short offset;
+ u64 value = 0;
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (vmx->nested.current_vmptr == -1ull)
+ /*
+ * In VMX non-root operation, when the VMCS-link pointer is -1ull,
+ * any VMWRITE sets the ALU flags for VMfailInvalid.
+ */
+ if (vmx->nested.current_vmptr == -1ull ||
+ (is_guest_mode(vcpu) &&
+ get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
return nested_vmx_failInvalid(vcpu);
- if (vmx_instruction_info & (1u << 10))
- field_value = kvm_register_readl(vcpu,
- (((vmx_instruction_info) >> 3) & 0xf));
+ if (instr_info & BIT(10))
+ value = kvm_register_readl(vcpu, (((instr_info) >> 3) & 0xf));
else {
len = is_64_bit_mode(vcpu) ? 8 : 4;
if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, false, len, &gva))
+ instr_info, false, len, &gva))
return 1;
- if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) {
+ if (kvm_read_guest_virt(vcpu, gva, &value, len, &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
}
+ field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
+
+ offset = vmcs_field_to_offset(field);
+ if (offset < 0)
+ return nested_vmx_failValid(vcpu,
+ VMXERR_UNSUPPORTED_VMCS_COMPONENT);
- field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
/*
* If the vCPU supports "VMWRITE to any supported field in the
* VMCS," then the "read-only" fields are actually read/write.
@@ -4669,29 +4917,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
return nested_vmx_failValid(vcpu,
VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
- if (!is_guest_mode(vcpu)) {
- vmcs12 = get_vmcs12(vcpu);
-
- /*
- * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
- * vmcs12, else we may crush a field or consume a stale value.
- */
- if (!is_shadow_field_rw(field))
- copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
- } else {
- /*
- * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
- * to shadowed-field sets the ALU flags for VMfailInvalid.
- */
- if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
- return nested_vmx_failInvalid(vcpu);
- vmcs12 = get_shadow_vmcs12(vcpu);
- }
-
- offset = vmcs_field_to_offset(field);
- if (offset < 0)
- return nested_vmx_failValid(vcpu,
- VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ /*
+ * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
+ * vmcs12, else we may crush a field or consume a stale value.
+ */
+ if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
+ copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
/*
* Some Intel CPUs intentionally drop the reserved bits of the AR byte
@@ -4702,9 +4933,9 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
* the stripped down value, L2 sees the full value as stored by KVM).
*/
if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
- field_value &= 0x1f0ff;
+ value &= 0x1f0ff;
- vmcs12_write_any(vmcs12, field, offset, field_value);
+ vmcs12_write_any(vmcs12, field, offset, value);
/*
* Do not track vmcs12 dirty-state if in guest-mode as we actually
@@ -4721,7 +4952,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
preempt_disable();
vmcs_load(vmx->vmcs01.shadow_vmcs);
- __vmcs_writel(field, field_value);
+ __vmcs_writel(field, value);
vmcs_clear(vmx->vmcs01.shadow_vmcs);
vmcs_load(vmx->loaded_vmcs->vmcs);
@@ -5259,8 +5490,9 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
return false;
if (unlikely(vmx->fail)) {
- pr_info_ratelimited("%s failed vm entry %x\n", __func__,
- vmcs_read32(VM_INSTRUCTION_ERROR));
+ trace_kvm_nested_vmenter_failed(
+ "hardware VM-instruction error: ",
+ vmcs_read32(VM_INSTRUCTION_ERROR));
return true;
}
@@ -5303,10 +5535,10 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
return false;
case EXIT_REASON_TRIPLE_FAULT:
return true;
- case EXIT_REASON_PENDING_INTERRUPT:
- return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
+ case EXIT_REASON_INTERRUPT_WINDOW:
+ return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
case EXIT_REASON_NMI_WINDOW:
- return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
+ return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
case EXIT_REASON_TASK_SWITCH:
return true;
case EXIT_REASON_CPUID:
@@ -5420,6 +5652,10 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
case EXIT_REASON_ENCLS:
/* SGX is never exposed to L1 */
return false;
+ case EXIT_REASON_UMWAIT:
+ case EXIT_REASON_TPAUSE:
+ return nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
default:
return true;
}
@@ -5695,7 +5931,7 @@ error_guest_mode:
return ret;
}
-void nested_vmx_vcpu_setup(void)
+void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
if (enable_shadow_vmcs) {
vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
@@ -5790,8 +6026,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
msrs->procbased_ctls_low =
CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
msrs->procbased_ctls_high &=
- CPU_BASED_VIRTUAL_INTR_PENDING |
- CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_INTR_WINDOW_EXITING |
+ CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING |
@@ -5976,23 +6212,23 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
init_vmcs_shadow_fields();
}
- exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear,
- exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
- exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld,
- exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst,
- exit_handlers[EXIT_REASON_VMREAD] = handle_vmread,
- exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume,
- exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite,
- exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff,
- exit_handlers[EXIT_REASON_VMON] = handle_vmon,
- exit_handlers[EXIT_REASON_INVEPT] = handle_invept,
- exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid,
- exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc,
+ exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
+ exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
+ exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
+ exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
+ exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
+ exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
+ exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
+ exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff;
+ exit_handlers[EXIT_REASON_VMON] = handle_vmon;
+ exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
+ exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
+ exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;
kvm_x86_ops->check_nested_events = vmx_check_nested_events;
kvm_x86_ops->get_nested_state = vmx_get_nested_state;
kvm_x86_ops->set_nested_state = vmx_set_nested_state;
- kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages,
+ kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages;
kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 187d39bf0bf1..fc874d4ead0f 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -6,14 +6,25 @@
#include "vmcs12.h"
#include "vmx.h"
+/*
+ * Status returned by nested_vmx_enter_non_root_mode():
+ */
+enum nvmx_vmentry_status {
+ NVMX_VMENTRY_SUCCESS, /* Entered VMX non-root mode */
+ NVMX_VMENTRY_VMFAIL, /* Consistency check VMFail */
+ NVMX_VMENTRY_VMEXIT, /* Consistency check VMExit */
+ NVMX_VMENTRY_KVM_INTERNAL_ERROR,/* KVM internal error */
+};
+
void vmx_leave_nested(struct kvm_vcpu *vcpu);
void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
bool apicv);
void nested_vmx_hardware_unsetup(void);
__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *));
-void nested_vmx_vcpu_setup(void);
+void nested_vmx_set_vmcs_shadowing_bitmap(void);
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
-int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry);
+enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+ bool from_vmentry);
bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason);
void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
u32 exit_intr_info, unsigned long exit_qualification);
@@ -22,6 +33,7 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
+void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu);
static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
@@ -245,7 +257,7 @@ static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
return ((val & fixed1) | fixed0) == val;
}
-static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+static inline bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
@@ -259,7 +271,7 @@ static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
return fixed_bits_valid(val, fixed0, fixed1);
}
-static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+static inline bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
@@ -267,7 +279,7 @@ static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
return fixed_bits_valid(val, fixed0, fixed1);
}
-static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
+static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h
index 2200fb698dd0..45eaedee2ac0 100644
--- a/arch/x86/kvm/vmx/ops.h
+++ b/arch/x86/kvm/vmx/ops.h
@@ -11,8 +11,13 @@
#include "vmcs.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x)
-#define __ex_clear(x, reg) \
- ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg)
+
+asmlinkage void vmread_error(unsigned long field, bool fault);
+void vmwrite_error(unsigned long field, unsigned long value);
+void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
+void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
+void invvpid_error(unsigned long ext, u16 vpid, gva_t gva);
+void invept_error(unsigned long ext, u64 eptp, gpa_t gpa);
static __always_inline void vmcs_check16(unsigned long field)
{
@@ -62,8 +67,22 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
{
unsigned long value;
- asm volatile (__ex_clear("vmread %1, %0", "%k0")
- : "=r"(value) : "r"(field));
+ asm volatile("1: vmread %2, %1\n\t"
+ ".byte 0x3e\n\t" /* branch taken hint */
+ "ja 3f\n\t"
+ "mov %2, %%" _ASM_ARG1 "\n\t"
+ "xor %%" _ASM_ARG2 ", %%" _ASM_ARG2 "\n\t"
+ "2: call vmread_error\n\t"
+ "xor %k1, %k1\n\t"
+ "3:\n\t"
+
+ ".pushsection .fixup, \"ax\"\n\t"
+ "4: mov %2, %%" _ASM_ARG1 "\n\t"
+ "mov $1, %%" _ASM_ARG2 "\n\t"
+ "jmp 2b\n\t"
+ ".popsection\n\t"
+ _ASM_EXTABLE(1b, 4b)
+ : ASM_CALL_CONSTRAINT, "=r"(value) : "r"(field) : "cc");
return value;
}
@@ -103,21 +122,39 @@ static __always_inline unsigned long vmcs_readl(unsigned long field)
return __vmcs_readl(field);
}
-static noinline void vmwrite_error(unsigned long field, unsigned long value)
-{
- printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
- field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
- dump_stack();
-}
+#define vmx_asm1(insn, op1, error_args...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \
+ ".byte 0x2e\n\t" /* branch not taken hint */ \
+ "jna %l[error]\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ : : op1 : "cc" : error, fault); \
+ return; \
+error: \
+ insn##_error(error_args); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+#define vmx_asm2(insn, op1, op2, error_args...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \
+ ".byte 0x2e\n\t" /* branch not taken hint */ \
+ "jna %l[error]\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ : : op1, op2 : "cc" : error, fault); \
+ return; \
+error: \
+ insn##_error(error_args); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
{
- bool error;
-
- asm volatile (__ex("vmwrite %2, %1") CC_SET(na)
- : CC_OUT(na) (error) : "r"(field), "rm"(value));
- if (unlikely(error))
- vmwrite_error(field, value);
+ vmx_asm2(vmwrite, "r"(field), "rm"(value), field, value);
}
static __always_inline void vmcs_write16(unsigned long field, u16 value)
@@ -182,28 +219,18 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
static inline void vmcs_clear(struct vmcs *vmcs)
{
u64 phys_addr = __pa(vmcs);
- bool error;
- asm volatile (__ex("vmclear %1") CC_SET(na)
- : CC_OUT(na) (error) : "m"(phys_addr));
- if (unlikely(error))
- printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
- vmcs, phys_addr);
+ vmx_asm1(vmclear, "m"(phys_addr), vmcs, phys_addr);
}
static inline void vmcs_load(struct vmcs *vmcs)
{
u64 phys_addr = __pa(vmcs);
- bool error;
if (static_branch_unlikely(&enable_evmcs))
return evmcs_load(phys_addr);
- asm volatile (__ex("vmptrld %1") CC_SET(na)
- : CC_OUT(na) (error) : "m"(phys_addr));
- if (unlikely(error))
- printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
- vmcs, phys_addr);
+ vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr);
}
static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
@@ -213,11 +240,8 @@ static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
u64 rsvd : 48;
u64 gva;
} operand = { vpid, 0, gva };
- bool error;
- asm volatile (__ex("invvpid %2, %1") CC_SET(na)
- : CC_OUT(na) (error) : "r"(ext), "m"(operand));
- BUG_ON(error);
+ vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva);
}
static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
@@ -225,11 +249,8 @@ static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
struct {
u64 eptp, gpa;
} operand = {eptp, gpa};
- bool error;
- asm volatile (__ex("invept %2, %1") CC_SET(na)
- : CC_OUT(na) (error) : "r"(ext), "m"(operand));
- BUG_ON(error);
+ vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa);
}
static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 4dea0e0e7e39..fd21cdb10b79 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -15,6 +15,7 @@
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
+#include "nested.h"
#include "pmu.h"
static struct kvm_event_hw_type_mapping intel_arch_events[] = {
@@ -46,6 +47,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
if (old_ctrl == new_ctrl)
continue;
+ __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
reprogram_fixed_counter(pmc, new_ctrl, i);
}
@@ -84,10 +86,14 @@ static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
static unsigned intel_find_fixed_event(int idx)
{
- if (idx >= ARRAY_SIZE(fixed_pmc_events))
+ u32 event;
+ size_t size = ARRAY_SIZE(fixed_pmc_events);
+
+ if (idx >= size)
return PERF_COUNT_HW_MAX;
- return intel_arch_events[fixed_pmc_events[idx]].event_type;
+ event = fixed_pmc_events[array_index_nospec(idx, size)];
+ return intel_arch_events[event].event_type;
}
/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
@@ -111,7 +117,7 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
}
/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
-static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+static int intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
bool fixed = idx & (1u << 30);
@@ -122,22 +128,26 @@ static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
(fixed && idx >= pmu->nr_arch_fixed_counters);
}
-static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu,
- unsigned idx, u64 *mask)
+static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
+ unsigned int idx, u64 *mask)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
bool fixed = idx & (1u << 30);
struct kvm_pmc *counters;
+ unsigned int num_counters;
idx &= ~(3u << 30);
- if (!fixed && idx >= pmu->nr_arch_gp_counters)
- return NULL;
- if (fixed && idx >= pmu->nr_arch_fixed_counters)
+ if (fixed) {
+ counters = pmu->fixed_counters;
+ num_counters = pmu->nr_arch_fixed_counters;
+ } else {
+ counters = pmu->gp_counters;
+ num_counters = pmu->nr_arch_gp_counters;
+ }
+ if (idx >= num_counters)
return NULL;
- counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
*mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP];
-
- return &counters[idx];
+ return &counters[array_index_nospec(idx, num_counters)];
}
static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
@@ -162,6 +172,18 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
return ret;
}
+static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ pmc = get_fixed_pmc(pmu, msr);
+ pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0);
+ pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0);
+
+ return pmc;
+}
+
static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -223,7 +245,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_CORE_PERF_GLOBAL_CTRL:
if (pmu->global_ctrl == data)
return 0;
- if (!(data & pmu->global_ctrl_mask)) {
+ if (kvm_valid_perf_global_ctrl(pmu, data)) {
global_ctrl_changed(pmu, data);
return 0;
}
@@ -238,13 +260,12 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
- if (msr_info->host_initiated)
- pmc->counter = data;
- else
- pmc->counter = (s32)data;
+ if (!msr_info->host_initiated)
+ data = (s64)(s32)data;
+ pmc->counter += data - pmc_read_counter(pmc);
return 0;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
- pmc->counter = data;
+ pmc->counter += data - pmc_read_counter(pmc);
return 0;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
if (data == pmc->eventsel)
@@ -262,6 +283,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct x86_pmu_capability x86_pmu;
struct kvm_cpuid_entry2 *entry;
union cpuid10_eax eax;
union cpuid10_edx edx;
@@ -283,8 +305,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
if (!pmu->version)
return;
+ perf_get_x86_pmu_capability(&x86_pmu);
+
pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
- INTEL_PMC_MAX_GENERIC);
+ x86_pmu.num_counters_gp);
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
pmu->available_event_types = ~entry->ebx &
((1ull << eax.split.mask_length) - 1);
@@ -294,7 +318,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
} else {
pmu->nr_arch_fixed_counters =
min_t(int, edx.split.num_counters_fixed,
- INTEL_PMC_MAX_FIXED);
+ x86_pmu.num_counters_fixed);
pmu->counter_bitmask[KVM_PMC_FIXED] =
((u64)1 << edx.split.bit_width_fixed) - 1;
}
@@ -314,6 +338,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
(boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
(entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
+
+ bitmap_set(pmu->all_valid_pmc_idx,
+ 0, pmu->nr_arch_gp_counters);
+ bitmap_set(pmu->all_valid_pmc_idx,
+ INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
+
+ nested_vmx_pmu_entry_exit_ctls_update(vcpu);
}
static void intel_pmu_init(struct kvm_vcpu *vcpu)
@@ -325,12 +356,14 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
+ pmu->gp_counters[i].current_config = 0;
}
for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
pmu->fixed_counters[i].type = KVM_PMC_FIXED;
pmu->fixed_counters[i].vcpu = vcpu;
pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
+ pmu->fixed_counters[i].current_config = 0;
}
}
@@ -363,8 +396,9 @@ struct kvm_pmu_ops intel_pmu_ops = {
.find_fixed_event = intel_find_fixed_event,
.pmc_is_enabled = intel_pmc_is_enabled,
.pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
+ .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
.msr_idx_to_pmc = intel_msr_idx_to_pmc,
- .is_valid_msr_idx = intel_is_valid_msr_idx,
+ .is_valid_rdpmc_ecx = intel_is_valid_rdpmc_ecx,
.is_valid_msr = intel_is_valid_msr,
.get_msr = intel_pmu_get_msr,
.set_msr = intel_pmu_set_msr,
diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
index eb1ecd16fd22..cad128d1657b 100644
--- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h
+++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
@@ -23,12 +23,12 @@ BUILD_BUG_ON(1)
*
* When adding or removing fields here, note that shadowed
* fields must always be synced by prepare_vmcs02, not just
- * prepare_vmcs02_full.
+ * prepare_vmcs02_rare.
*/
/*
* Keeping the fields ordered by size is an attempt at improving
- * branch prediction in vmcs_read_any and vmcs_write_any.
+ * branch prediction in vmcs12_read_any and vmcs12_write_any.
*/
/* 16-bits */
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 4010d519eb8c..81ada2ce99e7 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -43,7 +43,7 @@
* they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump
* to vmx_vmexit.
*/
-ENTRY(vmx_vmenter)
+SYM_FUNC_START(vmx_vmenter)
/* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */
je 2f
@@ -65,7 +65,7 @@ ENTRY(vmx_vmenter)
_ASM_EXTABLE(1b, 5b)
_ASM_EXTABLE(2b, 5b)
-ENDPROC(vmx_vmenter)
+SYM_FUNC_END(vmx_vmenter)
/**
* vmx_vmexit - Handle a VMX VM-Exit
@@ -77,7 +77,7 @@ ENDPROC(vmx_vmenter)
* here after hardware loads the host's state, i.e. this is the destination
* referred to by VMCS.HOST_RIP.
*/
-ENTRY(vmx_vmexit)
+SYM_FUNC_START(vmx_vmexit)
#ifdef CONFIG_RETPOLINE
ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE
/* Preserve guest's RAX, it's used to stuff the RSB. */
@@ -90,18 +90,18 @@ ENTRY(vmx_vmexit)
.Lvmexit_skip_rsb:
#endif
ret
-ENDPROC(vmx_vmexit)
+SYM_FUNC_END(vmx_vmexit)
/**
* __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode
- * @vmx: struct vcpu_vmx *
+ * @vmx: struct vcpu_vmx * (forwarded to vmx_update_host_rsp)
* @regs: unsigned long * (to guest registers)
* @launched: %true if the VMCS has been launched
*
* Returns:
* 0 on VM-Exit, 1 on VM-Fail
*/
-ENTRY(__vmx_vcpu_run)
+SYM_FUNC_START(__vmx_vcpu_run)
push %_ASM_BP
mov %_ASM_SP, %_ASM_BP
#ifdef CONFIG_X86_64
@@ -151,7 +151,7 @@ ENTRY(__vmx_vcpu_run)
mov VCPU_R14(%_ASM_AX), %r14
mov VCPU_R15(%_ASM_AX), %r15
#endif
- /* Load guest RAX. This kills the vmx_vcpu pointer! */
+ /* Load guest RAX. This kills the @regs pointer! */
mov VCPU_RAX(%_ASM_AX), %_ASM_AX
/* Enter guest mode */
@@ -233,4 +233,4 @@ ENTRY(__vmx_vcpu_run)
/* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */
2: mov $1, %eax
jmp 1b
-ENDPROC(__vmx_vcpu_run)
+SYM_FUNC_END(__vmx_vcpu_run)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 42ed3faa6af8..3be25ecae145 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -106,8 +106,6 @@ module_param(enable_apicv, bool, S_IRUGO);
static bool __read_mostly nested = 1;
module_param(nested, bool, S_IRUGO);
-static u64 __read_mostly host_xss;
-
bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);
@@ -209,6 +207,11 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
struct page *page;
unsigned int i;
+ if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+ return 0;
+ }
+
if (!enable_ept) {
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
return 0;
@@ -343,6 +346,48 @@ static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bit
void vmx_vmexit(void);
+#define vmx_insn_failed(fmt...) \
+do { \
+ WARN_ONCE(1, fmt); \
+ pr_warn_ratelimited(fmt); \
+} while (0)
+
+asmlinkage void vmread_error(unsigned long field, bool fault)
+{
+ if (fault)
+ kvm_spurious_fault();
+ else
+ vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
+}
+
+noinline void vmwrite_error(unsigned long field, unsigned long value)
+{
+ vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
+ field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
+}
+
+noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
+{
+ vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
+}
+
+noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
+{
+ vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
+}
+
+noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
+{
+ vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
+ ext, vpid, gva);
+}
+
+noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
+{
+ vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
+ ext, eptp, gpa);
+}
+
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
@@ -403,6 +448,7 @@ const u32 vmx_msr_index[] = {
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
+ MSR_IA32_TSX_CTRL,
};
#if IS_ENABLED(CONFIG_HYPERV)
@@ -486,6 +532,31 @@ static int hv_remote_flush_tlb(struct kvm *kvm)
return hv_remote_flush_tlb_with_range(kvm, NULL);
}
+static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
+{
+ struct hv_enlightened_vmcs *evmcs;
+ struct hv_partition_assist_pg **p_hv_pa_pg =
+ &vcpu->kvm->arch.hyperv.hv_pa_pg;
+ /*
+ * Synthetic VM-Exit is not enabled in current code and so All
+ * evmcs in singe VM shares same assist page.
+ */
+ if (!*p_hv_pa_pg)
+ *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+
+ if (!*p_hv_pa_pg)
+ return -ENOMEM;
+
+ evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
+
+ evmcs->partition_assist_page =
+ __pa(*p_hv_pa_pg);
+ evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
+ evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
+
+ return 0;
+}
+
#endif /* IS_ENABLED(CONFIG_HYPERV) */
/*
@@ -566,6 +637,23 @@ struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
return NULL;
}
+static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data)
+{
+ int ret = 0;
+
+ u64 old_msr_data = msr->data;
+ msr->data = data;
+ if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
+ preempt_disable();
+ ret = kvm_set_shared_msr(msr->index, msr->data,
+ msr->mask);
+ preempt_enable();
+ if (ret)
+ msr->data = old_msr_data;
+ }
+ return ret;
+}
+
void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
{
vmcs_clear(loaded_vmcs->vmcs);
@@ -654,8 +742,8 @@ static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
bool ret;
u32 mask = 1 << (seg * SEG_FIELD_NR + field);
- if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
- vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
+ if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
+ kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
vmx->segment_cache.bitmask = 0;
}
ret = vmx->segment_cache.bitmask & mask;
@@ -763,7 +851,7 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
vm_exit_controls_clearbit(vmx, exit);
}
-static int find_msr(struct vmx_msrs *m, unsigned int msr)
+int vmx_find_msr_index(struct vmx_msrs *m, u32 msr)
{
unsigned int i;
@@ -797,7 +885,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
}
break;
}
- i = find_msr(&m->guest, msr);
+ i = vmx_find_msr_index(&m->guest, msr);
if (i < 0)
goto skip_guest;
--m->guest.nr;
@@ -805,7 +893,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
skip_guest:
- i = find_msr(&m->host, msr);
+ i = vmx_find_msr_index(&m->host, msr);
if (i < 0)
return;
@@ -864,12 +952,12 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
}
- i = find_msr(&m->guest, msr);
+ i = vmx_find_msr_index(&m->guest, msr);
if (!entry_only)
- j = find_msr(&m->host, msr);
+ j = vmx_find_msr_index(&m->host, msr);
- if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) ||
- (j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) {
+ if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) ||
+ (j < 0 && m->host.nr == NR_LOADSTORE_MSRS)) {
printk_once(KERN_WARNING "Not enough msr switch entries. "
"Can't add msr %x\n", msr);
return;
@@ -897,17 +985,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
u64 guest_efer = vmx->vcpu.arch.efer;
u64 ignore_bits = 0;
- if (!enable_ept) {
- /*
- * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
- * host CPUID is more efficient than testing guest CPUID
- * or CR4. Host SMEP is anyway a requirement for guest SMEP.
- */
- if (boot_cpu_has(X86_FEATURE_SMEP))
- guest_efer |= EFER_NX;
- else if (!(guest_efer & EFER_NX))
- ignore_bits |= EFER_NX;
- }
+ /* Shadow paging assumes NX to be available. */
+ if (!enable_ept)
+ guest_efer |= EFER_NX;
/*
* LMA and LME handled by hardware; SCE meaningless outside long mode.
@@ -977,6 +1057,12 @@ static unsigned long segment_base(u16 selector)
}
#endif
+static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
+{
+ return (pt_mode == PT_MODE_HOST_GUEST) &&
+ !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
+}
+
static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
u32 i;
@@ -1204,6 +1290,18 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
return;
+ /*
+ * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+ * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
+ * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
+ * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
+ * correctly.
+ */
+ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
+ pi_clear_sn(pi_desc);
+ goto after_clear_sn;
+ }
+
/* The full case. */
do {
old.control = new.control = pi_desc->control;
@@ -1219,6 +1317,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
} while (cmpxchg64(&pi_desc->control, old.control,
new.control) != old.control);
+after_clear_sn:
+
/*
* Clear SN before reading the bitmap. The VT-d firmware
* writes the bitmap and reads SN atomically (5.2.3 in the
@@ -1227,7 +1327,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
*/
smp_mb__after_atomic();
- if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS))
+ if (!pi_is_pir_empty(pi_desc))
pi_set_on(pi_desc);
}
@@ -1274,14 +1374,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
(unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
- /*
- * VM exits change the host TR limit to 0x67 after a VM
- * exit. This is okay, since 0x67 covers everything except
- * the IO bitmap and have have code to handle the IO bitmap
- * being lost after a VM exit.
- */
- BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
-
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
@@ -1336,39 +1428,46 @@ static bool emulation_required(struct kvm_vcpu *vcpu)
return emulate_invalid_guest_state && !guest_state_valid(vcpu);
}
-static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
-
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long rflags, save_rflags;
- if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
- __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
+ kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
rflags = vmcs_readl(GUEST_RFLAGS);
- if (to_vmx(vcpu)->rmode.vm86_active) {
+ if (vmx->rmode.vm86_active) {
rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
- save_rflags = to_vmx(vcpu)->rmode.save_rflags;
+ save_rflags = vmx->rmode.save_rflags;
rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
}
- to_vmx(vcpu)->rflags = rflags;
+ vmx->rflags = rflags;
}
- return to_vmx(vcpu)->rflags;
+ return vmx->rflags;
}
void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
- unsigned long old_rflags = vmx_get_rflags(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long old_rflags;
- __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
- to_vmx(vcpu)->rflags = rflags;
- if (to_vmx(vcpu)->rmode.vm86_active) {
- to_vmx(vcpu)->rmode.save_rflags = rflags;
+ if (enable_unrestricted_guest) {
+ kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
+ vmx->rflags = rflags;
+ vmcs_writel(GUEST_RFLAGS, rflags);
+ return;
+ }
+
+ old_rflags = vmx_get_rflags(vcpu);
+ vmx->rflags = rflags;
+ if (vmx->rmode.vm86_active) {
+ vmx->rmode.save_rflags = rflags;
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
}
vmcs_writel(GUEST_RFLAGS, rflags);
- if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
- to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
+ if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
+ vmx->emulation_required = emulation_required(vcpu);
}
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
@@ -1472,17 +1571,32 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
-
-static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
+static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
unsigned long rip;
- rip = kvm_rip_read(vcpu);
- rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- kvm_rip_write(vcpu, rip);
+ /*
+ * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
+ * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
+ * set when EPT misconfig occurs. In practice, real hardware updates
+ * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
+ * (namely Hyper-V) don't set it due to it being undefined behavior,
+ * i.e. we end up advancing IP with some random value.
+ */
+ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
+ to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
+ rip = kvm_rip_read(vcpu);
+ rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ kvm_rip_write(vcpu, rip);
+ } else {
+ if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
+ return 0;
+ }
/* skipping an emulated instruction also counts */
vmx_set_interrupt_shadow(vcpu, 0);
+
+ return 1;
}
static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
@@ -1517,8 +1631,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
int inc_eip = 0;
if (kvm_exception_is_soft(nr))
inc_eip = vcpu->arch.event_exit_inst_len;
- if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
return;
}
@@ -1591,6 +1704,9 @@ static void setup_msrs(struct vcpu_vmx *vmx)
index = __find_msr_index(vmx, MSR_TSC_AUX);
if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
move_msr_up(vmx, index, save_nmsrs++);
+ index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL);
+ if (index >= 0)
+ move_msr_up(vmx, index, save_nmsrs++);
vmx->save_nmsrs = save_nmsrs;
vmx->guest_msrs_ready = false;
@@ -1604,7 +1720,7 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
if (is_guest_mode(vcpu) &&
- (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+ (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
return vcpu->arch.tsc_offset;
@@ -1622,7 +1738,7 @@ static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
* to the newly set TSC to get L2's TSC.
*/
if (is_guest_mode(vcpu) &&
- (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+ (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
g_tsc_offset = vmcs12->tsc_offset;
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
@@ -1661,8 +1777,6 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
default:
return 1;
}
-
- return 0;
}
/*
@@ -1690,6 +1804,17 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
#endif
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_info);
+ case MSR_IA32_TSX_CTRL:
+ if (!msr_info->host_initiated &&
+ !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
+ return 1;
+ goto find_shared_msr;
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
+ return 1;
+
+ msr_info->data = vmx->msr_ia32_umwait_control;
+ break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
@@ -1716,25 +1841,29 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MCG_EXT_CTL:
if (!msr_info->host_initiated &&
!(vmx->msr_ia32_feature_control &
- FEATURE_CONTROL_LMCE))
+ FEAT_CTL_LMCE_ENABLED))
return 1;
msr_info->data = vcpu->arch.mcg_ext_ctl;
break;
- case MSR_IA32_FEATURE_CONTROL:
+ case MSR_IA32_FEAT_CTL:
msr_info->data = vmx->msr_ia32_feature_control;
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
return 1;
- return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
- &msr_info->data);
- case MSR_IA32_XSS:
- if (!vmx_xsaves_supported() ||
- (!msr_info->host_initiated &&
- !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
+ if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
+ &msr_info->data))
return 1;
- msr_info->data = vcpu->arch.ia32_xss;
+ /*
+ * Enlightened VMCS v1 doesn't have certain fields, but buggy
+ * Hyper-V versions are still trying to use corresponding
+ * features when they are exposed. Filter out the essential
+ * minimum.
+ */
+ if (!msr_info->host_initiated &&
+ vmx->nested.enlightened_vmcs_enabled)
+ nested_evmcs_filter_control_msr(msr_info->index,
+ &msr_info->data);
break;
case MSR_IA32_RTIT_CTL:
if (pt_mode != PT_MODE_HOST_GUEST)
@@ -1786,8 +1915,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
- /* Else, falls through */
+ goto find_shared_msr;
default:
+ find_shared_msr:
msr = find_msr_entry(vmx, msr_info->index);
if (msr) {
msr_info->data = msr->data;
@@ -1800,7 +1930,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
/*
- * Writes msr value into into the appropriate "register".
+ * Writes msr value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
@@ -1863,17 +1993,25 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vmcs_write64(GUEST_BNDCFGS, data);
break;
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
+ return 1;
+
+ /* The reserved bit 1 and non-32 bit [63:32] should be zero */
+ if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
+ return 1;
+
+ vmx->msr_ia32_umwait_control = data;
+ break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
return 1;
- /* The STIBP bit doesn't fault even if it's not advertised */
- if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
+ if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
return 1;
vmx->spec_ctrl = data;
-
if (!data)
break;
@@ -1884,7 +2022,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
*
* For nested:
* The handling of the MSR bitmap for L2 guests is done in
- * nested_vmx_merge_msr_bitmap. We should not touch the
+ * nested_vmx_prepare_msr_bitmap. We should not touch the
* vmcs02.msr_bitmap here since it gets completely overwritten
* in the merging. We update the vmcs01 here for L1 as well
* since it will end up touching the MSR anyway now.
@@ -1893,6 +2031,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
MSR_IA32_SPEC_CTRL,
MSR_TYPE_RW);
break;
+ case MSR_IA32_TSX_CTRL:
+ if (!msr_info->host_initiated &&
+ !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
+ return 1;
+ if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
+ return 1;
+ goto find_shared_msr;
case MSR_IA32_PRED_CMD:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
@@ -1900,7 +2045,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & ~PRED_CMD_IBPB)
return 1;
-
+ if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL))
+ return 1;
if (!data)
break;
@@ -1913,7 +2059,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
*
* For nested:
* The handling of the MSR bitmap for L2 guests is done in
- * nested_vmx_merge_msr_bitmap. We should not touch the
+ * nested_vmx_prepare_msr_bitmap. We should not touch the
* vmcs02.msr_bitmap here since it gets completely overwritten
* in the merging.
*/
@@ -1941,15 +2087,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MCG_EXT_CTL:
if ((!msr_info->host_initiated &&
!(to_vmx(vcpu)->msr_ia32_feature_control &
- FEATURE_CONTROL_LMCE)) ||
+ FEAT_CTL_LMCE_ENABLED)) ||
(data & ~MCG_EXT_CTL_LMCE_EN))
return 1;
vcpu->arch.mcg_ext_ctl = data;
break;
- case MSR_IA32_FEATURE_CONTROL:
+ case MSR_IA32_FEAT_CTL:
if (!vmx_feature_control_msr_valid(vcpu, data) ||
(to_vmx(vcpu)->msr_ia32_feature_control &
- FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
+ FEAT_CTL_LOCKED && !msr_info->host_initiated))
return 1;
vmx->msr_ia32_feature_control = data;
if (msr_info->host_initiated && data == 0)
@@ -1961,25 +2107,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!nested_vmx_allowed(vcpu))
return 1;
return vmx_set_vmx_msr(vcpu, msr_index, data);
- case MSR_IA32_XSS:
- if (!vmx_xsaves_supported() ||
- (!msr_info->host_initiated &&
- !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
- return 1;
- /*
- * The only supported bit as of Skylake is bit 8, but
- * it is not supported on KVM.
- */
- if (data != 0)
- return 1;
- vcpu->arch.ia32_xss = data;
- if (vcpu->arch.ia32_xss != host_xss)
- add_atomic_switch_msr(vmx, MSR_IA32_XSS,
- vcpu->arch.ia32_xss, host_xss, false);
- else
- clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
- break;
case MSR_IA32_RTIT_CTL:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
vmx_rtit_ctl_check(vcpu, data) ||
@@ -1990,47 +2117,50 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
pt_update_intercept_for_msr(vmx);
break;
case MSR_IA32_RTIT_STATUS:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- (data & MSR_IA32_RTIT_STATUS_MASK))
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (data & MSR_IA32_RTIT_STATUS_MASK)
return 1;
vmx->pt_desc.guest.status = data;
break;
case MSR_IA32_RTIT_CR3_MATCH:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- !intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_cr3_filtering))
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_cr3_filtering))
return 1;
vmx->pt_desc.guest.cr3_match = data;
break;
case MSR_IA32_RTIT_OUTPUT_BASE:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- (!intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_topa_output) &&
- !intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_single_range_output)) ||
- (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK))
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_topa_output) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output))
+ return 1;
+ if (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK)
return 1;
vmx->pt_desc.guest.output_base = data;
break;
case MSR_IA32_RTIT_OUTPUT_MASK:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- (!intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_topa_output) &&
- !intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_single_range_output)))
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_topa_output) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output))
return 1;
vmx->pt_desc.guest.output_mask = data;
break;
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ if (!pt_can_write_msr(vmx))
+ return 1;
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_num_address_ranges)))
+ if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_num_address_ranges))
+ return 1;
+ if (is_noncanonical_address(data, vcpu))
return 1;
if (index % 2)
vmx->pt_desc.guest.addr_b[index / 2] = data;
@@ -2044,23 +2174,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
return 1;
- /* Else, falls through */
+ goto find_shared_msr;
+
default:
+ find_shared_msr:
msr = find_msr_entry(vmx, msr_index);
- if (msr) {
- u64 old_msr_data = msr->data;
- msr->data = data;
- if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
- preempt_disable();
- ret = kvm_set_shared_msr(msr->index, msr->data,
- msr->mask);
- preempt_enable();
- if (ret)
- msr->data = old_msr_data;
- }
- break;
- }
- ret = kvm_set_msr_common(vcpu, msr_info);
+ if (msr)
+ ret = vmx_set_guest_msr(vmx, msr, data);
+ else
+ ret = kvm_set_msr_common(vcpu, msr_info);
}
return ret;
@@ -2068,7 +2190,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ kvm_register_mark_available(vcpu, reg);
+
switch (reg) {
case VCPU_REGS_RSP:
vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
@@ -2080,7 +2203,12 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
if (enable_ept)
ept_save_pdptrs(vcpu);
break;
+ case VCPU_EXREG_CR3:
+ if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ break;
default:
+ WARN_ON_ONCE(1);
break;
}
}
@@ -2092,29 +2220,8 @@ static __init int cpu_has_kvm_support(void)
static __init int vmx_disabled_by_bios(void)
{
- u64 msr;
-
- rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
- if (msr & FEATURE_CONTROL_LOCKED) {
- /* launched w/ TXT and VMX disabled */
- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
- && tboot_enabled())
- return 1;
- /* launched w/o TXT and VMX only enabled w/ TXT */
- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
- && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
- && !tboot_enabled()) {
- printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
- "activate TXT before enabling KVM\n");
- return 1;
- }
- /* launched w/o TXT and VMX disabled */
- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
- && !tboot_enabled())
- return 1;
- }
-
- return 0;
+ return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !boot_cpu_has(X86_FEATURE_VMX);
}
static void kvm_cpu_vmxon(u64 addr)
@@ -2129,7 +2236,6 @@ static int hardware_enable(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- u64 old, test_bits;
if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;
@@ -2157,17 +2263,6 @@ static int hardware_enable(void)
*/
crash_enable_local_vmclear(cpu);
- rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
-
- test_bits = FEATURE_CONTROL_LOCKED;
- test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
- if (tboot_enabled())
- test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
-
- if ((old & test_bits) != test_bits) {
- /* enable and lock */
- wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
- }
kvm_cpu_vmxon(phys_addr);
if (enable_ept)
ept_sync_global();
@@ -2243,7 +2338,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_MOV_DR_EXITING |
- CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_USE_TSC_OFFSETTING |
CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING |
@@ -2280,6 +2375,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_PML |
SECONDARY_EXEC_TSC_SCALING |
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_PT_USE_GPA |
SECONDARY_EXEC_PT_CONCEAL_VMX |
SECONDARY_EXEC_ENABLE_VMFUNC |
@@ -2577,8 +2673,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmx->rmode.vm86_active = 0;
- vmx_segment_cache_clear(vmx);
-
vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
flags = vmcs_readl(GUEST_RFLAGS);
@@ -2750,13 +2844,6 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
}
-static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
-{
- if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
- vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
-}
-
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
@@ -2769,8 +2856,7 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- if (!test_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_dirty))
+ if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
return;
if (is_pae_paging(vcpu)) {
@@ -2792,10 +2878,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
}
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail);
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_dirty);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
}
static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
@@ -2804,8 +2887,8 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
- vmx_decache_cr3(vcpu);
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+ vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
if (!(cr0 & X86_CR0_PG)) {
/* From paging/starting to nonpaging */
exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
@@ -2864,6 +2947,9 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
static int get_ept_level(struct kvm_vcpu *vcpu)
{
+ /* Nested EPT currently only supports 4-level walks. */
+ if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
+ return 4;
if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
return 5;
return 4;
@@ -2886,6 +2972,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
struct kvm *kvm = vcpu->kvm;
+ bool update_guest_cr3 = true;
unsigned long guest_cr3;
u64 eptp;
@@ -2902,15 +2989,20 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
}
- if (enable_unrestricted_guest || is_paging(vcpu) ||
- is_guest_mode(vcpu))
- guest_cr3 = kvm_read_cr3(vcpu);
- else
+ /* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. */
+ if (is_guest_mode(vcpu))
+ update_guest_cr3 = false;
+ else if (!enable_unrestricted_guest && !is_paging(vcpu))
guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
+ else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ guest_cr3 = vcpu->arch.cr3;
+ else /* vmcs01.GUEST_CR3 is already up-to-date. */
+ update_guest_cr3 = false;
ept_load_pdptrs(vcpu);
}
- vmcs_writel(GUEST_CR3, guest_cr3);
+ if (update_guest_cr3)
+ vmcs_writel(GUEST_CR3, guest_cr3);
}
int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -3369,7 +3461,7 @@ out:
static int init_rmode_identity_map(struct kvm *kvm)
{
struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
- int i, idx, r = 0;
+ int i, r = 0;
kvm_pfn_t identity_map_pfn;
u32 tmp;
@@ -3377,7 +3469,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
mutex_lock(&kvm->slots_lock);
if (likely(kvm_vmx->ept_identity_pagetable_done))
- goto out2;
+ goto out;
if (!kvm_vmx->ept_identity_map_addr)
kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
@@ -3386,9 +3478,8 @@ static int init_rmode_identity_map(struct kvm *kvm)
r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
if (r < 0)
- goto out2;
+ goto out;
- idx = srcu_read_lock(&kvm->srcu);
r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
if (r < 0)
goto out;
@@ -3404,9 +3495,6 @@ static int init_rmode_identity_map(struct kvm *kvm)
kvm_vmx->ept_identity_pagetable_done = true;
out:
- srcu_read_unlock(&kvm->srcu, idx);
-
-out2:
mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -3644,11 +3732,6 @@ void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
}
}
-static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
-{
- return enable_apicv;
-}
-
static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3934,9 +4017,12 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (vmx_xsaves_supported()) {
/* Exposing XSAVES only when XSAVE is exposed */
bool xsaves_enabled =
+ boot_cpu_has(X86_FEATURE_XSAVE) &&
guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
+ vcpu->arch.xsaves_enabled = xsaves_enabled;
+
if (!xsaves_enabled)
exec_control &= ~SECONDARY_EXEC_XSAVES;
@@ -4016,6 +4102,23 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
}
}
+ if (vmx_waitpkg_supported()) {
+ bool waitpkg_enabled =
+ guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG);
+
+ if (!waitpkg_enabled)
+ exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+
+ if (nested) {
+ if (waitpkg_enabled)
+ vmx->nested.msrs.secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+ else
+ vmx->nested.msrs.secondary_ctls_high &=
+ ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+ }
+ }
+
vmx->secondary_exec_control = exec_control;
}
@@ -4026,20 +4129,19 @@ static void ept_set_mmio_spte_mask(void)
* of an EPT paging-structure entry is 110b (write/execute).
*/
kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
- VMX_EPT_MISCONFIG_WX_VALUE);
+ VMX_EPT_MISCONFIG_WX_VALUE, 0);
}
#define VMX_XSS_EXIT_BITMAP 0
/*
- * Sets up the vmcs for emulated real mode.
+ * Noting that the initialization of Guest-state Area of VMCS is in
+ * vmx_vcpu_reset().
*/
-static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
+static void init_vmcs(struct vcpu_vmx *vmx)
{
- int i;
-
if (nested)
- nested_vmx_vcpu_setup();
+ nested_vmx_set_vmcs_shadowing_bitmap();
if (cpu_has_vmx_msr_bitmap())
vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
@@ -4048,7 +4150,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
/* Control */
pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
- vmx->hv_deadline_tsc = -1;
exec_controls_set(vmx, vmx_exec_control(vmx));
@@ -4097,21 +4198,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
- for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
- u32 index = vmx_msr_index[i];
- u32 data_low, data_high;
- int j = vmx->nmsrs;
-
- if (rdmsr_safe(index, &data_low, &data_high) < 0)
- continue;
- if (wrmsr_safe(index, data_low, data_high) < 0)
- continue;
- vmx->guest_msrs[j].index = i;
- vmx->guest_msrs[j].data = 0;
- vmx->guest_msrs[j].mask = -1ull;
- ++vmx->nmsrs;
- }
-
vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
/* 22.2.1, 20.8.1 */
@@ -4122,6 +4208,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
set_cr4_guest_host_mask(vmx);
+ if (vmx->vpid != 0)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+
if (vmx_xsaves_supported())
vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
@@ -4150,8 +4239,10 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx->rmode.vm86_active = 0;
vmx->spec_ctrl = 0;
- vcpu->arch.microcode_version = 0x100000000ULL;
+ vmx->msr_ia32_umwait_control = 0;
+
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
+ vmx->hv_deadline_tsc = -1;
kvm_set_cr8(vcpu, 0);
if (!init_event) {
@@ -4221,9 +4312,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- if (vmx->vpid != 0)
- vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
-
cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
vmx->vcpu.arch.cr0 = cr0;
vmx_set_cr0(vcpu, cr0); /* enter rmode */
@@ -4239,7 +4327,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
- exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING);
+ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
}
static void enable_nmi_window(struct kvm_vcpu *vcpu)
@@ -4250,7 +4338,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
return;
}
- exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING);
+ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
}
static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -4266,8 +4354,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
int inc_eip = 0;
if (vcpu->arch.interrupt.soft)
inc_eip = vcpu->arch.event_exit_inst_len;
- if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
return;
}
intr = irq | INTR_INFO_VALID_MASK;
@@ -4303,8 +4390,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->nmi_known_unmasked = false;
if (vmx->rmode.vm86_active) {
- if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
return;
}
@@ -4377,8 +4463,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
if (enable_unrestricted_guest)
return 0;
- ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
- PAGE_SIZE * 3);
+ mutex_lock(&kvm->slots_lock);
+ ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
+ PAGE_SIZE * 3);
+ mutex_unlock(&kvm->slots_lock);
+
if (ret)
return ret;
to_kvm_vmx(kvm)->tss_addr = addr;
@@ -4431,7 +4520,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
* Cause the #SS fault with 0 error code in VM86 mode.
*/
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
- if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) {
+ if (kvm_emulate_instruction(vcpu, 0)) {
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
return kvm_vcpu_halt(vcpu);
@@ -4482,7 +4571,6 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
u32 intr_info, ex_no, error_code;
unsigned long cr2, rip, dr6;
u32 vect_info;
- enum emulation_result er;
vect_info = vmx->idt_vectoring_info;
intr_info = vmx->exit_intr_info;
@@ -4499,13 +4587,17 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
WARN_ON_ONCE(!enable_vmware_backdoor);
- er = kvm_emulate_instruction(vcpu,
- EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
- if (er == EMULATE_USER_EXIT)
- return 0;
- else if (er != EMULATE_DONE)
+
+ /*
+ * VMware backdoor emulation on #GP interception only handles
+ * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
+ * error code on #GP.
+ */
+ if (error_code) {
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
- return 1;
+ return 1;
+ }
+ return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
}
/*
@@ -4547,7 +4639,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
vcpu->arch.dr6 &= ~DR_TRAP_BITS;
vcpu->arch.dr6 |= dr6 | DR6_RTM;
if (is_icebp(intr_info))
- skip_emulated_instruction(vcpu);
+ WARN_ON(!skip_emulated_instruction(vcpu));
kvm_queue_exception(vcpu, DB_VECTOR);
return 1;
@@ -4577,7 +4669,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
return 0;
}
-static int handle_external_interrupt(struct kvm_vcpu *vcpu)
+static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.irq_exits;
return 1;
@@ -4602,7 +4694,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
++vcpu->stat.io_exits;
if (string)
- return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, 0);
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
@@ -4676,7 +4768,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
static int handle_desc(struct kvm_vcpu *vcpu)
{
WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
- return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, 0);
}
static int handle_cr(struct kvm_vcpu *vcpu)
@@ -4849,50 +4941,6 @@ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
vmcs_writel(GUEST_DR7, val);
}
-static int handle_cpuid(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_cpuid(vcpu);
-}
-
-static int handle_rdmsr(struct kvm_vcpu *vcpu)
-{
- u32 ecx = kvm_rcx_read(vcpu);
- struct msr_data msr_info;
-
- msr_info.index = ecx;
- msr_info.host_initiated = false;
- if (vmx_get_msr(vcpu, &msr_info)) {
- trace_kvm_msr_read_ex(ecx);
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
- trace_kvm_msr_read(ecx, msr_info.data);
-
- kvm_rax_write(vcpu, msr_info.data & -1u);
- kvm_rdx_write(vcpu, (msr_info.data >> 32) & -1u);
- return kvm_skip_emulated_instruction(vcpu);
-}
-
-static int handle_wrmsr(struct kvm_vcpu *vcpu)
-{
- struct msr_data msr;
- u32 ecx = kvm_rcx_read(vcpu);
- u64 data = kvm_read_edx_eax(vcpu);
-
- msr.data = data;
- msr.index = ecx;
- msr.host_initiated = false;
- if (kvm_set_msr(vcpu, &msr) != 0) {
- trace_kvm_msr_write_ex(ecx, data);
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
- trace_kvm_msr_write(ecx, data);
- return kvm_skip_emulated_instruction(vcpu);
-}
-
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
kvm_apic_update_ppr(vcpu);
@@ -4901,7 +4949,7 @@ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
static int handle_interrupt_window(struct kvm_vcpu *vcpu)
{
- exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING);
+ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -4909,11 +4957,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
return 1;
}
-static int handle_halt(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_halt(vcpu);
-}
-
static int handle_vmcall(struct kvm_vcpu *vcpu)
{
return kvm_emulate_hypercall(vcpu);
@@ -4921,7 +4964,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
static int handle_invd(struct kvm_vcpu *vcpu)
{
- return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, 0);
}
static int handle_invlpg(struct kvm_vcpu *vcpu)
@@ -4955,20 +4998,6 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
return 1;
}
-static int handle_xsaves(struct kvm_vcpu *vcpu)
-{
- kvm_skip_emulated_instruction(vcpu);
- WARN(1, "this should never happen\n");
- return 1;
-}
-
-static int handle_xrstors(struct kvm_vcpu *vcpu)
-{
- kvm_skip_emulated_instruction(vcpu);
- WARN(1, "this should never happen\n");
- return 1;
-}
-
static int handle_apic_access(struct kvm_vcpu *vcpu)
{
if (likely(fasteoi)) {
@@ -4988,7 +5017,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
}
- return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, 0);
}
static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
@@ -5057,23 +5086,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
type != INTR_TYPE_EXT_INTR &&
type != INTR_TYPE_NMI_INTR))
- skip_emulated_instruction(vcpu);
-
- if (kvm_task_switch(vcpu, tss_selector,
- type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
- has_error_code, error_code) == EMULATE_FAIL) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- return 0;
- }
+ WARN_ON(!skip_emulated_instruction(vcpu));
/*
* TODO: What about debug traps on tss switch?
* Are we supposed to inject them and update dr6?
*/
-
- return 1;
+ return kvm_task_switch(vcpu, tss_selector,
+ type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
+ reason, has_error_code, error_code);
}
static int handle_ept_violation(struct kvm_vcpu *vcpu)
@@ -5132,21 +5153,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
if (!is_guest_mode(vcpu) &&
!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
trace_kvm_fast_mmio(gpa);
- /*
- * Doing kvm_skip_emulated_instruction() depends on undefined
- * behavior: Intel's manual doesn't mandate
- * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG
- * occurs and while on real hardware it was observed to be set,
- * other hypervisors (namely Hyper-V) don't set it, we end up
- * advancing IP with some random value. Disable fast mmio when
- * running nested and keep it for real hardware in hope that
- * VM_EXIT_INSTRUCTION_LEN will always be set correctly.
- */
- if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
- return kvm_skip_emulated_instruction(vcpu);
- else
- return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) ==
- EMULATE_DONE;
+ return kvm_skip_emulated_instruction(vcpu);
}
return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
@@ -5155,7 +5162,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
WARN_ON_ONCE(!enable_vnmi);
- exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING);
+ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
++vcpu->stat.nmi_window_exits;
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5165,8 +5172,6 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- enum emulation_result err = EMULATE_DONE;
- int ret = 1;
bool intr_window_requested;
unsigned count = 130;
@@ -5178,7 +5183,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
intr_window_requested = exec_controls_get(vmx) &
- CPU_BASED_VIRTUAL_INTR_PENDING;
+ CPU_BASED_INTR_WINDOW_EXITING;
while (vmx->emulation_required && count-- != 0) {
if (intr_window_requested && vmx_interrupt_allowed(vcpu))
@@ -5187,71 +5192,67 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (kvm_test_request(KVM_REQ_EVENT, vcpu))
return 1;
- err = kvm_emulate_instruction(vcpu, 0);
-
- if (err == EMULATE_USER_EXIT) {
- ++vcpu->stat.mmio_exits;
- ret = 0;
- goto out;
- }
-
- if (err != EMULATE_DONE)
- goto emulation_error;
+ if (!kvm_emulate_instruction(vcpu, 0))
+ return 0;
if (vmx->emulation_required && !vmx->rmode.vm86_active &&
- vcpu->arch.exception.pending)
- goto emulation_error;
+ vcpu->arch.exception.pending) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
+ }
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
- ret = kvm_vcpu_halt(vcpu);
- goto out;
+ return kvm_vcpu_halt(vcpu);
}
+ /*
+ * Note, return 1 and not 0, vcpu_run() is responsible for
+ * morphing the pending signal into the proper return code.
+ */
if (signal_pending(current))
- goto out;
+ return 1;
+
if (need_resched())
schedule();
}
-out:
- return ret;
-
-emulation_error:
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- return 0;
+ return 1;
}
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- int old = vmx->ple_window;
+ unsigned int old = vmx->ple_window;
vmx->ple_window = __grow_ple_window(old, ple_window,
ple_window_grow,
ple_window_max);
- if (vmx->ple_window != old)
+ if (vmx->ple_window != old) {
vmx->ple_window_dirty = true;
-
- trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
}
static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- int old = vmx->ple_window;
+ unsigned int old = vmx->ple_window;
vmx->ple_window = __shrink_ple_window(old, ple_window,
ple_window_shrink,
ple_window);
- if (vmx->ple_window != old)
+ if (vmx->ple_window != old) {
vmx->ple_window_dirty = true;
-
- trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
}
/*
@@ -5503,11 +5504,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_IO_INSTRUCTION] = handle_io,
[EXIT_REASON_CR_ACCESS] = handle_cr,
[EXIT_REASON_DR_ACCESS] = handle_dr,
- [EXIT_REASON_CPUID] = handle_cpuid,
- [EXIT_REASON_MSR_READ] = handle_rdmsr,
- [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
- [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
- [EXIT_REASON_HLT] = handle_halt,
+ [EXIT_REASON_CPUID] = kvm_emulate_cpuid,
+ [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
+ [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
+ [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
+ [EXIT_REASON_HLT] = kvm_emulate_halt,
[EXIT_REASON_INVD] = handle_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_RDPMC] = handle_rdpmc,
@@ -5541,8 +5542,6 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_INVVPID] = handle_vmx_instruction,
[EXIT_REASON_RDRAND] = handle_invalid_op,
[EXIT_REASON_RDSEED] = handle_invalid_op,
- [EXIT_REASON_XSAVES] = handle_xsaves,
- [EXIT_REASON_XRSTORS] = handle_xrstors,
[EXIT_REASON_PML_FULL] = handle_pml_full,
[EXIT_REASON_INVPCID] = handle_invpcid,
[EXIT_REASON_VMFUNC] = handle_vmx_instruction,
@@ -5795,7 +5794,8 @@ void dump_vmcs(void)
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
*/
-static int vmx_handle_exit(struct kvm_vcpu *vcpu)
+static int vmx_handle_exit(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion exit_fastpath)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 exit_reason = vmx->exit_reason;
@@ -5881,15 +5881,44 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
}
}
- if (exit_reason < kvm_vmx_max_exit_handlers
- && kvm_vmx_exit_handlers[exit_reason])
- return kvm_vmx_exit_handlers[exit_reason](vcpu);
- else {
- vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
- exit_reason);
- kvm_queue_exception(vcpu, UD_VECTOR);
+ if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
+ kvm_skip_emulated_instruction(vcpu);
return 1;
}
+
+ if (exit_reason >= kvm_vmx_max_exit_handlers)
+ goto unexpected_vmexit;
+#ifdef CONFIG_RETPOLINE
+ if (exit_reason == EXIT_REASON_MSR_WRITE)
+ return kvm_emulate_wrmsr(vcpu);
+ else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
+ return handle_preemption_timer(vcpu);
+ else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW)
+ return handle_interrupt_window(vcpu);
+ else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+ return handle_external_interrupt(vcpu);
+ else if (exit_reason == EXIT_REASON_HLT)
+ return kvm_emulate_halt(vcpu);
+ else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
+ return handle_ept_misconfig(vcpu);
+#endif
+
+ exit_reason = array_index_nospec(exit_reason,
+ kvm_vmx_max_exit_handlers);
+ if (!kvm_vmx_exit_handlers[exit_reason])
+ goto unexpected_vmexit;
+
+ return kvm_vmx_exit_handlers[exit_reason](vcpu);
+
+unexpected_vmexit:
+ vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+ dump_vmcs();
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+ vcpu->run->internal.ndata = 1;
+ vcpu->run->internal.data[0] = exit_reason;
+ return 0;
}
/*
@@ -5965,17 +5994,17 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ int tpr_threshold;
if (is_guest_mode(vcpu) &&
nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
return;
- if (irr == -1 || tpr < irr) {
- vmcs_write32(TPR_THRESHOLD, 0);
- return;
- }
-
- vmcs_write32(TPR_THRESHOLD, irr);
+ tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
+ if (is_guest_mode(vcpu))
+ to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
+ else
+ vmcs_write32(TPR_THRESHOLD, tpr_threshold);
}
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
@@ -6089,7 +6118,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
if (pi_test_on(&vmx->pi_desc)) {
pi_clear_on(&vmx->pi_desc);
/*
- * IOMMU can write to PIR.ON, so the barrier matters even on UP.
+ * IOMMU can write to PID.ON, so the barrier matters even on UP.
* But on x86 this is just a compiler barrier anyway.
*/
smp_mb__after_atomic();
@@ -6119,7 +6148,10 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
- return pi_test_on(vcpu_to_pi_desc(vcpu));
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ return pi_test_on(pi_desc) ||
+ (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
}
static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -6207,7 +6239,8 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
}
STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
-static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion *exit_fastpath)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6215,6 +6248,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
handle_external_interrupt_irqoff(vcpu);
else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
handle_exception_nmi_irqoff(vmx);
+ else if (!is_guest_mode(vcpu) &&
+ vmx->exit_reason == EXIT_REASON_MSR_WRITE)
+ *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
}
static bool vmx_has_emulated_msr(int index)
@@ -6373,6 +6409,23 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host, false);
}
+static void atomic_switch_umwait_control_msr(struct vcpu_vmx *vmx)
+{
+ u32 host_umwait_control;
+
+ if (!vmx_has_waitpkg(vmx))
+ return;
+
+ host_umwait_control = get_umwait_control_msr();
+
+ if (vmx->msr_ia32_umwait_control != host_umwait_control)
+ add_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL,
+ vmx->msr_ia32_umwait_control,
+ host_umwait_control, false);
+ else
+ clear_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL);
+}
+
static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6432,9 +6485,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vmx->nested.need_vmcs12_to_shadow_sync)
nested_sync_vmcs12_to_shadow(vcpu);
- if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
- if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
cr3 = __get_current_cr3_fast();
@@ -6457,7 +6510,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
- kvm_load_guest_xcr0(vcpu);
+ kvm_load_guest_xsave_state(vcpu);
if (static_cpu_has(X86_FEATURE_PKU) &&
kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
@@ -6467,6 +6520,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
pt_guest_enter(vmx);
atomic_switch_perf_msrs(vmx);
+ atomic_switch_umwait_control_msr(vmx);
if (enable_preemption_timer)
vmx_update_hv_timer(vcpu);
@@ -6522,6 +6576,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
current_evmcs->hv_clean_fields |=
HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+ if (static_branch_unlikely(&enable_evmcs))
+ current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index;
+
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
if (vmx->host_debugctlmsr)
update_debugctlmsr(vmx->host_debugctlmsr);
@@ -6560,7 +6617,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
__write_pkru(vmx->host_pkru);
}
- kvm_put_guest_xcr0(vcpu);
+ kvm_load_host_xsave_state(vcpu);
vmx->nested.nested_run_pending = 0;
vmx->idt_vectoring_info = 0;
@@ -6589,6 +6646,7 @@ static struct kvm *vmx_vm_alloc(void)
static void vmx_vm_free(struct kvm *kvm)
{
+ kfree(kvm->arch.hyperv.hv_pa_pg);
vfree(to_kvm_vmx(kvm));
}
@@ -6601,70 +6659,66 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
free_vpid(vmx->vpid);
nested_vmx_free_vcpu(vcpu);
free_loaded_vmcs(vmx->loaded_vmcs);
- kfree(vmx->guest_msrs);
- kvm_vcpu_uninit(vcpu);
- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu);
- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
- kmem_cache_free(kvm_vcpu_cache, vmx);
}
-static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
+static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
{
- int err;
struct vcpu_vmx *vmx;
unsigned long *msr_bitmap;
- int cpu;
-
- vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
- if (!vmx)
- return ERR_PTR(-ENOMEM);
+ int i, cpu, err;
- vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
- GFP_KERNEL_ACCOUNT);
- if (!vmx->vcpu.arch.user_fpu) {
- printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n");
- err = -ENOMEM;
- goto free_partial_vcpu;
- }
+ BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
+ vmx = to_vmx(vcpu);
- vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
- GFP_KERNEL_ACCOUNT);
- if (!vmx->vcpu.arch.guest_fpu) {
- printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
- err = -ENOMEM;
- goto free_user_fpu;
- }
+ err = -ENOMEM;
vmx->vpid = allocate_vpid();
- err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
- if (err)
- goto free_vcpu;
-
- err = -ENOMEM;
-
/*
* If PML is turned on, failure on enabling PML just results in failure
* of creating the vcpu, therefore we can simplify PML logic (by
* avoiding dealing with cases, such as enabling PML partially on vcpus
- * for the guest, etc.
+ * for the guest), etc.
*/
if (enable_pml) {
vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!vmx->pml_pg)
- goto uninit_vcpu;
+ goto free_vpid;
}
- vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
- BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
- > PAGE_SIZE);
+ BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS);
- if (!vmx->guest_msrs)
- goto free_pml;
+ for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
+ u32 index = vmx_msr_index[i];
+ u32 data_low, data_high;
+ int j = vmx->nmsrs;
+
+ if (rdmsr_safe(index, &data_low, &data_high) < 0)
+ continue;
+ if (wrmsr_safe(index, data_low, data_high) < 0)
+ continue;
+
+ vmx->guest_msrs[j].index = i;
+ vmx->guest_msrs[j].data = 0;
+ switch (index) {
+ case MSR_IA32_TSX_CTRL:
+ /*
+ * No need to pass TSX_CTRL_CPUID_CLEAR through, so
+ * let's avoid changing CPUID bits under the host
+ * kernel's feet.
+ */
+ vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+ break;
+ default:
+ vmx->guest_msrs[j].mask = -1ull;
+ break;
+ }
+ ++vmx->nmsrs;
+ }
err = alloc_loaded_vmcs(&vmx->vmcs01);
if (err < 0)
- goto free_msrs;
+ goto free_pml;
msr_bitmap = vmx->vmcs01.msr_bitmap;
vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R);
@@ -6674,7 +6728,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
- if (kvm_cstate_in_guest(kvm)) {
+ if (kvm_cstate_in_guest(vcpu->kvm)) {
vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R);
vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
@@ -6684,19 +6738,19 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
vmx->loaded_vmcs = &vmx->vmcs01;
cpu = get_cpu();
- vmx_vcpu_load(&vmx->vcpu, cpu);
- vmx->vcpu.cpu = cpu;
- vmx_vcpu_setup(vmx);
- vmx_vcpu_put(&vmx->vcpu);
+ vmx_vcpu_load(vcpu, cpu);
+ vcpu->cpu = cpu;
+ init_vmcs(vmx);
+ vmx_vcpu_put(vcpu);
put_cpu();
- if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
- err = alloc_apic_access_page(kvm);
+ if (cpu_need_virtualize_apic_accesses(vcpu)) {
+ err = alloc_apic_access_page(vcpu->kvm);
if (err)
goto free_vmcs;
}
if (enable_ept && !enable_unrestricted_guest) {
- err = init_rmode_identity_map(kvm);
+ err = init_rmode_identity_map(vcpu->kvm);
if (err)
goto free_vmcs;
}
@@ -6704,14 +6758,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (nested)
nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
vmx_capability.ept,
- kvm_vcpu_apicv_active(&vmx->vcpu));
+ kvm_vcpu_apicv_active(vcpu));
else
memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
- vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
+ vcpu->arch.microcode_version = 0x100000000ULL;
+ vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
/*
* Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
@@ -6722,24 +6777,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
vmx->ept_pointer = INVALID_PAGE;
- return &vmx->vcpu;
+ return 0;
free_vmcs:
free_loaded_vmcs(vmx->loaded_vmcs);
-free_msrs:
- kfree(vmx->guest_msrs);
free_pml:
vmx_destroy_pml_buffer(vmx);
-uninit_vcpu:
- kvm_vcpu_uninit(&vmx->vcpu);
-free_vcpu:
+free_vpid:
free_vpid(vmx->vpid);
- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
-free_user_fpu:
- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu);
-free_partial_vcpu:
- kmem_cache_free(kvm_vcpu_cache, vmx);
- return ERR_PTR(err);
+ return err;
}
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
@@ -6775,6 +6821,7 @@ static int vmx_vm_init(struct kvm *kvm)
break;
}
}
+ kvm_apicv_init(kvm, enable_apicv);
return 0;
}
@@ -6783,6 +6830,12 @@ static int __init vmx_check_processor_compat(void)
struct vmcs_config vmcs_conf;
struct vmx_capability vmx_cap;
+ if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !this_cpu_has(X86_FEATURE_VMX)) {
+ pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id());
+ return -EIO;
+ }
+
if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
return -EIO;
if (nested)
@@ -6885,27 +6938,28 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
} while (0)
entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
- cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME));
- cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME));
- cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC));
- cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE));
- cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE));
- cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE));
- cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE));
- cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE));
- cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR));
- cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
- cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX));
- cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX));
- cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID));
- cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE));
+ cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME));
+ cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME));
+ cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC));
+ cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE));
+ cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE));
+ cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE));
+ cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE));
+ cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE));
+ cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR));
+ cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
+ cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX));
+ cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX));
+ cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID));
+ cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE));
entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
- cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE));
- cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP));
- cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP));
- cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU));
- cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP));
+ cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE));
+ cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP));
+ cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP));
+ cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU));
+ cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
+ cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
#undef cr4_fixed1_update
}
@@ -7000,6 +7054,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
+ vcpu->arch.xsaves_enabled = false;
+
if (cpu_has_secondary_exec_ctrls()) {
vmx_compute_secondary_exec_control(vmx);
vmcs_set_secondary_exec_control(vmx);
@@ -7007,10 +7064,12 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
if (nested_vmx_allowed(vcpu))
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
- FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
else
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
- ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
if (nested_vmx_allowed(vcpu)) {
nested_vmx_cr_fixed1_bits_update(vcpu);
@@ -7020,12 +7079,21 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
update_intel_pt_cfg(vcpu);
+
+ if (boot_cpu_has(X86_FEATURE_RTM)) {
+ struct shared_msr_entry *msr;
+ msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL);
+ if (msr) {
+ bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
+ vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
+ }
+ }
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
if (func == 1 && nested)
- entry->ecx |= bit(X86_FEATURE_VMX);
+ entry->ecx |= feature_bit(VMX);
}
static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
@@ -7369,10 +7437,14 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
* irqbalance to make the interrupts single-CPU.
*
* We will support full lowest-priority interrupt later.
+ *
+ * In addition, we can only inject generic interrupts using
+ * the PI mechanism, refuse to route others through it.
*/
kvm_set_msi_irq(kvm, e, &irq);
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+ !kvm_irq_is_postable(&irq)) {
/*
* Make sure the IRTE is in remapped mode if
* we don't handle it in posted mode.
@@ -7416,10 +7488,10 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
- FEATURE_CONTROL_LMCE;
+ FEAT_CTL_LMCE_ENABLED;
else
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
- ~FEATURE_CONTROL_LMCE;
+ ~FEAT_CTL_LMCE_ENABLED;
}
static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
@@ -7474,6 +7546,11 @@ static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
return false;
}
+static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.vmxon;
+}
+
static __init int hardware_setup(void)
{
unsigned long host_bndcfgs;
@@ -7499,9 +7576,6 @@ static __init int hardware_setup(void)
WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
}
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- rdmsrl(MSR_IA32_XSS, host_xss);
-
if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
enable_vpid = 0;
@@ -7649,6 +7723,14 @@ static __exit void hardware_unsetup(void)
free_kvm_area();
}
+static bool vmx_check_apicv_inhibit_reasons(ulong bit)
+{
+ ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+ BIT(APICV_INHIBIT_REASON_HYPERV);
+
+ return supported & BIT(bit);
+}
+
static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -7682,7 +7764,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.get_cpl = vmx_get_cpl,
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
- .decache_cr3 = vmx_decache_cr3,
.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
.set_cr0 = vmx_set_cr0,
.set_cr3 = vmx_set_cr3,
@@ -7722,10 +7803,10 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .get_enable_apicv = vmx_get_enable_apicv,
.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = vmx_load_eoi_exitmap,
.apicv_post_state_restore = vmx_apicv_post_state_restore,
+ .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
@@ -7762,6 +7843,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.xsaves_supported = vmx_xsaves_supported,
.umip_emulated = vmx_umip_emulated,
.pt_supported = vmx_pt_supported,
+ .pku_supported = vmx_pku_supported,
.request_immediate_exit = vmx_request_immediate_exit,
@@ -7797,7 +7879,9 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.set_nested_state = NULL,
.get_vmcs12_pages = NULL,
.nested_enable_evmcs = NULL,
+ .nested_get_evmcs_version = NULL,
.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
+ .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
};
static void vmx_cleanup_l1d_flush(void)
@@ -7834,6 +7918,7 @@ static void vmx_exit(void)
if (!vp_ap)
continue;
+ vp_ap->nested_control.features.directhypercall = 0;
vp_ap->current_nested_vmcs = 0;
vp_ap->enlighten_vmentry = 0;
}
@@ -7873,6 +7958,11 @@ static int __init vmx_init(void)
pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
static_branch_enable(&enable_evmcs);
}
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
+ vmx_x86_ops.enable_direct_tlbflush
+ = hv_enable_direct_tlbflush;
+
} else {
enlightened_vmcs = false;
}
@@ -7890,12 +7980,10 @@ static int __init vmx_init(void)
* contain 'auto' which will be turned into the default 'cond'
* mitigation mode.
*/
- if (boot_cpu_has(X86_BUG_L1TF)) {
- r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
- if (r) {
- vmx_exit();
- return r;
- }
+ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+ if (r) {
+ vmx_exit();
+ return r;
}
#ifdef CONFIG_KEXEC_CORE
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 82d0bc3a4d52..7f42cf3dcd70 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -14,17 +14,25 @@
extern const u32 vmx_msr_index[];
extern u64 host_efer;
+extern u32 get_umwait_control_msr(void);
+
#define MSR_TYPE_R 1
#define MSR_TYPE_W 2
#define MSR_TYPE_RW 3
#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
-#define NR_AUTOLOAD_MSRS 8
+#ifdef CONFIG_X86_64
+#define NR_SHARED_MSRS 7
+#else
+#define NR_SHARED_MSRS 4
+#endif
+
+#define NR_LOADSTORE_MSRS 8
struct vmx_msrs {
unsigned int nr;
- struct vmx_msr_entry val[NR_AUTOLOAD_MSRS];
+ struct vmx_msr_entry val[NR_LOADSTORE_MSRS];
};
struct shared_msr_entry {
@@ -165,6 +173,9 @@ struct nested_vmx {
u64 vmcs01_debugctl;
u64 vmcs01_guest_bndcfgs;
+ /* to migrate it to L1 if L2 writes to L1's CR8 directly */
+ int l1_tpr_threshold;
+
u16 vpid02;
u16 last_vpid;
@@ -201,7 +212,7 @@ struct vcpu_vmx {
u32 idt_vectoring_info;
ulong rflags;
- struct shared_msr_entry *guest_msrs;
+ struct shared_msr_entry guest_msrs[NR_SHARED_MSRS];
int nmsrs;
int save_nmsrs;
bool guest_msrs_ready;
@@ -211,6 +222,7 @@ struct vcpu_vmx {
#endif
u64 spec_ctrl;
+ u32 msr_ia32_umwait_control;
u32 secondary_exec_control;
@@ -227,6 +239,10 @@ struct vcpu_vmx {
struct vmx_msrs host;
} msr_autoload;
+ struct msr_autostore {
+ struct vmx_msrs guest;
+ } msr_autostore;
+
struct {
int vm86_active;
ulong save_rflags;
@@ -253,7 +269,7 @@ struct vcpu_vmx {
struct nested_vmx nested;
/* Dynamic PLE window. */
- int ple_window;
+ unsigned int ple_window;
bool ple_window_dirty;
bool req_immediate_exit;
@@ -273,7 +289,7 @@ struct vcpu_vmx {
/*
* Only bits masked by msr_ia32_feature_control_valid_bits can be set in
- * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
+ * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
* in msr_ia32_feature_control_valid_bits.
*/
u64 msr_ia32_feature_control;
@@ -331,6 +347,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr);
void pt_update_intercept_for_msr(struct vcpu_vmx *vmx);
void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
+int vmx_find_msr_index(struct vmx_msrs *m, u32 msr);
#define POSTED_INTR_ON 0
#define POSTED_INTR_SN 1
@@ -352,6 +369,11 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
+static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
+{
+ return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
+}
+
static inline void pi_set_sn(struct pi_desc *pi_desc)
{
set_bit(POSTED_INTR_SN,
@@ -370,6 +392,12 @@ static inline void pi_clear_on(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
static inline int pi_test_on(struct pi_desc *pi_desc)
{
return test_bit(POSTED_INTR_ON,
@@ -497,6 +525,12 @@ static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx)
vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
}
+static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
+{
+ return vmx->secondary_exec_control &
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+}
+
void dump_vmcs(void);
#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 93b0bd45ac73..fb5d64ebc35d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -26,6 +26,7 @@
#include "cpuid.h"
#include "pmu.h"
#include "hyperv.h"
+#include "lapic.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -68,6 +69,7 @@
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
#include <asm/intel_pt.h>
+#include <asm/emulate_prefix.h>
#include <clocksource/hyperv_timer.h>
#define CREATE_TRACE_POINTS
@@ -92,8 +94,10 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif
-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
+
+#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
+#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
@@ -176,6 +180,8 @@ struct kvm_shared_msrs {
static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
static struct kvm_shared_msrs __percpu *shared_msrs;
+static u64 __read_mostly host_xss;
+
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "pf_fixed", VCPU_STAT(pf_fixed) },
{ "pf_guest", VCPU_STAT(pf_guest) },
@@ -212,7 +218,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
{ "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
- { "largepages", VM_STAT(lpages) },
+ { "largepages", VM_STAT(lpages, .mode = 0444) },
+ { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
{ "max_mmu_page_hash_collisions",
VM_STAT(max_mmu_page_hash_collisions) },
{ NULL }
@@ -259,23 +266,6 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
}
}
-static void shared_msr_update(unsigned slot, u32 msr)
-{
- u64 value;
- unsigned int cpu = smp_processor_id();
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
-
- /* only read, and nobody should modify it at this time,
- * so don't need lock */
- if (slot >= shared_msrs_global.nr) {
- printk(KERN_ERR "kvm: invalid MSR slot!");
- return;
- }
- rdmsrl_safe(msr, &value);
- smsr->values[slot].host = value;
- smsr->values[slot].curr = value;
-}
-
void kvm_define_shared_msr(unsigned slot, u32 msr)
{
BUG_ON(slot >= KVM_NR_SHARED_MSRS);
@@ -287,10 +277,16 @@ EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
static void kvm_shared_msr_cpu_online(void)
{
- unsigned i;
+ unsigned int cpu = smp_processor_id();
+ struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ u64 value;
+ int i;
- for (i = 0; i < shared_msrs_global.nr; ++i)
- shared_msr_update(i, shared_msrs_global.msrs[i]);
+ for (i = 0; i < shared_msrs_global.nr; ++i) {
+ rdmsrl_safe(shared_msrs_global.msrs[i], &value);
+ smsr->values[i].host = value;
+ smsr->values[i].curr = value;
+ }
}
int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
@@ -299,13 +295,14 @@ int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
int err;
- if (((value ^ smsr->values[slot].curr) & mask) == 0)
+ value = (value & mask) | (smsr->values[slot].host & ~mask);
+ if (value == smsr->values[slot].curr)
return 0;
- smsr->values[slot].curr = value;
err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
if (err)
return 1;
+ smsr->values[slot].curr = value;
if (!smsr->registered) {
smsr->urn.on_user_return = kvm_on_user_return;
user_return_notifier_register(&smsr->urn);
@@ -360,7 +357,7 @@ EXPORT_SYMBOL_GPL(kvm_set_apic_base);
asmlinkage __visible void kvm_spurious_fault(void)
{
/* Fault while not rebooting. We want the trace. */
- BUG();
+ BUG_ON(!kvm_rebooting);
}
EXPORT_SYMBOL_GPL(kvm_spurious_fault);
@@ -441,6 +438,14 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
* for #DB exceptions under VMX.
*/
vcpu->arch.dr6 ^= payload & DR6_RTM;
+
+ /*
+ * The #DB payload is defined as compatible with the 'pending
+ * debug exceptions' field under VMX, not DR6. While bit 12 is
+ * defined in the 'pending debug exceptions' field (enabled
+ * breakpoint), it is reserved and must be zero in DR6.
+ */
+ vcpu->arch.dr6 &= ~BIT(12);
break;
case PF_VECTOR:
vcpu->arch.cr2 = payload;
@@ -493,19 +498,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
vcpu->arch.exception.error_code = error_code;
vcpu->arch.exception.has_payload = has_payload;
vcpu->arch.exception.payload = payload;
- /*
- * In guest mode, payload delivery should be deferred,
- * so that the L1 hypervisor can intercept #PF before
- * CR2 is modified (or intercept #DB before DR6 is
- * modified under nVMX). However, for ABI
- * compatibility with KVM_GET_VCPU_EVENTS and
- * KVM_SET_VCPU_EVENTS, we can't delay payload
- * delivery unless userspace has enabled this
- * functionality via the per-VM capability,
- * KVM_CAP_EXCEPTION_PAYLOAD.
- */
- if (!vcpu->kvm->arch.exception_payload_enabled ||
- !is_guest_mode(vcpu))
+ if (!is_guest_mode(vcpu))
kvm_deliver_exception_payload(vcpu);
return;
}
@@ -674,8 +667,14 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
data, offset, len, access);
}
+static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
+{
+ return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
+ rsvd_bits(1, 2);
+}
+
/*
- * Load the pae pdptrs. Return true is they are all valid.
+ * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise.
*/
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
{
@@ -694,8 +693,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
if ((pdpte[i] & PT_PRESENT_MASK) &&
- (pdpte[i] &
- vcpu->arch.mmu->guest_rsvd_check.rsvd_bits_mask[0][2])) {
+ (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
ret = 0;
goto out;
}
@@ -703,10 +701,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
ret = 1;
memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail);
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_dirty);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+
out:
return ret;
@@ -716,7 +712,6 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
- bool changed = true;
int offset;
gfn_t gfn;
int r;
@@ -724,8 +719,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu)
if (!is_pae_paging(vcpu))
return false;
- if (!test_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail))
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
return true;
gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
@@ -733,11 +727,9 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu)
r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
PFERR_USER_MASK | PFERR_WRITE_MASK);
if (r < 0)
- goto out;
- changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
-out:
+ return true;
- return changed;
+ return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
}
EXPORT_SYMBOL_GPL(pdptrs_changed);
@@ -806,27 +798,34 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
-void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
{
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
- !vcpu->guest_xcr0_loaded) {
- /* kvm_set_xcr() also depends on this */
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
- vcpu->guest_xcr0_loaded = 1;
+
+ if (vcpu->arch.xsaves_enabled &&
+ vcpu->arch.ia32_xss != host_xss)
+ wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
}
}
-EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
+EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
-void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
- if (vcpu->guest_xcr0_loaded) {
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
- vcpu->guest_xcr0_loaded = 0;
+
+ if (vcpu->arch.xsaves_enabled &&
+ vcpu->arch.ia32_xss != host_xss)
+ wrmsrl(MSR_IA32_XSS, host_xss);
}
+
}
-EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0);
+EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
@@ -879,34 +878,58 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);
-int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+#define __cr4_reserved_bits(__cpu_has, __c) \
+({ \
+ u64 __reserved_bits = CR4_RESERVED_BITS; \
+ \
+ if (!__cpu_has(__c, X86_FEATURE_XSAVE)) \
+ __reserved_bits |= X86_CR4_OSXSAVE; \
+ if (!__cpu_has(__c, X86_FEATURE_SMEP)) \
+ __reserved_bits |= X86_CR4_SMEP; \
+ if (!__cpu_has(__c, X86_FEATURE_SMAP)) \
+ __reserved_bits |= X86_CR4_SMAP; \
+ if (!__cpu_has(__c, X86_FEATURE_FSGSBASE)) \
+ __reserved_bits |= X86_CR4_FSGSBASE; \
+ if (!__cpu_has(__c, X86_FEATURE_PKU)) \
+ __reserved_bits |= X86_CR4_PKE; \
+ if (!__cpu_has(__c, X86_FEATURE_LA57)) \
+ __reserved_bits |= X86_CR4_LA57; \
+ if (!__cpu_has(__c, X86_FEATURE_UMIP)) \
+ __reserved_bits |= X86_CR4_UMIP; \
+ __reserved_bits; \
+})
+
+static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
{
- unsigned long old_cr4 = kvm_read_cr4(vcpu);
- unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
- X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
+ u64 reserved_bits = __cr4_reserved_bits(cpu_has, c);
- if (cr4 & CR4_RESERVED_BITS)
- return 1;
+ if (cpuid_ecx(0x7) & feature_bit(LA57))
+ reserved_bits &= ~X86_CR4_LA57;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
- return 1;
+ if (kvm_x86_ops->umip_emulated())
+ reserved_bits &= ~X86_CR4_UMIP;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
- return 1;
+ return reserved_bits;
+}
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
- return 1;
+static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ if (cr4 & cr4_reserved_bits)
+ return -EINVAL;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
- return 1;
+ if (cr4 & __cr4_reserved_bits(guest_cpuid_has, vcpu))
+ return -EINVAL;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
- return 1;
+ return 0;
+}
- if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
- return 1;
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
+ if (kvm_valid_cr4(vcpu, cr4))
return 1;
if (is_long_mode(vcpu)) {
@@ -970,7 +993,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
vcpu->arch.cr3 = cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
return 0;
}
@@ -1039,9 +1062,11 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
switch (dr) {
case 0 ... 3:
- vcpu->arch.db[dr] = val;
+ vcpu->arch.db[array_index_nospec(dr, size)] = val;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
vcpu->arch.eff_db[dr] = val;
break;
@@ -1056,7 +1081,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
case 5:
/* fall through */
default: /* 7 */
- if (val & 0xffffffff00000000ULL)
+ if (!kvm_dr7_valid(val))
return -1; /* #GP */
vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
kvm_update_dr7(vcpu);
@@ -1078,9 +1103,11 @@ EXPORT_SYMBOL_GPL(kvm_set_dr);
int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
switch (dr) {
case 0 ... 3:
- *val = vcpu->arch.db[dr];
+ *val = vcpu->arch.db[array_index_nospec(dr, size)];
break;
case 4:
/* fall through */
@@ -1119,20 +1146,22 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
* and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
*
- * This list is modified at module load time to reflect the
+ * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features)
+ * extract the supported MSRs from the related const lists.
+ * msrs_to_save is selected from the msrs_to_save_all to reflect the
* capabilities of the host cpu. This capabilities test skips MSRs that are
- * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
+ * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
* may depend on host virtualization features rather than host cpu features.
*/
-static u32 msrs_to_save[] = {
+static const u32 msrs_to_save_all[] = {
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
- MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+ MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
MSR_IA32_SPEC_CTRL,
MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
@@ -1140,11 +1169,36 @@ static u32 msrs_to_save[] = {
MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
+ MSR_IA32_UMWAIT_CONTROL,
+
+ MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
+ MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
+ MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
+ MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
+ MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
+ MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
+ MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
+ MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
+ MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
+ MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
+ MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
+ MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
+ MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
};
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
static unsigned num_msrs_to_save;
-static u32 emulated_msrs[] = {
+static const u32 emulated_msrs_all[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
@@ -1177,13 +1231,14 @@ static u32 emulated_msrs[] = {
MSR_MISC_FEATURES_ENABLES,
MSR_AMD64_VIRT_SPEC_CTRL,
MSR_IA32_POWER_CTL,
+ MSR_IA32_UCODE_REV,
/*
* The following list leaves out MSRs whose values are determined
* by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
* We always support the "true" VMX control MSRs, even if the host
* processor does not, so I am putting these registers here rather
- * than in msrs_to_save.
+ * than in msrs_to_save_all.
*/
MSR_IA32_VMX_BASIC,
MSR_IA32_VMX_TRUE_PINBASED_CTLS,
@@ -1202,13 +1257,14 @@ static u32 emulated_msrs[] = {
MSR_KVM_POLL_CONTROL,
};
+static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
static unsigned num_emulated_msrs;
/*
* List of msr numbers which are used to expose MSR-based features that
* can be used by a hypervisor to validate requested CPU features.
*/
-static u32 msr_based_features[] = {
+static const u32 msr_based_features_all[] = {
MSR_IA32_VMX_BASIC,
MSR_IA32_VMX_TRUE_PINBASED_CTLS,
MSR_IA32_VMX_PINBASED_CTLS,
@@ -1233,6 +1289,7 @@ static u32 msr_based_features[] = {
MSR_IA32_ARCH_CAPABILITIES,
};
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
static unsigned int num_msr_based_features;
static u64 kvm_get_arch_capabilities(void)
@@ -1243,6 +1300,14 @@ static u64 kvm_get_arch_capabilities(void)
rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
/*
+ * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
+ * the nested hypervisor runs with NX huge pages. If it is not,
+ * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other
+ * L1 guests, so it need not worry about its own (L2) guests.
+ */
+ data |= ARCH_CAP_PSCHANGE_MC_NO;
+
+ /*
* If we're doing cache flushes (either "always" or "cond")
* we will do one whenever the guest does a vmlaunch/vmresume.
* If an outer hypervisor is doing the cache flush for us
@@ -1254,6 +1319,24 @@ static u64 kvm_get_arch_capabilities(void)
if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
+ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+ data |= ARCH_CAP_RDCL_NO;
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ data |= ARCH_CAP_SSB_NO;
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
+ data |= ARCH_CAP_MDS_NO;
+
+ /*
+ * On TAA affected systems:
+ * - nothing to do if TSX is disabled on the host.
+ * - we emulate TSX_CTRL if present on the host.
+ * This lets the guest use VERW to clear CPU buffers.
+ */
+ if (!boot_cpu_has(X86_FEATURE_RTM))
+ data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR);
+ else if (!boot_cpu_has_bug(X86_BUG_TAA))
+ data |= ARCH_CAP_TAA_NO;
+
return data;
}
@@ -1351,19 +1434,23 @@ void kvm_enable_efer_bits(u64 mask)
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
/*
- * Writes msr value into into the appropriate "register".
+ * Write @data into the MSR specified by @index. Select MSR specific fault
+ * checks are bypassed if @host_initiated is %true.
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
+ bool host_initiated)
{
- switch (msr->index) {
+ struct msr_data msr;
+
+ switch (index) {
case MSR_FS_BASE:
case MSR_GS_BASE:
case MSR_KERNEL_GS_BASE:
case MSR_CSTAR:
case MSR_LSTAR:
- if (is_noncanonical_address(msr->data, vcpu))
+ if (is_noncanonical_address(data, vcpu))
return 1;
break;
case MSR_IA32_SYSENTER_EIP:
@@ -1380,54 +1467,158 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
* value, and that something deterministic happens if the guest
* invokes 64-bit SYSENTER.
*/
- msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
+ data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
}
- return kvm_x86_ops->set_msr(vcpu, msr);
+
+ msr.data = data;
+ msr.index = index;
+ msr.host_initiated = host_initiated;
+
+ return kvm_x86_ops->set_msr(vcpu, &msr);
}
-EXPORT_SYMBOL_GPL(kvm_set_msr);
/*
- * Adapt set_msr() to msr_io()'s calling convention
+ * Read the MSR specified by @index into @data. Select MSR specific fault
+ * checks are bypassed if @host_initiated is %true.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
*/
-static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated)
{
struct msr_data msr;
- int r;
+ int ret;
msr.index = index;
- msr.host_initiated = true;
- r = kvm_get_msr(vcpu, &msr);
- if (r)
- return r;
+ msr.host_initiated = host_initiated;
- *data = msr.data;
- return 0;
+ ret = kvm_x86_ops->get_msr(vcpu, &msr);
+ if (!ret)
+ *data = msr.data;
+ return ret;
}
-static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
{
- struct msr_data msr;
+ return __kvm_get_msr(vcpu, index, data, false);
+}
+EXPORT_SYMBOL_GPL(kvm_get_msr);
- msr.data = *data;
- msr.index = index;
- msr.host_initiated = true;
- return kvm_set_msr(vcpu, &msr);
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+ return __kvm_set_msr(vcpu, index, data, false);
+}
+EXPORT_SYMBOL_GPL(kvm_set_msr);
+
+int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
+{
+ u32 ecx = kvm_rcx_read(vcpu);
+ u64 data;
+
+ if (kvm_get_msr(vcpu, ecx, &data)) {
+ trace_kvm_msr_read_ex(ecx);
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ trace_kvm_msr_read(ecx, data);
+
+ kvm_rax_write(vcpu, data & -1u);
+ kvm_rdx_write(vcpu, (data >> 32) & -1u);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
+
+int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
+{
+ u32 ecx = kvm_rcx_read(vcpu);
+ u64 data = kvm_read_edx_eax(vcpu);
+
+ if (kvm_set_msr(vcpu, ecx, data)) {
+ trace_kvm_msr_write_ex(ecx, data);
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ trace_kvm_msr_write(ecx, data);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
+
+/*
+ * The fast path for frequent and performance sensitive wrmsr emulation,
+ * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces
+ * the latency of virtual IPI by avoiding the expensive bits of transitioning
+ * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the
+ * other cases which must be called after interrupts are enabled on the host.
+ */
+static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
+{
+ if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic) &&
+ ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
+ ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
+
+ kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
+ return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data);
+ }
+
+ return 1;
+}
+
+enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
+{
+ u32 msr = kvm_rcx_read(vcpu);
+ u64 data = kvm_read_edx_eax(vcpu);
+ int ret = 0;
+
+ switch (msr) {
+ case APIC_BASE_MSR + (APIC_ICR >> 4):
+ ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
+ break;
+ default:
+ return EXIT_FASTPATH_NONE;
+ }
+
+ if (!ret) {
+ trace_kvm_msr_write(msr, data);
+ return EXIT_FASTPATH_SKIP_EMUL_INS;
+ }
+
+ return EXIT_FASTPATH_NONE;
+}
+EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
+
+/*
+ * Adapt set_msr() to msr_io()'s calling convention
+ */
+static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ return __kvm_get_msr(vcpu, index, data, true);
+}
+
+static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ return __kvm_set_msr(vcpu, index, *data, true);
}
#ifdef CONFIG_X86_64
+struct pvclock_clock {
+ int vclock_mode;
+ u64 cycle_last;
+ u64 mask;
+ u32 mult;
+ u32 shift;
+ u64 base_cycles;
+ u64 offset;
+};
+
struct pvclock_gtod_data {
seqcount_t seq;
- struct { /* extract of a clocksource struct */
- int vclock_mode;
- u64 cycle_last;
- u64 mask;
- u32 mult;
- u32 shift;
- } clock;
+ struct pvclock_clock clock; /* extract of a clocksource struct */
+ struct pvclock_clock raw_clock; /* extract of a clocksource struct */
- u64 boot_ns;
- u64 nsec_base;
+ ktime_t offs_boot;
u64 wall_time_sec;
};
@@ -1436,9 +1627,6 @@ static struct pvclock_gtod_data pvclock_gtod_data;
static void update_pvclock_gtod(struct timekeeper *tk)
{
struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
- u64 boot_ns;
-
- boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
write_seqcount_begin(&vdata->seq);
@@ -1448,14 +1636,35 @@ static void update_pvclock_gtod(struct timekeeper *tk)
vdata->clock.mask = tk->tkr_mono.mask;
vdata->clock.mult = tk->tkr_mono.mult;
vdata->clock.shift = tk->tkr_mono.shift;
+ vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec;
+ vdata->clock.offset = tk->tkr_mono.base;
- vdata->boot_ns = boot_ns;
- vdata->nsec_base = tk->tkr_mono.xtime_nsec;
+ vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->archdata.vclock_mode;
+ vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last;
+ vdata->raw_clock.mask = tk->tkr_raw.mask;
+ vdata->raw_clock.mult = tk->tkr_raw.mult;
+ vdata->raw_clock.shift = tk->tkr_raw.shift;
+ vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec;
+ vdata->raw_clock.offset = tk->tkr_raw.base;
vdata->wall_time_sec = tk->xtime_sec;
+ vdata->offs_boot = tk->offs_boot;
+
write_seqcount_end(&vdata->seq);
}
+
+static s64 get_kvmclock_base_ns(void)
+{
+ /* Count up from boot time, but with the frequency of the raw clock. */
+ return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
+}
+#else
+static s64 get_kvmclock_base_ns(void)
+{
+ /* Master clock not used, so we can just use CLOCK_BOOTTIME. */
+ return ktime_get_boottime_ns();
+}
#endif
void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
@@ -1469,7 +1678,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
int version;
int r;
struct pvclock_wall_clock wc;
- struct timespec64 boot;
+ u64 wall_nsec;
if (!wall_clock)
return;
@@ -1489,17 +1698,12 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
/*
* The guest calculates current wall clock time by adding
* system time (updated by kvm_guest_time_update below) to the
- * wall clock specified here. guest system time equals host
- * system time for us, thus we must fill in host boot time here.
+ * wall clock specified here. We do the reverse here.
*/
- getboottime64(&boot);
+ wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
- if (kvm->arch.kvmclock_offset) {
- struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
- boot = timespec64_sub(boot, ts);
- }
- wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
- wc.nsec = boot.tv_nsec;
+ wc.nsec = do_div(wall_nsec, 1000000000);
+ wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
wc.version = version;
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
@@ -1747,7 +1951,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_compute_tsc_offset(vcpu, data);
- ns = ktime_get_boottime_ns();
+ ns = get_kvmclock_base_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
if (vcpu->arch.virtual_tsc_khz) {
@@ -1877,21 +2081,21 @@ static u64 read_tsc(void)
return last;
}
-static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
+static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
+ int *mode)
{
long v;
- struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
u64 tsc_pg_val;
- switch (gtod->clock.vclock_mode) {
+ switch (clock->vclock_mode) {
case VCLOCK_HVCLOCK:
tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
tsc_timestamp);
if (tsc_pg_val != U64_MAX) {
/* TSC page valid */
*mode = VCLOCK_HVCLOCK;
- v = (tsc_pg_val - gtod->clock.cycle_last) &
- gtod->clock.mask;
+ v = (tsc_pg_val - clock->cycle_last) &
+ clock->mask;
} else {
/* TSC page invalid */
*mode = VCLOCK_NONE;
@@ -1900,8 +2104,8 @@ static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
case VCLOCK_TSC:
*mode = VCLOCK_TSC;
*tsc_timestamp = read_tsc();
- v = (*tsc_timestamp - gtod->clock.cycle_last) &
- gtod->clock.mask;
+ v = (*tsc_timestamp - clock->cycle_last) &
+ clock->mask;
break;
default:
*mode = VCLOCK_NONE;
@@ -1910,10 +2114,10 @@ static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
if (*mode == VCLOCK_NONE)
*tsc_timestamp = v = 0;
- return v * gtod->clock.mult;
+ return v * clock->mult;
}
-static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
+static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
unsigned long seq;
@@ -1922,10 +2126,10 @@ static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
do {
seq = read_seqcount_begin(&gtod->seq);
- ns = gtod->nsec_base;
- ns += vgettsc(tsc_timestamp, &mode);
- ns >>= gtod->clock.shift;
- ns += gtod->boot_ns;
+ ns = gtod->raw_clock.base_cycles;
+ ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
+ ns >>= gtod->raw_clock.shift;
+ ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
*t = ns;
@@ -1942,8 +2146,8 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
do {
seq = read_seqcount_begin(&gtod->seq);
ts->tv_sec = gtod->wall_time_sec;
- ns = gtod->nsec_base;
- ns += vgettsc(tsc_timestamp, &mode);
+ ns = gtod->clock.base_cycles;
+ ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
ns >>= gtod->clock.shift;
} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
@@ -1960,7 +2164,7 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
return false;
- return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
+ return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
tsc_timestamp));
}
@@ -2085,7 +2289,7 @@ u64 get_kvmclock_ns(struct kvm *kvm)
spin_lock(&ka->pvclock_gtod_sync_lock);
if (!ka->use_master_clock) {
spin_unlock(&ka->pvclock_gtod_sync_lock);
- return ktime_get_boottime_ns() + ka->kvmclock_offset;
+ return get_kvmclock_base_ns() + ka->kvmclock_offset;
}
hv_clock.tsc_timestamp = ka->master_cycle_now;
@@ -2101,7 +2305,7 @@ u64 get_kvmclock_ns(struct kvm *kvm)
&hv_clock.tsc_to_system_mul);
ret = __pvclock_read_cycles(&hv_clock, rdtsc());
} else
- ret = ktime_get_boottime_ns() + ka->kvmclock_offset;
+ ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
put_cpu();
@@ -2200,7 +2404,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
}
if (!use_master_clock) {
host_tsc = rdtsc();
- kernel_ns = ktime_get_boottime_ns();
+ kernel_ns = get_kvmclock_base_ns();
}
tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
@@ -2240,6 +2444,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_guest_tsc = tsc_timestamp;
+ WARN_ON((s64)vcpu->hv_clock.system_time < 0);
/* If the host uses TSC clocksource, then it is stable */
pvclock_flags = 0;
@@ -2346,7 +2551,10 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
default:
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MCx_CTL(bank_num)) {
- u32 offset = msr - MSR_IA32_MC0_CTL;
+ u32 offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL,
+ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
/* only 0 or all 1s can be written to IA32_MCi_CTL
* some Linux kernels though clear bit 10 in bank 4 to
* workaround a BIOS/GART TBL issue on AMD K8s, ignore
@@ -2431,6 +2639,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
vcpu->arch.pv_time_enabled = false;
+ vcpu->arch.time = 0;
}
static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
@@ -2441,43 +2650,47 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
static void record_steal_time(struct kvm_vcpu *vcpu)
{
+ struct kvm_host_map map;
+ struct kvm_steal_time *st;
+
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
+ /* -EAGAIN is returned in atomic context so we can just return. */
+ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
+ &map, &vcpu->arch.st.cache, false))
return;
+ st = map.hva +
+ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
/*
* Doing a TLB flush here, on the guest's behalf, can avoid
* expensive IPIs.
*/
- if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
+ trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
+ st->preempted & KVM_VCPU_FLUSH_TLB);
+ if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
kvm_vcpu_flush_tlb(vcpu, false);
- if (vcpu->arch.st.steal.version & 1)
- vcpu->arch.st.steal.version += 1; /* first time write, random junk */
+ vcpu->arch.st.preempted = 0;
- vcpu->arch.st.steal.version += 1;
+ if (st->version & 1)
+ st->version += 1; /* first time write, random junk */
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+ st->version += 1;
smp_wmb();
- vcpu->arch.st.steal.steal += current->sched_info.run_delay -
+ st->steal += current->sched_info.run_delay -
vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
-
smp_wmb();
- vcpu->arch.st.steal.version += 1;
+ st->version += 1;
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
}
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -2580,6 +2793,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr_info);
break;
+ case MSR_IA32_XSS:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ return 1;
+ /*
+ * We do support PT if kvm_x86_ops->pt_supported(), but we do
+ * not support IA32_XSS[bit 8]. Guests will have to use
+ * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT
+ * MSRs.
+ */
+ if (data != 0)
+ return 1;
+ vcpu->arch.ia32_xss = data;
+ break;
case MSR_SMI_COUNT:
if (!msr_info->host_initiated)
return 1;
@@ -2594,8 +2821,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_KVM_SYSTEM_TIME: {
struct kvm_arch *ka = &vcpu->kvm->arch;
- kvmclock_reset(vcpu);
-
if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
@@ -2609,14 +2834,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
/* we verify if the enable bit is set... */
+ vcpu->arch.pv_time_enabled = false;
if (!(data & 1))
break;
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.pv_time, data & ~1ULL,
sizeof(struct pvclock_vcpu_time_info)))
- vcpu->arch.pv_time_enabled = false;
- else
vcpu->arch.pv_time_enabled = true;
break;
@@ -2633,11 +2857,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & KVM_STEAL_RESERVED_MASK)
return 1;
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
- data & KVM_STEAL_VALID_BITS,
- sizeof(struct kvm_steal_time)))
- return 1;
-
vcpu->arch.st.msr_val = data;
if (!(data & KVM_MSR_ENABLED))
@@ -2748,18 +2967,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
EXPORT_SYMBOL_GPL(kvm_set_msr_common);
-
-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
-{
- return kvm_x86_ops->get_msr(vcpu, msr);
-}
-EXPORT_SYMBOL_GPL(kvm_get_msr);
-
static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
{
u64 data;
@@ -2785,7 +2992,10 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
default:
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MCx_CTL(bank_num)) {
- u32 offset = msr - MSR_IA32_MC0_CTL;
+ u32 offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL,
+ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
data = vcpu->arch.mce_banks[offset];
break;
}
@@ -2922,6 +3132,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
msr_info->host_initiated);
+ case MSR_IA32_XSS:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ return 1;
+ msr_info->data = vcpu->arch.ia32_xss;
+ break;
case MSR_K7_CLK_CTL:
/*
* Provide expected ramp-up count for K7. All other
@@ -3106,7 +3322,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV_EVENTFD:
case KVM_CAP_HYPERV_TLBFLUSH:
case KVM_CAP_HYPERV_SEND_IPI:
- case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
case KVM_CAP_HYPERV_CPUID:
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
@@ -3183,6 +3398,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = kvm_x86_ops->get_nested_state ?
kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0;
break;
+ case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
+ r = kvm_x86_ops->enable_direct_tlbflush != NULL;
+ break;
+ case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
+ r = kvm_x86_ops->nested_enable_evmcs != NULL;
+ break;
default:
break;
}
@@ -3306,10 +3527,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_x86_ops->vcpu_load(vcpu, cpu);
- fpregs_assert_state_consistent();
- if (test_thread_flag(TIF_NEED_FPU_LOAD))
- switch_fpu_return();
-
/* Apply any externally detected TSC adjustments (due to suspend) */
if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
@@ -3349,15 +3566,25 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
+ struct kvm_host_map map;
+ struct kvm_steal_time *st;
+
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
+ if (vcpu->arch.st.preempted)
+ return;
- kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal.preempted,
- offsetof(struct kvm_steal_time, preempted),
- sizeof(vcpu->arch.st.steal.preempted));
+ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
+ &vcpu->arch.st.cache, true))
+ return;
+
+ st = map.hva +
+ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
+ st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+
+ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -3506,8 +3733,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
for (bank = 0; bank < bank_num; bank++)
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
- if (kvm_x86_ops->setup_mce)
- kvm_x86_ops->setup_mce(vcpu);
+ kvm_x86_ops->setup_mce(vcpu);
out:
return r;
}
@@ -3566,6 +3792,21 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
process_nmi(vcpu);
/*
+ * In guest mode, payload delivery should be deferred,
+ * so that the L1 hypervisor can intercept #PF before
+ * CR2 is modified (or intercept #DB before DR6 is
+ * modified under nVMX). Unless the per-VM capability,
+ * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
+ * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we
+ * opportunistically defer the exception payload, deliver it if the
+ * capability hasn't been requested before processing a
+ * KVM_GET_VCPU_EVENTS.
+ */
+ if (!vcpu->kvm->arch.exception_payload_enabled &&
+ vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
+ kvm_deliver_exception_payload(vcpu);
+
+ /*
* The API doesn't provide the instruction length for software
* exceptions, so don't report them. As long as the guest RIP
* isn't advanced, we should expect to encounter the exception
@@ -3695,12 +3936,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
- if (lapic_in_kernel(vcpu)) {
- if (events->smi.latched_init)
- set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
- else
- clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
- }
+ }
+
+ if (lapic_in_kernel(vcpu)) {
+ if (events->smi.latched_init)
+ set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ else
+ clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
}
@@ -3957,6 +4199,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
r = -EFAULT;
}
return r;
+ case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
+ if (!kvm_x86_ops->enable_direct_tlbflush)
+ return -ENOTTY;
+
+ return kvm_x86_ops->enable_direct_tlbflush(vcpu);
default:
return -EINVAL;
@@ -4286,6 +4533,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_SET_NESTED_STATE: {
struct kvm_nested_state __user *user_kvm_nested_state = argp;
struct kvm_nested_state kvm_state;
+ int idx;
r = -EINVAL;
if (!kvm_x86_ops->set_nested_state)
@@ -4309,7 +4557,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
&& !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
break;
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
}
case KVM_GET_SUPPORTED_HV_CPUID: {
@@ -4500,9 +4750,6 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
{
struct kvm_pit *pit = kvm->arch.vpit;
- if (!pit)
- return -ENXIO;
-
/* pit->pit_state.lock was overloaded to prevent userspace from getting
* an inconsistent state after running multiple KVM_REINJECT_CONTROL
* ioctls in parallel. Use a separate lock if that ioctl isn't rare.
@@ -4811,9 +5058,6 @@ set_identity_unlock:
if (!irqchip_kernel(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
- if (r)
- goto set_irqchip_out;
- r = 0;
set_irqchip_out:
kfree(chip);
break;
@@ -4872,6 +5116,9 @@ set_identity_unlock:
r = -EFAULT;
if (copy_from_user(&control, argp, sizeof(control)))
goto out;
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
r = kvm_vm_ioctl_reinject(kvm, &control);
break;
}
@@ -4986,18 +5233,28 @@ out:
static void kvm_init_msr_list(void)
{
+ struct x86_pmu_capability x86_pmu;
u32 dummy[2];
- unsigned i, j;
+ unsigned i;
+
+ BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
+ "Please update the fixed PMCs in msrs_to_saved_all[]");
+
+ perf_get_x86_pmu_capability(&x86_pmu);
+
+ num_msrs_to_save = 0;
+ num_emulated_msrs = 0;
+ num_msr_based_features = 0;
- for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
- if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
+ for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
+ if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
continue;
/*
* Even MSRs that are valid in the host may not be exposed
* to the guests in some cases.
*/
- switch (msrs_to_save[i]) {
+ switch (msrs_to_save_all[i]) {
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported())
continue;
@@ -5025,43 +5282,43 @@ static void kvm_init_msr_list(void)
break;
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
if (!kvm_x86_ops->pt_supported() ||
- msrs_to_save[i] - MSR_IA32_RTIT_ADDR0_A >=
+ msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
continue;
break;
+ case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
+ if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
+ min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
+ continue;
+ break;
+ case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
+ if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
+ min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
+ continue;
}
default:
break;
}
- if (j < i)
- msrs_to_save[j] = msrs_to_save[i];
- j++;
+ msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
}
- num_msrs_to_save = j;
- for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
- if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
+ for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
+ if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i]))
continue;
- if (j < i)
- emulated_msrs[j] = emulated_msrs[i];
- j++;
+ emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
}
- num_emulated_msrs = j;
- for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
struct kvm_msr_entry msr;
- msr.index = msr_based_features[i];
+ msr.index = msr_based_features_all[i];
if (kvm_get_msr_feature(&msr))
continue;
- if (j < i)
- msr_based_features[j] = msr_based_features[i];
- j++;
+ msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
}
- num_msr_based_features = j;
}
static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
@@ -5312,6 +5569,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
/* kvm_write_guest_virt_system can pull in tons of pages. */
vcpu->arch.l1tf_flush_l1d = true;
+ /*
+ * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+ * is returned, but our callers are not ready for that and they blindly
+ * call kvm_inject_page_fault. Ensure that they at least do not leak
+ * uninitialized kernel stack memory into cr2 and error code.
+ */
+ memset(exception, 0, sizeof(*exception));
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
PFERR_WRITE_MASK, exception);
}
@@ -5319,25 +5583,20 @@ EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
int handle_ud(struct kvm_vcpu *vcpu)
{
+ static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
int emul_type = EMULTYPE_TRAP_UD;
- enum emulation_result er;
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
if (force_emulation_prefix &&
kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
sig, sizeof(sig), &e) == 0 &&
- memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
+ memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
- emul_type = 0;
+ emul_type = EMULTYPE_TRAP_UD_FORCED;
}
- er = kvm_emulate_instruction(vcpu, emul_type);
- if (er == EMULATE_USER_EXIT)
- return 0;
- if (er != EMULATE_DONE)
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
+ return kvm_emulate_instruction(vcpu, emul_type);
}
EXPORT_SYMBOL_GPL(handle_ud);
@@ -5370,7 +5629,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
*/
if (vcpu_match_mmio_gva(vcpu, gva)
&& !permission_fault(vcpu, vcpu->arch.walk_mmu,
- vcpu->arch.access, 0, access)) {
+ vcpu->arch.mmio_access, 0, access)) {
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
(gva & (PAGE_SIZE - 1));
trace_vcpu_match_mmio(gva, *gpa, write, false);
@@ -5964,28 +6223,13 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 *pdata)
{
- struct msr_data msr;
- int r;
-
- msr.index = msr_index;
- msr.host_initiated = false;
- r = kvm_get_msr(emul_to_vcpu(ctxt), &msr);
- if (r)
- return r;
-
- *pdata = msr.data;
- return 0;
+ return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
}
static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 data)
{
- struct msr_data msr;
-
- msr.data = data;
- msr.index = msr_index;
- msr.host_initiated = false;
- return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
+ return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
}
static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
@@ -6005,7 +6249,7 @@ static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc)
{
- return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
+ return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc);
}
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -6032,6 +6276,21 @@ static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
}
+static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
+}
+
+static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
+}
+
+static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
+}
+
static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
{
return kvm_register_read(emul_to_vcpu(ctxt), reg);
@@ -6068,6 +6327,11 @@ static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
kvm_smm_changed(emul_to_vcpu(ctxt));
}
+static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
+{
+ return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
+}
+
static const struct x86_emulate_ops emulate_ops = {
.read_gpr = emulator_read_gpr,
.write_gpr = emulator_write_gpr,
@@ -6104,11 +6368,15 @@ static const struct x86_emulate_ops emulate_ops = {
.fix_hypercall = emulator_fix_hypercall,
.intercept = emulator_intercept,
.get_cpuid = emulator_get_cpuid,
+ .guest_has_long_mode = emulator_guest_has_long_mode,
+ .guest_has_movbe = emulator_guest_has_movbe,
+ .guest_has_fxsr = emulator_guest_has_fxsr,
.set_nmi_mask = emulator_set_nmi_mask,
.get_hflags = emulator_get_hflags,
.set_hflags = emulator_set_hflags,
.pre_leave_smm = emulator_pre_leave_smm,
.post_leave_smm = emulator_post_leave_smm,
+ .set_xcr = emulator_set_xcr,
};
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -6168,7 +6436,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
}
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
+void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
{
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
int ret;
@@ -6180,44 +6448,50 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
ctxt->_eip = ctxt->eip + inc_eip;
ret = emulate_int_real(ctxt, irq);
- if (ret != X86EMUL_CONTINUE)
- return EMULATE_FAIL;
-
- ctxt->eip = ctxt->_eip;
- kvm_rip_write(vcpu, ctxt->eip);
- kvm_set_rflags(vcpu, ctxt->eflags);
-
- return EMULATE_DONE;
+ if (ret != X86EMUL_CONTINUE) {
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ } else {
+ ctxt->eip = ctxt->_eip;
+ kvm_rip_write(vcpu, ctxt->eip);
+ kvm_set_rflags(vcpu, ctxt->eflags);
+ }
}
EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
{
- int r = EMULATE_DONE;
-
++vcpu->stat.insn_emulation_fail;
trace_kvm_emulate_insn_failed(vcpu);
- if (emulation_type & EMULTYPE_NO_UD_ON_FAIL)
- return EMULATE_FAIL;
+ if (emulation_type & EMULTYPE_VMWARE_GP) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
- if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
+ if (emulation_type & EMULTYPE_SKIP) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
- r = EMULATE_USER_EXIT;
+ return 0;
}
kvm_queue_exception(vcpu, UD_VECTOR);
- return r;
+ if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
+ }
+
+ return 1;
}
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
bool write_fault_to_shadow_pgtable,
int emulation_type)
{
- gpa_t gpa = cr2;
+ gpa_t gpa = cr2_or_gpa;
kvm_pfn_t pfn;
if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
@@ -6231,7 +6505,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
* Write permission should be allowed since only
* write access need to be emulated.
*/
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
/*
* If the mapping is invalid in guest, let cpu retry
@@ -6288,10 +6562,10 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
}
static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
- unsigned long cr2, int emulation_type)
+ gpa_t cr2_or_gpa, int emulation_type)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
+ unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
last_retry_eip = vcpu->arch.last_retry_eip;
last_retry_addr = vcpu->arch.last_retry_addr;
@@ -6320,14 +6594,14 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
if (x86_page_table_writing_insn(ctxt))
return false;
- if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
+ if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
return false;
vcpu->arch.last_retry_eip = ctxt->eip;
- vcpu->arch.last_retry_addr = cr2;
+ vcpu->arch.last_retry_addr = cr2_or_gpa;
if (!vcpu->arch.mmu->direct_map)
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
@@ -6365,7 +6639,7 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
return dr6;
}
-static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
+static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
@@ -6374,18 +6648,20 @@ static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
- *r = EMULATE_USER_EXIT;
- } else {
- kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
+ return 0;
}
+ kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
+ return 1;
}
int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
- int r = EMULATE_DONE;
+ int r;
- kvm_x86_ops->skip_emulated_instruction(vcpu);
+ r = kvm_x86_ops->skip_emulated_instruction(vcpu);
+ if (unlikely(!r))
+ return 0;
/*
* rflags is the old, "raw" value of the flags. The new value has
@@ -6396,8 +6672,8 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
* that sets the TF flag".
*/
if (unlikely(rflags & X86_EFLAGS_TF))
- kvm_vcpu_do_singlestep(vcpu, &r);
- return r == EMULATE_DONE;
+ r = kvm_vcpu_do_singlestep(vcpu);
+ return r;
}
EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
@@ -6416,7 +6692,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
kvm_run->debug.arch.pc = eip;
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
- *r = EMULATE_USER_EXIT;
+ *r = 0;
return true;
}
}
@@ -6432,7 +6708,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
vcpu->arch.dr6 &= ~DR_TRAP_BITS;
vcpu->arch.dr6 |= dr6 | DR6_RTM;
kvm_queue_exception(vcpu, DB_VECTOR);
- *r = EMULATE_DONE;
+ *r = 1;
return true;
}
}
@@ -6471,11 +6747,8 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
return false;
}
-int x86_emulate_instruction(struct kvm_vcpu *vcpu,
- unsigned long cr2,
- int emulation_type,
- void *insn,
- int insn_len)
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ int emulation_type, void *insn, int insn_len)
{
int r;
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
@@ -6516,32 +6789,49 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
trace_kvm_emulate_insn_start(vcpu);
++vcpu->stat.insn_emulation;
if (r != EMULATION_OK) {
- if (emulation_type & EMULTYPE_TRAP_UD)
- return EMULATE_FAIL;
- if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
- emulation_type))
- return EMULATE_DONE;
- if (ctxt->have_exception && inject_emulated_exception(vcpu))
- return EMULATE_DONE;
- if (emulation_type & EMULTYPE_SKIP)
- return EMULATE_FAIL;
+ if ((emulation_type & EMULTYPE_TRAP_UD) ||
+ (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ if (reexecute_instruction(vcpu, cr2_or_gpa,
+ write_fault_to_spt,
+ emulation_type))
+ return 1;
+ if (ctxt->have_exception) {
+ /*
+ * #UD should result in just EMULATION_FAILED, and trap-like
+ * exception should not be encountered during decode.
+ */
+ WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
+ exception_type(ctxt->exception.vector) == EXCPT_TRAP);
+ inject_emulated_exception(vcpu);
+ return 1;
+ }
return handle_emulation_failure(vcpu, emulation_type);
}
}
- if ((emulation_type & EMULTYPE_VMWARE) &&
- !is_vmware_backdoor_opcode(ctxt))
- return EMULATE_FAIL;
+ if ((emulation_type & EMULTYPE_VMWARE_GP) &&
+ !is_vmware_backdoor_opcode(ctxt)) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
+ /*
+ * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks
+ * for kvm_skip_emulated_instruction(). The caller is responsible for
+ * updating interruptibility state and injecting single-step #DBs.
+ */
if (emulation_type & EMULTYPE_SKIP) {
kvm_rip_write(vcpu, ctxt->_eip);
if (ctxt->eflags & X86_EFLAGS_RF)
kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
- return EMULATE_DONE;
+ return 1;
}
- if (retry_instruction(ctxt, cr2, emulation_type))
- return EMULATE_DONE;
+ if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
+ return 1;
/* this is needed for vmware backdoor interface to work since it
changes registers values during IO operation */
@@ -6552,23 +6842,23 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
restart:
/* Save the faulting GPA (cr2) in the address field */
- ctxt->exception.address = cr2;
+ ctxt->exception.address = cr2_or_gpa;
r = x86_emulate_insn(ctxt);
if (r == EMULATION_INTERCEPTED)
- return EMULATE_DONE;
+ return 1;
if (r == EMULATION_FAILED) {
- if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
+ if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
emulation_type))
- return EMULATE_DONE;
+ return 1;
return handle_emulation_failure(vcpu, emulation_type);
}
if (ctxt->have_exception) {
- r = EMULATE_DONE;
+ r = 1;
if (inject_emulated_exception(vcpu))
return r;
} else if (vcpu->arch.pio.count) {
@@ -6579,27 +6869,30 @@ restart:
writeback = false;
vcpu->arch.complete_userspace_io = complete_emulated_pio;
}
- r = EMULATE_USER_EXIT;
+ r = 0;
} else if (vcpu->mmio_needed) {
+ ++vcpu->stat.mmio_exits;
+
if (!vcpu->mmio_is_write)
writeback = false;
- r = EMULATE_USER_EXIT;
+ r = 0;
vcpu->arch.complete_userspace_io = complete_emulated_mmio;
} else if (r == EMULATION_RESTART)
goto restart;
else
- r = EMULATE_DONE;
+ r = 1;
if (writeback) {
unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
- kvm_rip_write(vcpu, ctxt->eip);
- if (r == EMULATE_DONE && ctxt->tf)
- kvm_vcpu_do_singlestep(vcpu, &r);
if (!ctxt->have_exception ||
- exception_type(ctxt->exception.vector) == EXCPT_TRAP)
+ exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
+ kvm_rip_write(vcpu, ctxt->eip);
+ if (r && ctxt->tf)
+ r = kvm_vcpu_do_singlestep(vcpu);
__kvm_set_rflags(vcpu, ctxt->eflags);
+ }
/*
* For STI, interrupts are shadowed; so KVM_REQ_EVENT will
@@ -7170,8 +7463,8 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
{
struct kvm_lapic_irq lapic_irq;
- lapic_irq.shorthand = 0;
- lapic_irq.dest_mode = 0;
+ lapic_irq.shorthand = APIC_DEST_NOSHORT;
+ lapic_irq.dest_mode = APIC_DEST_PHYSICAL;
lapic_irq.level = 0;
lapic_irq.dest_id = apicid;
lapic_irq.msi_redir_hint = false;
@@ -7180,18 +7473,22 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
}
-void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
+bool kvm_apicv_activated(struct kvm *kvm)
{
- if (!lapic_in_kernel(vcpu)) {
- WARN_ON_ONCE(vcpu->arch.apicv_active);
- return;
- }
- if (!vcpu->arch.apicv_active)
- return;
+ return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
+}
+EXPORT_SYMBOL_GPL(kvm_apicv_activated);
- vcpu->arch.apicv_active = false;
- kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
+void kvm_apicv_init(struct kvm *kvm, bool enable)
+{
+ if (enable)
+ clear_bit(APICV_INHIBIT_REASON_DISABLE,
+ &kvm->arch.apicv_inhibit_reasons);
+ else
+ set_bit(APICV_INHIBIT_REASON_DISABLE,
+ &kvm->arch.apicv_inhibit_reasons);
}
+EXPORT_SYMBOL_GPL(kvm_apicv_init);
static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
{
@@ -7702,11 +7999,65 @@ static void process_smi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
+void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
+ unsigned long *vcpu_bitmap)
+{
+ cpumask_var_t cpus;
+
+ zalloc_cpumask_var(&cpus, GFP_ATOMIC);
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
+ vcpu_bitmap, cpus);
+
+ free_cpumask_var(cpus);
+}
+
void kvm_make_scan_ioapic_request(struct kvm *kvm)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
}
+void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
+ kvm_apic_update_apicv(vcpu);
+ kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
+
+/*
+ * NOTE: Do not hold any lock prior to calling this.
+ *
+ * In particular, kvm_request_apicv_update() expects kvm->srcu not to be
+ * locked, because it calls __x86_set_memory_region() which does
+ * synchronize_srcu(&kvm->srcu).
+ */
+void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
+{
+ if (!kvm_x86_ops->check_apicv_inhibit_reasons ||
+ !kvm_x86_ops->check_apicv_inhibit_reasons(bit))
+ return;
+
+ if (activate) {
+ if (!test_and_clear_bit(bit, &kvm->arch.apicv_inhibit_reasons) ||
+ !kvm_apicv_activated(kvm))
+ return;
+ } else {
+ if (test_and_set_bit(bit, &kvm->arch.apicv_inhibit_reasons) ||
+ kvm_apicv_activated(kvm))
+ return;
+ }
+
+ trace_kvm_apicv_update_request(activate, bit);
+ if (kvm_x86_ops->pre_update_apicv_exec_ctrl)
+ kvm_x86_ops->pre_update_apicv_exec_ctrl(kvm, activate);
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
+}
+EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
+
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
if (!kvm_apic_present(vcpu))
@@ -7779,7 +8130,6 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
*/
put_page(page);
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
{
@@ -7798,12 +8148,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
bool req_int_win =
dm_request_for_irq_injection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
+ enum exit_fastpath_completion exit_fastpath = EXIT_FASTPATH_NONE;
bool req_immediate_exit = false;
if (kvm_request_pending(vcpu)) {
- if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
- kvm_x86_ops->get_vmcs12_pages(vcpu);
+ if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
+ if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) {
+ r = 0;
+ goto out;
+ }
+ }
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
kvm_mmu_unload(vcpu);
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
@@ -7893,6 +8248,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
*/
if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
kvm_hv_process_stimers(vcpu);
+ if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
+ kvm_vcpu_update_apicv(vcpu);
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -7995,8 +8352,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
trace_kvm_entry(vcpu->vcpu_id);
guest_enter_irqoff();
- /* The preempt notifier should have taken care of the FPU already. */
- WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD));
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
@@ -8040,7 +8398,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
- kvm_x86_ops->handle_exit_irqoff(vcpu);
+ kvm_x86_ops->handle_exit_irqoff(vcpu, &exit_fastpath);
/*
* Consume any pending interrupts, including the possible source of
@@ -8084,7 +8442,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_lapic_sync_from_vapic(vcpu);
vcpu->arch.gpa_available = false;
- r = kvm_x86_ops->handle_exit(vcpu);
+ r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath);
return r;
cancel_injection:
@@ -8191,12 +8549,11 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
{
int r;
+
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- if (r != EMULATE_DONE)
- return 0;
- return 1;
+ return r;
}
static int complete_emulated_pio(struct kvm_vcpu *vcpu)
@@ -8269,12 +8626,26 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
return 0;
}
+static void kvm_save_current_fpu(struct fpu *fpu)
+{
+ /*
+ * If the target FPU state is not resident in the CPU registers, just
+ * memcpy() from current, else save CPU state directly to the target.
+ */
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ memcpy(&fpu->state, &current->thread.fpu.state,
+ fpu_kernel_xstate_size);
+ else
+ copy_fpregs_to_fpstate(fpu);
+}
+
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
fpregs_lock();
- copy_fpregs_to_fpstate(vcpu->arch.user_fpu);
+ kvm_save_current_fpu(vcpu->arch.user_fpu);
+
/* PKRU is separately restored in kvm_x86_ops->run. */
__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
~XFEATURE_MASK_PKRU);
@@ -8290,7 +8661,8 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
fpregs_lock();
- copy_fpregs_to_fpstate(vcpu->arch.guest_fpu);
+ kvm_save_current_fpu(vcpu->arch.guest_fpu);
+
copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
fpregs_mark_activate();
@@ -8512,6 +8884,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
vcpu_load(vcpu);
+ if (kvm_mpx_supported())
+ kvm_load_guest_fpu(vcpu);
kvm_apic_accept_events(vcpu);
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
@@ -8520,6 +8894,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
else
mp_state->mp_state = vcpu->arch.mp_state;
+ if (kvm_mpx_supported())
+ kvm_put_guest_fpu(vcpu);
vcpu_put(vcpu);
return 0;
}
@@ -8535,8 +8911,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
goto out;
- /* INITs are latched while in SMM */
- if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
+ /*
+ * KVM_MP_STATE_INIT_RECEIVED means the processor is in
+ * INIT state; latched init should be reported using
+ * KVM_SET_VCPU_EVENTS, so reject it here.
+ */
+ if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
(mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
goto out;
@@ -8564,23 +8944,21 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
has_error_code, error_code);
-
- if (ret)
- return EMULATE_FAIL;
+ if (ret) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
+ }
kvm_rip_write(vcpu, ctxt->eip);
kvm_set_rflags(vcpu, ctxt->eflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- return EMULATE_DONE;
+ return 1;
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
- if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- (sregs->cr4 & X86_CR4_OSXSAVE))
- return -EINVAL;
-
if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
/*
* When EFER.LME and CR0.PG are set, the processor is in
@@ -8599,7 +8977,7 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
return -EINVAL;
}
- return 0;
+ return kvm_valid_cr4(vcpu, sregs->cr4);
}
static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
@@ -8629,7 +9007,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
vcpu->arch.cr2 = sregs->cr2;
mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
kvm_set_cr8(vcpu, sregs->cr8);
@@ -8877,33 +9255,91 @@ static void fx_init(struct kvm_vcpu *vcpu)
vcpu->arch.cr0 |= X86_CR0_ET;
}
-void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
- void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
-
- kvmclock_reset(vcpu);
+ if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+ pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
+ "guest TSC will not be reliable\n");
- kvm_x86_ops->vcpu_free(vcpu);
- free_cpumask_var(wbinvd_dirty_mask);
+ return 0;
}
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
- unsigned int id)
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu;
+ struct page *page;
+ int r;
- if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
- printk_once(KERN_WARNING
- "kvm: SMP vm created on host with unstable TSC; "
- "guest TSC will not be reliable\n");
+ vcpu->arch.emulate_ctxt.ops = &emulate_ops;
+ if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+ vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
- vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+ kvm_set_tsc_khz(vcpu, max_tsc_khz);
- return vcpu;
-}
+ r = kvm_mmu_create(vcpu);
+ if (r < 0)
+ return r;
+
+ if (irqchip_in_kernel(vcpu->kvm)) {
+ r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
+ if (r < 0)
+ goto fail_mmu_destroy;
+ if (kvm_apicv_activated(vcpu->kvm))
+ vcpu->arch.apicv_active = true;
+ } else
+ static_key_slow_inc(&kvm_no_apic_vcpu);
+
+ r = -ENOMEM;
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ goto fail_free_lapic;
+ vcpu->arch.pio_data = page_address(page);
+
+ vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+ GFP_KERNEL_ACCOUNT);
+ if (!vcpu->arch.mce_banks)
+ goto fail_free_pio_data;
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
+ GFP_KERNEL_ACCOUNT))
+ goto fail_free_mce_banks;
+
+ vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
+ GFP_KERNEL_ACCOUNT);
+ if (!vcpu->arch.user_fpu) {
+ pr_err("kvm: failed to allocate userspace's fpu\n");
+ goto free_wbinvd_dirty_mask;
+ }
+
+ vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
+ GFP_KERNEL_ACCOUNT);
+ if (!vcpu->arch.guest_fpu) {
+ pr_err("kvm: failed to allocate vcpu's fpu\n");
+ goto free_user_fpu;
+ }
+ fx_init(vcpu);
+
+ vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
+ kvm_async_pf_hash_reset(vcpu);
+ kvm_pmu_init(vcpu);
+
+ vcpu->arch.pending_external_vector = -1;
+ vcpu->arch.preempted_in_kernel = false;
+
+ kvm_hv_vcpu_init(vcpu);
+
+ r = kvm_x86_ops->vcpu_create(vcpu);
+ if (r)
+ goto free_guest_fpu;
-int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
-{
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_vcpu_mtrr_init(vcpu);
@@ -8912,6 +9348,22 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
kvm_init_mmu(vcpu, false);
vcpu_put(vcpu);
return 0;
+
+free_guest_fpu:
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
+free_user_fpu:
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+free_wbinvd_dirty_mask:
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+fail_free_mce_banks:
+ kfree(vcpu->arch.mce_banks);
+fail_free_pio_data:
+ free_page((unsigned long)vcpu->arch.pio_data);
+fail_free_lapic:
+ kvm_free_lapic(vcpu);
+fail_mmu_destroy:
+ kvm_mmu_destroy(vcpu);
+ return r;
}
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
@@ -8944,13 +9396,29 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- vcpu->arch.apf.msr_val = 0;
+ struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
+ int idx;
- vcpu_load(vcpu);
- kvm_mmu_unload(vcpu);
- vcpu_put(vcpu);
+ kvm_release_pfn(cache->pfn, cache->dirty, cache);
+
+ kvmclock_reset(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
+
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
+
+ kvm_hv_vcpu_uninit(vcpu);
+ kvm_pmu_destroy(vcpu);
+ kfree(vcpu->arch.mce_banks);
+ kvm_free_lapic(vcpu);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_mmu_destroy(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ free_page((unsigned long)vcpu->arch.pio_data);
+ if (!lapic_in_kernel(vcpu))
+ static_key_slow_dec(&kvm_no_apic_vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -8966,7 +9434,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.nmi_injected = false;
kvm_clear_interrupt_queue(vcpu);
kvm_clear_exception_queue(vcpu);
- vcpu->arch.exception.pending = false;
memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
kvm_update_dr0123(vcpu);
@@ -9142,6 +9609,8 @@ int kvm_arch_hardware_setup(void)
if (r != 0)
return r;
+ cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
+
if (kvm_has_tsc_control) {
/*
* Make sure the user can only configure tsc_khz values that
@@ -9156,6 +9625,9 @@ int kvm_arch_hardware_setup(void)
kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
}
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ rdmsrl(MSR_IA32_XSS, host_xss);
+
kvm_init_msr_list();
return 0;
}
@@ -9167,6 +9639,13 @@ void kvm_arch_hardware_unsetup(void)
int kvm_arch_check_processor_compat(void)
{
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+
+ WARN_ON(!irqs_disabled());
+
+ if (kvm_host_cr4_reserved_bits(c) != cr4_reserved_bits)
+ return -EIO;
+
return kvm_x86_ops->check_processor_compatibility();
}
@@ -9184,101 +9663,15 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
struct static_key kvm_no_apic_vcpu __read_mostly;
EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
-int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
-{
- struct page *page;
- int r;
-
- vcpu->arch.emulate_ctxt.ops = &emulate_ops;
- if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- else
- vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
-
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page) {
- r = -ENOMEM;
- goto fail;
- }
- vcpu->arch.pio_data = page_address(page);
-
- kvm_set_tsc_khz(vcpu, max_tsc_khz);
-
- r = kvm_mmu_create(vcpu);
- if (r < 0)
- goto fail_free_pio_data;
-
- if (irqchip_in_kernel(vcpu->kvm)) {
- vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
- r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
- if (r < 0)
- goto fail_mmu_destroy;
- } else
- static_key_slow_inc(&kvm_no_apic_vcpu);
-
- vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
- GFP_KERNEL_ACCOUNT);
- if (!vcpu->arch.mce_banks) {
- r = -ENOMEM;
- goto fail_free_lapic;
- }
- vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
-
- if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
- GFP_KERNEL_ACCOUNT)) {
- r = -ENOMEM;
- goto fail_free_mce_banks;
- }
-
- fx_init(vcpu);
-
- vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
-
- vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
-
- vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
-
- kvm_async_pf_hash_reset(vcpu);
- kvm_pmu_init(vcpu);
-
- vcpu->arch.pending_external_vector = -1;
- vcpu->arch.preempted_in_kernel = false;
-
- kvm_hv_vcpu_init(vcpu);
-
- return 0;
-
-fail_free_mce_banks:
- kfree(vcpu->arch.mce_banks);
-fail_free_lapic:
- kvm_free_lapic(vcpu);
-fail_mmu_destroy:
- kvm_mmu_destroy(vcpu);
-fail_free_pio_data:
- free_page((unsigned long)vcpu->arch.pio_data);
-fail:
- return r;
-}
-
-void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
-{
- int idx;
-
- kvm_hv_vcpu_uninit(vcpu);
- kvm_pmu_destroy(vcpu);
- kfree(vcpu->arch.mce_banks);
- kvm_free_lapic(vcpu);
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- kvm_mmu_destroy(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
- free_page((unsigned long)vcpu->arch.pio_data);
- if (!lapic_in_kernel(vcpu))
- static_key_slow_dec(&kvm_no_apic_vcpu);
-}
-
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
vcpu->arch.l1tf_flush_l1d = true;
+ if (pmu->version && unlikely(pmu->event_count)) {
+ pmu->need_cleanup = true;
+ kvm_make_request(KVM_REQ_PMU, vcpu);
+ }
kvm_x86_ops->sched_in(vcpu, cpu);
}
@@ -9289,6 +9682,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
+ INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
@@ -9302,7 +9697,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
mutex_init(&kvm->arch.apic_map_lock);
spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
- kvm->arch.kvmclock_offset = -ktime_get_boottime_ns();
+ kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
pvclock_update_vm_gtod_copy(kvm);
kvm->arch.guest_can_read_msr_platform_info = true;
@@ -9314,10 +9709,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_page_track_init(kvm);
kvm_mmu_init_vm(kvm);
- if (kvm_x86_ops->vm_init)
- return kvm_x86_ops->vm_init(kvm);
+ return kvm_x86_ops->vm_init(kvm);
+}
- return 0;
+int kvm_arch_post_init_vm(struct kvm *kvm)
+{
+ return kvm_mmu_post_init_vm(kvm);
}
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -9340,7 +9737,7 @@ static void kvm_free_vcpus(struct kvm *kvm)
kvm_unload_vcpu_mmu(vcpu);
}
kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_arch_vcpu_free(vcpu);
+ kvm_vcpu_destroy(vcpu);
mutex_lock(&kvm->lock);
for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
@@ -9409,17 +9806,10 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
}
EXPORT_SYMBOL_GPL(__x86_set_memory_region);
-int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
+void kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
- int r;
-
- mutex_lock(&kvm->slots_lock);
- r = __x86_set_memory_region(kvm, id, gpa, size);
- mutex_unlock(&kvm->slots_lock);
-
- return r;
+ kvm_mmu_pre_destroy_vm(kvm);
}
-EXPORT_SYMBOL_GPL(x86_set_memory_region);
void kvm_arch_destroy_vm(struct kvm *kvm)
{
@@ -9429,9 +9819,13 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
* unless the the memory map has changed due to process exit
* or fd copying.
*/
- x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
- x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0);
- x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
+ mutex_lock(&kvm->slots_lock);
+ __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ 0, 0);
+ __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ 0, 0);
+ __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
+ mutex_unlock(&kvm->slots_lock);
}
if (kvm_x86_ops->vm_destroy)
kvm_x86_ops->vm_destroy(kvm);
@@ -9535,11 +9929,18 @@ out_free:
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
+ struct kvm_vcpu *vcpu;
+ int i;
+
/*
* memslots->generation has been incremented.
* mmio generation may have reached its maximum value.
*/
kvm_mmu_invalidate_mmio_sptes(kvm, gen);
+
+ /* Force re-initialization of steal_time cache */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_vcpu_kick(vcpu);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -9569,7 +9970,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
*
* The reason is, in case of PML, we need to set D-bit for any slots
* with dirty logging disabled in order to eliminate unnecessary GPA
- * logging in PML buffer (and potential PML buffer full VMEXT). This
+ * logging in PML buffer (and potential PML buffer full VMEXIT). This
* guarantees leaving PML enabled during guest's lifetime won't have
* any additional overhead from PML when guest is running with dirty
* logging disabled for memory slots.
@@ -9621,8 +10022,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
* Scan sptes if dirty logging has been stopped, dropping those
* which can be collapsed into a single large-page spte. Later
* page faults will create the large-page sptes.
+ *
+ * There is no need to do this in any of the following cases:
+ * CREATE: No dirty mappings will already exist.
+ * MOVE/DELETE: The old mappings will already have been cleaned up by
+ * kvm_arch_flush_shadow_memslot()
*/
- if ((change != KVM_MR_DELETE) &&
+ if (change == KVM_MR_FLAGS_ONLY &&
(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
kvm_mmu_zap_collapsible_sptes(kvm, new);
@@ -9786,7 +10192,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
return;
- vcpu->arch.mmu->page_fault(vcpu, work->gva, 0, true);
+ kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
}
static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
@@ -9899,7 +10305,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
{
struct x86_exception fault;
- trace_kvm_async_pf_not_present(work->arch.token, work->gva);
+ trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
if (kvm_can_deliver_async_pf(vcpu) &&
@@ -9934,7 +10340,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
work->arch.token = ~0; /* broadcast wakeup */
else
kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
- trace_kvm_async_pf_ready(work->arch.token, work->gva);
+ trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
!apf_get_user(vcpu, &val)) {
@@ -10009,7 +10415,7 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
bool kvm_arch_has_irq_bypass(void)
{
- return kvm_x86_ops->update_pi_irte != NULL;
+ return true;
}
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
@@ -10049,9 +10455,6 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
- if (!kvm_x86_ops->update_pi_irte)
- return -EINVAL;
-
return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
}
@@ -10059,7 +10462,6 @@ bool kvm_vector_hashing_enabled(void)
{
return vector_hashing;
}
-EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
{
@@ -10067,6 +10469,28 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
+u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu)
+{
+ uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD;
+
+ /* The STIBP bit doesn't fault even if it's not advertised */
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
+ bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
+ if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
+ !boot_cpu_has(X86_FEATURE_AMD_IBRS))
+ bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
+ bits &= ~SPEC_CTRL_SSBD;
+ if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ bits &= ~SPEC_CTRL_SSBD;
+
+ return bits;
+}
+EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
@@ -10078,12 +10502,14 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6594020c0691..3624665acee4 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -144,11 +144,6 @@ static inline bool is_pae_paging(struct kvm_vcpu *vcpu)
return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu);
}
-static inline u32 bit(int bitno)
-{
- return 1 << (bitno & 31);
-}
-
static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
@@ -166,21 +161,13 @@ static inline u64 get_canonical(u64 la, u8 vaddr_bits)
static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
{
-#ifdef CONFIG_X86_64
return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la;
-#else
- return false;
-#endif
}
static inline bool emul_is_noncanonical_address(u64 la,
struct x86_emulate_ctxt *ctxt)
{
-#ifdef CONFIG_X86_64
return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la;
-#else
- return false;
-#endif
}
static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
@@ -196,7 +183,7 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
* actually a nGPA.
*/
vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK;
- vcpu->arch.access = access;
+ vcpu->arch.mmio_access = access;
vcpu->arch.mmio_gfn = gfn;
vcpu->arch.mmio_gen = gen;
}
@@ -238,8 +225,7 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
return false;
}
-static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu,
- enum kvm_reg reg)
+static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, int reg)
{
unsigned long val = kvm_register_read(vcpu, reg);
@@ -247,8 +233,7 @@ static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu,
}
static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
- enum kvm_reg reg,
- unsigned long val)
+ int reg, unsigned long val)
{
if (!is_64_bit_mode(vcpu))
val = (u32)val;
@@ -260,8 +245,13 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
return !(kvm->arch.disabled_quirks & quirk);
}
+static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu)
+{
+ return is_smm(vcpu) || kvm_x86_ops->apic_init_signal_blocked(vcpu);
+}
+
void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
+void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
u64 get_kvmclock_ns(struct kvm *kvm);
@@ -286,8 +276,9 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
int page_num);
bool kvm_vector_hashing_enabled(void);
-int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len);
+enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
@@ -366,7 +357,14 @@ static inline bool kvm_pat_valid(u64 data)
return (data | ((data & 0x0202020202020202ull) << 1)) == data;
}
-void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu);
-void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu);
+static inline bool kvm_dr7_valid(u64 data)
+{
+ /* Bits [63:32] are reserved */
+ return !(data >> 32);
+}
+
+void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
+void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
+u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu);
#endif
OpenPOWER on IntegriCloud