summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/cpuid.c104
-rw-r--r--arch/x86/kvm/lapic.c18
-rw-r--r--arch/x86/kvm/lapic.h5
-rw-r--r--arch/x86/kvm/mmu.c347
-rw-r--r--arch/x86/kvm/mmu.h4
-rw-r--r--arch/x86/kvm/paging_tmpl.h29
-rw-r--r--arch/x86/kvm/svm.c16
-rw-r--r--arch/x86/kvm/vmx/nested.c66
-rw-r--r--arch/x86/kvm/vmx/nested.h13
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c7
-rw-r--r--arch/x86/kvm/vmx/vmx.c64
-rw-r--r--arch/x86/kvm/vmx/vmx.h11
-rw-r--r--arch/x86/kvm/x86.c186
13 files changed, 642 insertions, 228 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 63316036f85a..f68c0c753c38 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -363,7 +363,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
/* cpuid 7.0.ecx*/
const u32 kvm_cpuid_7_0_ecx_x86_features =
- F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
+ F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/;
@@ -485,6 +485,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 0x80000008.ebx */
const u32 kvm_cpuid_8000_0008_ebx_x86_features =
+ F(CLZERO) | F(XSAVEERPTR) |
F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON);
@@ -618,16 +619,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
*/
case 0x1f:
case 0xb: {
- int i, level_type;
+ int i;
- /* read more entries until level_type is zero */
- for (i = 1; ; ++i) {
+ /*
+ * We filled in entry[0] for CPUID(EAX=<function>,
+ * ECX=00H) above. If its level type (ECX[15:8]) is
+ * zero, then the leaf is unimplemented, and we're
+ * done. Otherwise, continue to populate entries
+ * until the level type (ECX[15:8]) of the previously
+ * added entry is zero.
+ */
+ for (i = 1; entry[i - 1].ecx & 0xff00; ++i) {
if (*nent >= maxnent)
goto out;
- level_type = entry[i - 1].ecx & 0xff00;
- if (!level_type)
- break;
do_host_cpuid(&entry[i], function, i);
++*nent;
}
@@ -969,53 +974,66 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
/*
- * If no match is found, check whether we exceed the vCPU's limit
- * and return the content of the highest valid _standard_ leaf instead.
- * This is to satisfy the CPUID specification.
+ * If the basic or extended CPUID leaf requested is higher than the
+ * maximum supported basic or extended leaf, respectively, then it is
+ * out of range.
*/
-static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
- u32 function, u32 index)
+static bool cpuid_function_in_range(struct kvm_vcpu *vcpu, u32 function)
{
- struct kvm_cpuid_entry2 *maxlevel;
-
- maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
- if (!maxlevel || maxlevel->eax >= function)
- return NULL;
- if (function & 0x80000000) {
- maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
- if (!maxlevel)
- return NULL;
- }
- return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
+ struct kvm_cpuid_entry2 *max;
+
+ max = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
+ return max && function <= max->eax;
}
bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
u32 *ecx, u32 *edx, bool check_limit)
{
u32 function = *eax, index = *ecx;
- struct kvm_cpuid_entry2 *best;
- bool entry_found = true;
-
- best = kvm_find_cpuid_entry(vcpu, function, index);
-
- if (!best) {
- entry_found = false;
- if (!check_limit)
- goto out;
+ struct kvm_cpuid_entry2 *entry;
+ struct kvm_cpuid_entry2 *max;
+ bool found;
- best = check_cpuid_limit(vcpu, function, index);
+ entry = kvm_find_cpuid_entry(vcpu, function, index);
+ found = entry;
+ /*
+ * Intel CPUID semantics treats any query for an out-of-range
+ * leaf as if the highest basic leaf (i.e. CPUID.0H:EAX) were
+ * requested. AMD CPUID semantics returns all zeroes for any
+ * undefined leaf, whether or not the leaf is in range.
+ */
+ if (!entry && check_limit && !guest_cpuid_is_amd(vcpu) &&
+ !cpuid_function_in_range(vcpu, function)) {
+ max = kvm_find_cpuid_entry(vcpu, 0, 0);
+ if (max) {
+ function = max->eax;
+ entry = kvm_find_cpuid_entry(vcpu, function, index);
+ }
}
-
-out:
- if (best) {
- *eax = best->eax;
- *ebx = best->ebx;
- *ecx = best->ecx;
- *edx = best->edx;
- } else
+ if (entry) {
+ *eax = entry->eax;
+ *ebx = entry->ebx;
+ *ecx = entry->ecx;
+ *edx = entry->edx;
+ } else {
*eax = *ebx = *ecx = *edx = 0;
- trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found);
- return entry_found;
+ /*
+ * When leaf 0BH or 1FH is defined, CL is pass-through
+ * and EDX is always the x2APIC ID, even for undefined
+ * subleaves. Index 1 will exist iff the leaf is
+ * implemented, so we pass through CL iff leaf 1
+ * exists. EDX can be copied from any existing index.
+ */
+ if (function == 0xb || function == 0x1f) {
+ entry = kvm_find_cpuid_entry(vcpu, function, 1);
+ if (entry) {
+ *ecx = index & 0xff;
+ *edx = entry->edx;
+ }
+ }
+ }
+ trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, found);
+ return found;
}
EXPORT_SYMBOL_GPL(kvm_cpuid);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3a3a6854dcca..b29d00b661ff 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -66,9 +66,10 @@
#define X2APIC_BROADCAST 0xFFFFFFFFul
static bool lapic_timer_advance_dynamic __read_mostly;
-#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100
-#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 5000
-#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000
+#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */
+#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */
+#define LAPIC_TIMER_ADVANCE_NS_INIT 1000
+#define LAPIC_TIMER_ADVANCE_NS_MAX 5000
/* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
@@ -110,11 +111,6 @@ static inline int apic_enabled(struct kvm_lapic *apic)
(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
-static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
-{
- return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
-}
-
static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
{
return apic->vcpu->vcpu_id;
@@ -1504,8 +1500,8 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
}
- if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_ADJUST_MAX))
- timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
+ if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX))
+ timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
}
@@ -2302,7 +2298,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
HRTIMER_MODE_ABS_HARD);
apic->lapic_timer.timer.function = apic_timer_fn;
if (timer_advance_ns == -1) {
- apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
+ apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
lapic_timer_advance_dynamic = true;
} else {
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 2aad7e226fc0..1f5014852e20 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -242,4 +242,9 @@ static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
}
+static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
+{
+ return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
+}
+
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5269aa057dfa..2ce9da58611e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -37,6 +37,7 @@
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>
+#include <linux/kthread.h>
#include <asm/page.h>
#include <asm/pat.h>
@@ -47,6 +48,35 @@
#include <asm/kvm_page_track.h>
#include "trace.h"
+extern bool itlb_multihit_kvm_mitigation;
+
+static int __read_mostly nx_huge_pages = -1;
+#ifdef CONFIG_PREEMPT_RT
+/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
+static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
+#else
+static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
+#endif
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
+
+static struct kernel_param_ops nx_huge_pages_ops = {
+ .set = set_nx_huge_pages,
+ .get = param_get_bool,
+};
+
+static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
+ .set = set_nx_huge_pages_recovery_ratio,
+ .get = param_get_uint,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
+ &nx_huge_pages_recovery_ratio, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
+
/*
* When setting this variable to true it enables Two-Dimensional-Paging
* where the hardware walks 2 page tables:
@@ -83,7 +113,17 @@ module_param(dbg, bool, 0644);
#define PTE_PREFETCH_NUM 8
#define PT_FIRST_AVAIL_BITS_SHIFT 10
-#define PT64_SECOND_AVAIL_BITS_SHIFT 52
+#define PT64_SECOND_AVAIL_BITS_SHIFT 54
+
+/*
+ * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
+ * Access Tracking SPTEs.
+ */
+#define SPTE_SPECIAL_MASK (3ULL << 52)
+#define SPTE_AD_ENABLED_MASK (0ULL << 52)
+#define SPTE_AD_DISABLED_MASK (1ULL << 52)
+#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
+#define SPTE_MMIO_MASK (3ULL << 52)
#define PT64_LEVEL_BITS 9
@@ -219,12 +259,11 @@ static u64 __read_mostly shadow_present_mask;
static u64 __read_mostly shadow_me_mask;
/*
- * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
- * Non-present SPTEs with shadow_acc_track_value set are in place for access
- * tracking.
+ * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
+ * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
+ * pages.
*/
static u64 __read_mostly shadow_acc_track_mask;
-static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
/*
* The mask/shift to use for saving the original R/X bits when marking the PTE
@@ -304,7 +343,7 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask)
{
BUG_ON((u64)(unsigned)access_mask != access_mask);
BUG_ON((mmio_mask & mmio_value) != mmio_value);
- shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
+ shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
shadow_mmio_access_mask = access_mask;
}
@@ -320,10 +359,32 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
return sp->role.ad_disabled;
}
+static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
+{
+ /*
+ * When using the EPT page-modification log, the GPAs in the log
+ * would come from L2 rather than L1. Therefore, we need to rely
+ * on write protection to record dirty pages. This also bypasses
+ * PML, since writes now result in a vmexit.
+ */
+ return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
+}
+
static inline bool spte_ad_enabled(u64 spte)
{
MMU_WARN_ON(is_mmio_spte(spte));
- return !(spte & shadow_acc_track_value);
+ return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
+}
+
+static inline bool spte_ad_need_write_protect(u64 spte)
+{
+ MMU_WARN_ON(is_mmio_spte(spte));
+ return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
+}
+
+static bool is_nx_huge_page_enabled(void)
+{
+ return READ_ONCE(nx_huge_pages);
}
static inline u64 spte_shadow_accessed_mask(u64 spte)
@@ -461,7 +522,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
{
BUG_ON(!dirty_mask != !accessed_mask);
BUG_ON(!accessed_mask && !acc_track_mask);
- BUG_ON(acc_track_mask & shadow_acc_track_value);
+ BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
shadow_user_mask = user_mask;
shadow_accessed_mask = accessed_mask;
@@ -1164,6 +1225,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_disallow_lpage(slot, gfn);
}
+static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ if (sp->lpage_disallowed)
+ return;
+
+ ++kvm->stat.nx_lpage_splits;
+ list_add_tail(&sp->lpage_disallowed_link,
+ &kvm->arch.lpage_disallowed_mmu_pages);
+ sp->lpage_disallowed = true;
+}
+
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct kvm_memslots *slots;
@@ -1181,6 +1253,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
+static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ --kvm->stat.nx_lpage_splits;
+ sp->lpage_disallowed = false;
+ list_del(&sp->lpage_disallowed_link);
+}
+
static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
{
@@ -1589,16 +1668,16 @@ static bool spte_clear_dirty(u64 *sptep)
rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
+ MMU_WARN_ON(!spte_ad_enabled(spte));
spte &= ~shadow_dirty_mask;
-
return mmu_spte_update(sptep, spte);
}
-static bool wrprot_ad_disabled_spte(u64 *sptep)
+static bool spte_wrprot_for_clear_dirty(u64 *sptep)
{
bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
(unsigned long *)sptep);
- if (was_writable)
+ if (was_writable && !spte_ad_enabled(*sptep))
kvm_set_pfn_dirty(spte_to_pfn(*sptep));
return was_writable;
@@ -1617,10 +1696,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
bool flush = false;
for_each_rmap_spte(rmap_head, &iter, sptep)
- if (spte_ad_enabled(*sptep))
- flush |= spte_clear_dirty(sptep);
+ if (spte_ad_need_write_protect(*sptep))
+ flush |= spte_wrprot_for_clear_dirty(sptep);
else
- flush |= wrprot_ad_disabled_spte(sptep);
+ flush |= spte_clear_dirty(sptep);
return flush;
}
@@ -1631,6 +1710,11 @@ static bool spte_set_dirty(u64 *sptep)
rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
+ /*
+ * Similar to the !kvm_x86_ops->slot_disable_log_dirty case,
+ * do not bother adding back write access to pages marked
+ * SPTE_AD_WRPROT_ONLY_MASK.
+ */
spte |= shadow_dirty_mask;
return mmu_spte_update(sptep, spte);
@@ -2622,7 +2706,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
shadow_user_mask | shadow_x_mask | shadow_me_mask;
if (sp_ad_disabled(sp))
- spte |= shadow_acc_track_value;
+ spte |= SPTE_AD_DISABLED_MASK;
else
spte |= shadow_accessed_mask;
@@ -2761,6 +2845,9 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
kvm_reload_remote_mmus(kvm);
}
+ if (sp->lpage_disallowed)
+ unaccount_huge_nx_page(kvm, sp);
+
sp->role.invalid = 1;
return list_unstable;
}
@@ -2968,7 +3055,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
sp = page_header(__pa(sptep));
if (sp_ad_disabled(sp))
- spte |= shadow_acc_track_value;
+ spte |= SPTE_AD_DISABLED_MASK;
+ else if (kvm_vcpu_ad_need_write_protect(vcpu))
+ spte |= SPTE_AD_WRPROT_ONLY_MASK;
/*
* For the EPT case, shadow_present_mask is 0 if hardware
@@ -2980,6 +3069,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (!speculative)
spte |= spte_shadow_accessed_mask(spte);
+ if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+ is_nx_huge_page_enabled()) {
+ pte_access &= ~ACC_EXEC_MASK;
+ }
+
if (pte_access & ACC_EXEC_MASK)
spte |= shadow_x_mask;
else
@@ -3200,9 +3294,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
__direct_pte_prefetch(vcpu, sp, sptep);
}
+static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
+ gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
+{
+ int level = *levelp;
+ u64 spte = *it.sptep;
+
+ if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+ is_nx_huge_page_enabled() &&
+ is_shadow_present_pte(spte) &&
+ !is_large_pte(spte)) {
+ /*
+ * A small SPTE exists for this pfn, but FNAME(fetch)
+ * and __direct_map would like to create a large PTE
+ * instead: just force them to go down another level,
+ * patching back for them into pfn the next 9 bits of
+ * the address.
+ */
+ u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
+ *pfnp |= gfn & page_mask;
+ (*levelp)--;
+ }
+}
+
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
int map_writable, int level, kvm_pfn_t pfn,
- bool prefault)
+ bool prefault, bool lpage_disallowed)
{
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
@@ -3215,6 +3332,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
trace_kvm_mmu_spte_requested(gpa, level, pfn);
for_each_shadow_entry(vcpu, gpa, it) {
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gfn, &pfn, &level);
+
base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
if (it.level == level)
break;
@@ -3225,6 +3348,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
it.level - 1, true, ACC_ALL);
link_shadow_page(vcpu, it.sptep, sp);
+ if (lpage_disallowed)
+ account_huge_nx_page(vcpu->kvm, sp);
}
}
@@ -3273,7 +3398,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
* here.
*/
if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
- level == PT_PAGE_TABLE_LEVEL &&
+ !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
PageTransCompoundMap(pfn_to_page(pfn)) &&
!mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
unsigned long mask;
@@ -3517,11 +3642,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
{
int r;
int level;
- bool force_pt_level = false;
+ bool force_pt_level;
kvm_pfn_t pfn;
unsigned long mmu_seq;
bool map_writable, write = error_code & PFERR_WRITE_MASK;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
+ force_pt_level = lpage_disallowed;
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
/*
@@ -3555,7 +3683,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
goto out_unlock;
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
- r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
+ r = __direct_map(vcpu, v, write, map_writable, level, pfn,
+ prefault, false);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
@@ -4141,6 +4270,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
unsigned long mmu_seq;
int write = error_code & PFERR_WRITE_MASK;
bool map_writable;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
@@ -4151,8 +4282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (r)
return r;
- force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
- PT_DIRECTORY_LEVEL);
+ force_pt_level =
+ lpage_disallowed ||
+ !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
if (level > PT_DIRECTORY_LEVEL &&
@@ -4181,7 +4313,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
goto out_unlock;
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
- r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
+ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+ prefault, lpage_disallowed);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
@@ -5881,9 +6014,9 @@ restart:
* the guest, and the guest page table is using 4K page size
* mapping if the indirect sp has level = 1.
*/
- if (sp->role.direct &&
- !kvm_is_reserved_pfn(pfn) &&
- PageTransCompoundMap(pfn_to_page(pfn))) {
+ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
+ !kvm_is_zone_device_pfn(pfn) &&
+ PageTransCompoundMap(pfn_to_page(pfn))) {
pte_list_remove(rmap_head, sptep);
if (kvm_available_flush_tlb_with_range())
@@ -6122,10 +6255,59 @@ static void kvm_set_mmio_spte_mask(void)
kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
}
+static bool get_nx_auto_mode(void)
+{
+ /* Return true when CPU has the bug, and mitigations are ON */
+ return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
+}
+
+static void __set_nx_huge_pages(bool val)
+{
+ nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+ bool old_val = nx_huge_pages;
+ bool new_val;
+
+ /* In "auto" mode deploy workaround only if CPU has the bug. */
+ if (sysfs_streq(val, "off"))
+ new_val = 0;
+ else if (sysfs_streq(val, "force"))
+ new_val = 1;
+ else if (sysfs_streq(val, "auto"))
+ new_val = get_nx_auto_mode();
+ else if (strtobool(val, &new_val) < 0)
+ return -EINVAL;
+
+ __set_nx_huge_pages(new_val);
+
+ if (new_val != old_val) {
+ struct kvm *kvm;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ mutex_lock(&kvm->slots_lock);
+ kvm_mmu_zap_all_fast(kvm);
+ mutex_unlock(&kvm->slots_lock);
+
+ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+ }
+ mutex_unlock(&kvm_lock);
+ }
+
+ return 0;
+}
+
int kvm_mmu_module_init(void)
{
int ret = -ENOMEM;
+ if (nx_huge_pages == -1)
+ __set_nx_huge_pages(get_nx_auto_mode());
+
/*
* MMU roles use union aliasing which is, generally speaking, an
* undefined behavior. However, we supposedly know how compilers behave
@@ -6205,3 +6387,116 @@ void kvm_mmu_module_exit(void)
unregister_shrinker(&mmu_shrinker);
mmu_audit_disable();
}
+
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
+{
+ unsigned int old_val;
+ int err;
+
+ old_val = nx_huge_pages_recovery_ratio;
+ err = param_set_uint(val, kp);
+ if (err)
+ return err;
+
+ if (READ_ONCE(nx_huge_pages) &&
+ !old_val && nx_huge_pages_recovery_ratio) {
+ struct kvm *kvm;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+
+ mutex_unlock(&kvm_lock);
+ }
+
+ return err;
+}
+
+static void kvm_recover_nx_lpages(struct kvm *kvm)
+{
+ int rcu_idx;
+ struct kvm_mmu_page *sp;
+ unsigned int ratio;
+ LIST_HEAD(invalid_list);
+ ulong to_zap;
+
+ rcu_idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+
+ ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+ to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
+ while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
+ /*
+ * We use a separate list instead of just using active_mmu_pages
+ * because the number of lpage_disallowed pages is expected to
+ * be relatively small compared to the total.
+ */
+ sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
+ struct kvm_mmu_page,
+ lpage_disallowed_link);
+ WARN_ON_ONCE(!sp->lpage_disallowed);
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ WARN_ON_ONCE(sp->lpage_disallowed);
+
+ if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ if (to_zap)
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, rcu_idx);
+}
+
+static long get_nx_lpage_recovery_timeout(u64 start_time)
+{
+ return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
+ ? start_time + 60 * HZ - get_jiffies_64()
+ : MAX_SCHEDULE_TIMEOUT;
+}
+
+static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
+{
+ u64 start_time;
+ long remaining_time;
+
+ while (true) {
+ start_time = get_jiffies_64();
+ remaining_time = get_nx_lpage_recovery_timeout(start_time);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop() && remaining_time > 0) {
+ schedule_timeout(remaining_time);
+ remaining_time = get_nx_lpage_recovery_timeout(start_time);
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+
+ set_current_state(TASK_RUNNING);
+
+ if (kthread_should_stop())
+ return 0;
+
+ kvm_recover_nx_lpages(kvm);
+ }
+}
+
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+ int err;
+
+ err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
+ "kvm-nx-lpage-recovery",
+ &kvm->arch.nx_lpage_recovery_thread);
+ if (!err)
+ kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
+
+ return err;
+}
+
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
+{
+ if (kvm->arch.nx_lpage_recovery_thread)
+ kthread_stop(kvm->arch.nx_lpage_recovery_thread);
+}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 11f8ec89433b..d55674f44a18 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -210,4 +210,8 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn);
int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_post_init_vm(struct kvm *kvm);
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+
#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7d5cdb3af594..97b21e7fd013 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -614,13 +614,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int write_fault, int hlevel,
- kvm_pfn_t pfn, bool map_writable, bool prefault)
+ kvm_pfn_t pfn, bool map_writable, bool prefault,
+ bool lpage_disallowed)
{
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
int top_level, ret;
- gfn_t base_gfn;
+ gfn_t gfn, base_gfn;
direct_access = gw->pte_access;
@@ -665,13 +666,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
link_shadow_page(vcpu, it.sptep, sp);
}
- base_gfn = gw->gfn;
+ /*
+ * FNAME(page_fault) might have clobbered the bottom bits of
+ * gw->gfn, restore them from the virtual address.
+ */
+ gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
+ base_gfn = gfn;
trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
clear_sp_write_flooding_count(it.sptep);
- base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
+
+ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
if (it.level == hlevel)
break;
@@ -683,6 +696,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
it.level - 1, true, direct_access);
link_shadow_page(vcpu, it.sptep, sp);
+ if (lpage_disallowed)
+ account_huge_nx_page(vcpu->kvm, sp);
}
}
@@ -759,9 +774,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int r;
kvm_pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
- bool force_pt_level = false;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
+ bool force_pt_level = lpage_disallowed;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -851,7 +868,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
- level, pfn, map_writable, prefault);
+ level, pfn, map_writable, prefault, lpage_disallowed);
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
out_unlock:
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f8ecb6df5106..c5673bda4b66 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -734,8 +734,14 @@ static int get_npt_level(struct kvm_vcpu *vcpu)
static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
vcpu->arch.efer = efer;
- if (!npt_enabled && !(efer & EFER_LMA))
- efer &= ~EFER_LME;
+
+ if (!npt_enabled) {
+ /* Shadow paging assumes NX to be available. */
+ efer |= EFER_NX;
+
+ if (!(efer & EFER_LMA))
+ efer &= ~EFER_LME;
+ }
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
@@ -4591,6 +4597,7 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
int ret = 0;
struct vcpu_svm *svm = to_svm(vcpu);
u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
+ u32 id = kvm_xapic_id(vcpu->arch.apic);
if (ldr == svm->ldr_reg)
return 0;
@@ -4598,7 +4605,7 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
avic_invalidate_logical_id_entry(vcpu);
if (ldr)
- ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr);
+ ret = avic_ldr_write(vcpu, id, ldr);
if (!ret)
svm->ldr_reg = ldr;
@@ -4610,8 +4617,7 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
{
u64 *old, *new;
struct vcpu_svm *svm = to_svm(vcpu);
- u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID);
- u32 id = (apic_id_reg >> 24) & 0xff;
+ u32 id = kvm_xapic_id(vcpu->arch.apic);
if (vcpu->vcpu_id == id)
return 0;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 41abc62c9a8a..0e7c9301fe86 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2610,7 +2610,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
/* VM-entry exception error code */
if (CC(has_error_code &&
- vmcs12->vm_entry_exception_error_code & GENMASK(31, 15)))
+ vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
return -EINVAL;
/* VM-entry interruption-info field: reserved bits */
@@ -2917,7 +2917,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12);
-static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
+static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2937,19 +2937,18 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
vmx->nested.apic_access_page = NULL;
}
page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
- /*
- * If translation failed, no matter: This feature asks
- * to exit when accessing the given address, and if it
- * can never be accessed, this feature won't do
- * anything anyway.
- */
if (!is_error_page(page)) {
vmx->nested.apic_access_page = page;
hpa = page_to_phys(vmx->nested.apic_access_page);
vmcs_write64(APIC_ACCESS_ADDR, hpa);
} else {
- secondary_exec_controls_clearbit(vmx,
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n",
+ __func__);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return false;
}
}
@@ -2994,6 +2993,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
else
exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
+ return true;
}
/*
@@ -3032,13 +3032,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
/*
* If from_vmentry is false, this is being called from state restore (either RSM
* or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
-+ *
-+ * Returns:
-+ * 0 - success, i.e. proceed with actual VMEnter
-+ * 1 - consistency check VMExit
-+ * -1 - consistency check VMFail
+ *
+ * Returns:
+ * NVMX_ENTRY_SUCCESS: Entered VMX non-root mode
+ * NVMX_ENTRY_VMFAIL: Consistency check VMFail
+ * NVMX_ENTRY_VMEXIT: Consistency check VMExit
+ * NVMX_ENTRY_KVM_INTERNAL_ERROR: KVM internal error
*/
-int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
+enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+ bool from_vmentry)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -3081,11 +3083,12 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
prepare_vmcs02_early(vmx, vmcs12);
if (from_vmentry) {
- nested_get_vmcs12_pages(vcpu);
+ if (unlikely(!nested_get_vmcs12_pages(vcpu)))
+ return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
if (nested_vmx_check_vmentry_hw(vcpu)) {
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- return -1;
+ return NVMX_VMENTRY_VMFAIL;
}
if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
@@ -3149,7 +3152,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
* returned as far as L1 is concerned. It will only return (and set
* the success flag) when L2 exits (see nested_vmx_vmexit()).
*/
- return 0;
+ return NVMX_VMENTRY_SUCCESS;
/*
* A failed consistency check that leads to a VMExit during L1's
@@ -3165,14 +3168,14 @@ vmentry_fail_vmexit:
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
if (!from_vmentry)
- return 1;
+ return NVMX_VMENTRY_VMEXIT;
load_vmcs12_host_state(vcpu, vmcs12);
vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
vmcs12->exit_qualification = exit_qual;
if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
vmx->nested.need_vmcs12_to_shadow_sync = true;
- return 1;
+ return NVMX_VMENTRY_VMEXIT;
}
/*
@@ -3182,9 +3185,9 @@ vmentry_fail_vmexit:
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
{
struct vmcs12 *vmcs12;
+ enum nvmx_vmentry_status status;
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
- int ret;
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -3244,13 +3247,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
* the nested entry.
*/
vmx->nested.nested_run_pending = 1;
- ret = nested_vmx_enter_non_root_mode(vcpu, true);
- vmx->nested.nested_run_pending = !ret;
- if (ret > 0)
- return 1;
- else if (ret)
- return nested_vmx_failValid(vcpu,
- VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ status = nested_vmx_enter_non_root_mode(vcpu, true);
+ if (unlikely(status != NVMX_VMENTRY_SUCCESS))
+ goto vmentry_failed;
/* Hide L1D cache contents from the nested guest. */
vmx->vcpu.arch.l1tf_flush_l1d = true;
@@ -3281,6 +3280,15 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
return kvm_vcpu_halt(vcpu);
}
return 1;
+
+vmentry_failed:
+ vmx->nested.nested_run_pending = 0;
+ if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
+ return 0;
+ if (status == NVMX_VMENTRY_VMEXIT)
+ return 1;
+ WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
+ return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
}
/*
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 187d39bf0bf1..6280f33e5fa6 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -6,6 +6,16 @@
#include "vmcs12.h"
#include "vmx.h"
+/*
+ * Status returned by nested_vmx_enter_non_root_mode():
+ */
+enum nvmx_vmentry_status {
+ NVMX_VMENTRY_SUCCESS, /* Entered VMX non-root mode */
+ NVMX_VMENTRY_VMFAIL, /* Consistency check VMFail */
+ NVMX_VMENTRY_VMEXIT, /* Consistency check VMExit */
+ NVMX_VMENTRY_KVM_INTERNAL_ERROR,/* KVM internal error */
+};
+
void vmx_leave_nested(struct kvm_vcpu *vcpu);
void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
bool apicv);
@@ -13,7 +23,8 @@ void nested_vmx_hardware_unsetup(void);
__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *));
void nested_vmx_vcpu_setup(void);
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
-int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry);
+enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+ bool from_vmentry);
bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason);
void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
u32 exit_intr_info, unsigned long exit_qualification);
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 4dea0e0e7e39..3e9c059099e9 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -262,6 +262,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct x86_pmu_capability x86_pmu;
struct kvm_cpuid_entry2 *entry;
union cpuid10_eax eax;
union cpuid10_edx edx;
@@ -283,8 +284,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
if (!pmu->version)
return;
+ perf_get_x86_pmu_capability(&x86_pmu);
+
pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
- INTEL_PMC_MAX_GENERIC);
+ x86_pmu.num_counters_gp);
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
pmu->available_event_types = ~entry->ebx &
((1ull << eax.split.mask_length) - 1);
@@ -294,7 +297,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
} else {
pmu->nr_arch_fixed_counters =
min_t(int, edx.split.num_counters_fixed,
- INTEL_PMC_MAX_FIXED);
+ x86_pmu.num_counters_fixed);
pmu->counter_bitmask[KVM_PMC_FIXED] =
((u64)1 << edx.split.bit_width_fixed) - 1;
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d4575ffb3cec..04a8212704c1 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -209,6 +209,11 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
struct page *page;
unsigned int i;
+ if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+ return 0;
+ }
+
if (!enable_ept) {
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
return 0;
@@ -964,17 +969,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
u64 guest_efer = vmx->vcpu.arch.efer;
u64 ignore_bits = 0;
- if (!enable_ept) {
- /*
- * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
- * host CPUID is more efficient than testing guest CPUID
- * or CR4. Host SMEP is anyway a requirement for guest SMEP.
- */
- if (boot_cpu_has(X86_FEATURE_SMEP))
- guest_efer |= EFER_NX;
- else if (!(guest_efer & EFER_NX))
- ignore_bits |= EFER_NX;
- }
+ /* Shadow paging assumes NX to be available. */
+ if (!enable_ept)
+ guest_efer |= EFER_NX;
/*
* LMA and LME handled by hardware; SCE meaningless outside long mode.
@@ -1271,6 +1268,18 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
return;
+ /*
+ * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+ * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
+ * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
+ * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
+ * correctly.
+ */
+ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
+ pi_clear_sn(pi_desc);
+ goto after_clear_sn;
+ }
+
/* The full case. */
do {
old.control = new.control = pi_desc->control;
@@ -1286,6 +1295,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
} while (cmpxchg64(&pi_desc->control, old.control,
new.control) != old.control);
+after_clear_sn:
+
/*
* Clear SN before reading the bitmap. The VT-d firmware
* writes the bitmap and reads SN atomically (5.2.3 in the
@@ -1294,7 +1305,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
*/
smp_mb__after_atomic();
- if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS))
+ if (!pi_is_pir_empty(pi_desc))
pi_set_on(pi_desc);
}
@@ -5538,14 +5549,6 @@ static int handle_encls(struct kvm_vcpu *vcpu)
return 1;
}
-static int handle_unexpected_vmexit(struct kvm_vcpu *vcpu)
-{
- kvm_skip_emulated_instruction(vcpu);
- WARN_ONCE(1, "Unexpected VM-Exit Reason = 0x%x",
- vmcs_read32(VM_EXIT_REASON));
- return 1;
-}
-
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -5597,15 +5600,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_INVVPID] = handle_vmx_instruction,
[EXIT_REASON_RDRAND] = handle_invalid_op,
[EXIT_REASON_RDSEED] = handle_invalid_op,
- [EXIT_REASON_XSAVES] = handle_unexpected_vmexit,
- [EXIT_REASON_XRSTORS] = handle_unexpected_vmexit,
[EXIT_REASON_PML_FULL] = handle_pml_full,
[EXIT_REASON_INVPCID] = handle_invpcid,
[EXIT_REASON_VMFUNC] = handle_vmx_instruction,
[EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
[EXIT_REASON_ENCLS] = handle_encls,
- [EXIT_REASON_UMWAIT] = handle_unexpected_vmexit,
- [EXIT_REASON_TPAUSE] = handle_unexpected_vmexit,
};
static const int kvm_vmx_max_exit_handlers =
@@ -6152,7 +6151,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
if (pi_test_on(&vmx->pi_desc)) {
pi_clear_on(&vmx->pi_desc);
/*
- * IOMMU can write to PIR.ON, so the barrier matters even on UP.
+ * IOMMU can write to PID.ON, so the barrier matters even on UP.
* But on x86 this is just a compiler barrier anyway.
*/
smp_mb__after_atomic();
@@ -6182,7 +6181,10 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
- return pi_test_on(vcpu_to_pi_desc(vcpu));
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ return pi_test_on(pi_desc) ||
+ (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
}
static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -7995,12 +7997,10 @@ static int __init vmx_init(void)
* contain 'auto' which will be turned into the default 'cond'
* mitigation mode.
*/
- if (boot_cpu_has(X86_BUG_L1TF)) {
- r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
- if (r) {
- vmx_exit();
- return r;
- }
+ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+ if (r) {
+ vmx_exit();
+ return r;
}
#ifdef CONFIG_KEXEC_CORE
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index bee16687dc0b..5a0f34b1e226 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -355,6 +355,11 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
+static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
+{
+ return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
+}
+
static inline void pi_set_sn(struct pi_desc *pi_desc)
{
set_bit(POSTED_INTR_SN,
@@ -373,6 +378,12 @@ static inline void pi_clear_on(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
static inline int pi_test_on(struct pi_desc *pi_desc)
{
return test_bit(POSTED_INTR_ON,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0ed07d8d2caa..5d530521f11d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -92,8 +92,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif
-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
+#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
@@ -212,7 +212,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
{ "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
- { "largepages", VM_STAT(lpages) },
+ { "largepages", VM_STAT(lpages, .mode = 0444) },
+ { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
{ "max_mmu_page_hash_collisions",
VM_STAT(max_mmu_page_hash_collisions) },
{ NULL }
@@ -360,8 +361,7 @@ EXPORT_SYMBOL_GPL(kvm_set_apic_base);
asmlinkage __visible void kvm_spurious_fault(void)
{
/* Fault while not rebooting. We want the trace. */
- if (!kvm_rebooting)
- BUG();
+ BUG_ON(!kvm_rebooting);
}
EXPORT_SYMBOL_GPL(kvm_spurious_fault);
@@ -885,34 +885,42 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);
-int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long old_cr4 = kvm_read_cr4(vcpu);
- unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
- X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
-
if (cr4 & CR4_RESERVED_BITS)
- return 1;
+ return -EINVAL;
if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
- return 1;
+ return -EINVAL;
if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
- return 1;
+ return -EINVAL;
if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
- return 1;
+ return -EINVAL;
if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
- return 1;
+ return -EINVAL;
if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
- return 1;
+ return -EINVAL;
if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
- return 1;
+ return -EINVAL;
if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
+ return -EINVAL;
+
+ return 0;
+}
+
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
+
+ if (kvm_valid_cr4(vcpu, cr4))
return 1;
if (is_long_mode(vcpu)) {
@@ -1125,13 +1133,15 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
* and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
*
- * This list is modified at module load time to reflect the
+ * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features)
+ * extract the supported MSRs from the related const lists.
+ * msrs_to_save is selected from the msrs_to_save_all to reflect the
* capabilities of the host cpu. This capabilities test skips MSRs that are
- * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
+ * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
* may depend on host virtualization features rather than host cpu features.
*/
-static u32 msrs_to_save[] = {
+static const u32 msrs_to_save_all[] = {
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
@@ -1161,13 +1171,6 @@ static u32 msrs_to_save[] = {
MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
- MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19,
- MSR_ARCH_PERFMON_PERFCTR0 + 20, MSR_ARCH_PERFMON_PERFCTR0 + 21,
- MSR_ARCH_PERFMON_PERFCTR0 + 22, MSR_ARCH_PERFMON_PERFCTR0 + 23,
- MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25,
- MSR_ARCH_PERFMON_PERFCTR0 + 26, MSR_ARCH_PERFMON_PERFCTR0 + 27,
- MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29,
- MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31,
MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
@@ -1177,18 +1180,12 @@ static u32 msrs_to_save[] = {
MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
- MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19,
- MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21,
- MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23,
- MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25,
- MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27,
- MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29,
- MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31,
};
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
static unsigned num_msrs_to_save;
-static u32 emulated_msrs[] = {
+static const u32 emulated_msrs_all[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
@@ -1227,7 +1224,7 @@ static u32 emulated_msrs[] = {
* by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
* We always support the "true" VMX control MSRs, even if the host
* processor does not, so I am putting these registers here rather
- * than in msrs_to_save.
+ * than in msrs_to_save_all.
*/
MSR_IA32_VMX_BASIC,
MSR_IA32_VMX_TRUE_PINBASED_CTLS,
@@ -1246,13 +1243,14 @@ static u32 emulated_msrs[] = {
MSR_KVM_POLL_CONTROL,
};
+static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
static unsigned num_emulated_msrs;
/*
* List of msr numbers which are used to expose MSR-based features that
* can be used by a hypervisor to validate requested CPU features.
*/
-static u32 msr_based_features[] = {
+static const u32 msr_based_features_all[] = {
MSR_IA32_VMX_BASIC,
MSR_IA32_VMX_TRUE_PINBASED_CTLS,
MSR_IA32_VMX_PINBASED_CTLS,
@@ -1277,6 +1275,7 @@ static u32 msr_based_features[] = {
MSR_IA32_ARCH_CAPABILITIES,
};
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
static unsigned int num_msr_based_features;
static u64 kvm_get_arch_capabilities(void)
@@ -1287,6 +1286,14 @@ static u64 kvm_get_arch_capabilities(void)
rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
/*
+ * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
+ * the nested hypervisor runs with NX huge pages. If it is not,
+ * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other
+ * L1 guests, so it need not worry about its own (L2) guests.
+ */
+ data |= ARCH_CAP_PSCHANGE_MC_NO;
+
+ /*
* If we're doing cache flushes (either "always" or "cond")
* we will do one whenever the guest does a vmlaunch/vmresume.
* If an outer hypervisor is doing the cache flush for us
@@ -1305,6 +1312,25 @@ static u64 kvm_get_arch_capabilities(void)
if (!boot_cpu_has_bug(X86_BUG_MDS))
data |= ARCH_CAP_MDS_NO;
+ /*
+ * On TAA affected systems, export MDS_NO=0 when:
+ * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
+ * - Updated microcode is present. This is detected by
+ * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
+ * that VERW clears CPU buffers.
+ *
+ * When MDS_NO=0 is exported, guests deploy clear CPU buffer
+ * mitigation and don't complain:
+ *
+ * "Vulnerable: Clear CPU buffers attempted, no microcode"
+ *
+ * If TSX is disabled on the system, guests are also mitigated against
+ * TAA and clear CPU buffer mitigation is not required for guests.
+ */
+ if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) &&
+ (data & ARCH_CAP_TSX_CTRL_MSR))
+ data &= ~ARCH_CAP_MDS_NO;
+
return data;
}
@@ -2543,6 +2569,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
vcpu->arch.pv_time_enabled = false;
+ vcpu->arch.time = 0;
}
static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
@@ -2708,8 +2735,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_KVM_SYSTEM_TIME: {
struct kvm_arch *ka = &vcpu->kvm->arch;
- kvmclock_reset(vcpu);
-
if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
@@ -2723,14 +2748,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
/* we verify if the enable bit is set... */
+ vcpu->arch.pv_time_enabled = false;
if (!(data & 1))
break;
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.pv_time, data & ~1ULL,
sizeof(struct pvclock_vcpu_time_info)))
- vcpu->arch.pv_time_enabled = false;
- else
vcpu->arch.pv_time_enabled = true;
break;
@@ -5097,23 +5121,28 @@ out:
static void kvm_init_msr_list(void)
{
+ struct x86_pmu_capability x86_pmu;
u32 dummy[2];
- unsigned i, j;
+ unsigned i;
BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
- "Please update the fixed PMCs in msrs_to_save[]");
- BUILD_BUG_ON_MSG(INTEL_PMC_MAX_GENERIC != 32,
- "Please update the generic perfctr/eventsel MSRs in msrs_to_save[]");
+ "Please update the fixed PMCs in msrs_to_saved_all[]");
+
+ perf_get_x86_pmu_capability(&x86_pmu);
+
+ num_msrs_to_save = 0;
+ num_emulated_msrs = 0;
+ num_msr_based_features = 0;
- for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
- if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
+ for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
+ if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
continue;
/*
* Even MSRs that are valid in the host may not be exposed
* to the guests in some cases.
*/
- switch (msrs_to_save[i]) {
+ switch (msrs_to_save_all[i]) {
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported())
continue;
@@ -5141,43 +5170,43 @@ static void kvm_init_msr_list(void)
break;
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
if (!kvm_x86_ops->pt_supported() ||
- msrs_to_save[i] - MSR_IA32_RTIT_ADDR0_A >=
+ msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
continue;
break;
+ case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
+ if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
+ min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
+ continue;
+ break;
+ case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
+ if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
+ min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
+ continue;
}
default:
break;
}
- if (j < i)
- msrs_to_save[j] = msrs_to_save[i];
- j++;
+ msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
}
- num_msrs_to_save = j;
- for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
- if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
+ for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
+ if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i]))
continue;
- if (j < i)
- emulated_msrs[j] = emulated_msrs[i];
- j++;
+ emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
}
- num_emulated_msrs = j;
- for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
struct kvm_msr_entry msr;
- msr.index = msr_based_features[i];
+ msr.index = msr_based_features_all[i];
if (kvm_get_msr_feature(&msr))
continue;
- if (j < i)
- msr_based_features[j] = msr_based_features[i];
- j++;
+ msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
}
- num_msr_based_features = j;
}
static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
@@ -7937,8 +7966,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
bool req_immediate_exit = false;
if (kvm_request_pending(vcpu)) {
- if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
- kvm_x86_ops->get_vmcs12_pages(vcpu);
+ if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
+ if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) {
+ r = 0;
+ goto out;
+ }
+ }
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
kvm_mmu_unload(vcpu);
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
@@ -8714,10 +8747,6 @@ EXPORT_SYMBOL_GPL(kvm_task_switch);
static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
- if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- (sregs->cr4 & X86_CR4_OSXSAVE))
- return -EINVAL;
-
if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
/*
* When EFER.LME and CR0.PG are set, the processor is in
@@ -8736,7 +8765,7 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
return -EINVAL;
}
- return 0;
+ return kvm_valid_cr4(vcpu, sregs->cr4);
}
static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
@@ -9427,6 +9456,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
+ INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
@@ -9455,6 +9485,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
return kvm_x86_ops->vm_init(kvm);
}
+int kvm_arch_post_init_vm(struct kvm *kvm)
+{
+ return kvm_mmu_post_init_vm(kvm);
+}
+
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
vcpu_load(vcpu);
@@ -9556,6 +9591,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
}
EXPORT_SYMBOL_GPL(x86_set_memory_region);
+void kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+ kvm_mmu_pre_destroy_vm(kvm);
+}
+
void kvm_arch_destroy_vm(struct kvm *kvm)
{
if (current->mm == kvm->mm) {
OpenPOWER on IntegriCloud