summaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/arm/include/asm/kvm_arm.h3
-rw-r--r--arch/arm/include/asm/kvm_host.h13
-rw-r--r--arch/arm/include/asm/kvm_mmu.h15
-rw-r--r--arch/arm/include/asm/stage2_pgtable.h54
-rw-r--r--arch/arm64/include/asm/cpufeature.h20
-rw-r--r--arch/arm64/include/asm/kvm_arm.h155
-rw-r--r--arch/arm64/include/asm/kvm_asm.h3
-rw-r--r--arch/arm64/include/asm/kvm_host.h18
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h10
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h42
-rw-r--r--arch/arm64/include/asm/ptrace.h3
-rw-r--r--arch/arm64/include/asm/stage2_pgtable-nopmd.h42
-rw-r--r--arch/arm64/include/asm/stage2_pgtable-nopud.h39
-rw-r--r--arch/arm64/include/asm/stage2_pgtable.h236
-rw-r--r--arch/arm64/kvm/guest.c6
-rw-r--r--arch/arm64/kvm/handle_exit.c7
-rw-r--r--arch/arm64/kvm/hyp/Makefile1
-rw-r--r--arch/arm64/kvm/hyp/hyp-entry.S16
-rw-r--r--arch/arm64/kvm/hyp/s2-setup.c90
-rw-r--r--arch/arm64/kvm/hyp/switch.c4
-rw-r--r--arch/arm64/kvm/hyp/sysreg-sr.c19
-rw-r--r--arch/arm64/kvm/hyp/tlb.c4
-rw-r--r--arch/arm64/kvm/reset.c108
-rw-r--r--arch/x86/include/asm/fixmap.h10
-rw-r--r--arch/x86/include/asm/kvm_host.h70
-rw-r--r--arch/x86/include/asm/mem_encrypt.h7
-rw-r--r--arch/x86/include/asm/pgtable_64.h3
-rw-r--r--arch/x86/include/asm/virtext.h2
-rw-r--r--arch/x86/include/asm/vmx.h13
-rw-r--r--arch/x86/include/uapi/asm/kvm.h8
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.h17
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c27
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_rdtgroup.c53
-rw-r--r--arch/x86/kernel/head64.c20
-rw-r--r--arch/x86/kernel/head_64.S16
-rw-r--r--arch/x86/kernel/kvmclock.c52
-rw-r--r--arch/x86/kernel/paravirt.c4
-rw-r--r--arch/x86/kernel/vmlinux.lds.S19
-rw-r--r--arch/x86/kvm/hyperv.c280
-rw-r--r--arch/x86/kvm/hyperv.h4
-rw-r--r--arch/x86/kvm/lapic.c45
-rw-r--r--arch/x86/kvm/lapic.h2
-rw-r--r--arch/x86/kvm/mmu.c393
-rw-r--r--arch/x86/kvm/mmu.h13
-rw-r--r--arch/x86/kvm/mmu_audit.c12
-rw-r--r--arch/x86/kvm/paging_tmpl.h15
-rw-r--r--arch/x86/kvm/svm.c64
-rw-r--r--arch/x86/kvm/trace.h42
-rw-r--r--arch/x86/kvm/vmx.c2281
-rw-r--r--arch/x86/kvm/vmx_shadow_fields.h5
-rw-r--r--arch/x86/kvm/x86.c257
-rw-r--r--arch/x86/kvm/x86.h2
-rw-r--r--arch/x86/mm/init.c4
-rw-r--r--arch/x86/mm/mem_encrypt.c24
-rw-r--r--arch/x86/mm/pgtable.c9
-rw-r--r--arch/x86/xen/mmu_pv.c8
-rw-r--r--arch/x86/xen/pmu.c2
57 files changed, 3246 insertions, 1445 deletions
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index 3ab8b3781bfe..c3f1f9b304b7 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -133,8 +133,7 @@
* space.
*/
#define KVM_PHYS_SHIFT (40)
-#define KVM_PHYS_SIZE (_AC(1, ULL) << KVM_PHYS_SHIFT)
-#define KVM_PHYS_MASK (KVM_PHYS_SIZE - _AC(1, ULL))
+
#define PTRS_PER_S2_PGD (_AC(1, ULL) << (KVM_PHYS_SHIFT - 30))
/* Virtualization Translation Control Register (VTCR) bits */
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 3ad482d2f1eb..5ca5d9af0c26 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -273,7 +273,7 @@ static inline void __cpu_init_stage2(void)
kvm_call_hyp(__init_stage2_translation);
}
-static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
+static inline int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
return 0;
}
@@ -354,4 +354,15 @@ static inline void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) {}
struct kvm *kvm_arch_alloc_vm(void);
void kvm_arch_free_vm(struct kvm *kvm);
+static inline int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
+{
+ /*
+ * On 32bit ARM, VMs get a static 40bit IPA stage2 setup,
+ * so any non-zero value used as type is illegal.
+ */
+ if (type)
+ return -EINVAL;
+ return 0;
+}
+
#endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 265ea9cf7df7..5ad1a54f98dc 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -35,16 +35,12 @@
addr; \
})
-/*
- * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels.
- */
-#define KVM_MMU_CACHE_MIN_PAGES 2
-
#ifndef __ASSEMBLY__
#include <linux/highmem.h>
#include <asm/cacheflush.h>
#include <asm/cputype.h>
+#include <asm/kvm_arm.h>
#include <asm/kvm_hyp.h>
#include <asm/pgalloc.h>
#include <asm/stage2_pgtable.h>
@@ -52,6 +48,13 @@
/* Ensure compatibility with arm64 */
#define VA_BITS 32
+#define kvm_phys_shift(kvm) KVM_PHYS_SHIFT
+#define kvm_phys_size(kvm) (1ULL << kvm_phys_shift(kvm))
+#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - 1ULL)
+#define kvm_vttbr_baddr_mask(kvm) VTTBR_BADDR_MASK
+
+#define stage2_pgd_size(kvm) (PTRS_PER_S2_PGD * sizeof(pgd_t))
+
int create_hyp_mappings(void *from, void *to, pgprot_t prot);
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
void __iomem **kaddr,
@@ -355,6 +358,8 @@ static inline int hyp_map_aux_data(void)
#define kvm_phys_to_vttbr(addr) (addr)
+static inline void kvm_set_ipa_limit(void) {}
+
#endif /* !__ASSEMBLY__ */
#endif /* __ARM_KVM_MMU_H__ */
diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h
index 460d616bb2d6..f6a7ea805232 100644
--- a/arch/arm/include/asm/stage2_pgtable.h
+++ b/arch/arm/include/asm/stage2_pgtable.h
@@ -19,43 +19,53 @@
#ifndef __ARM_S2_PGTABLE_H_
#define __ARM_S2_PGTABLE_H_
-#define stage2_pgd_none(pgd) pgd_none(pgd)
-#define stage2_pgd_clear(pgd) pgd_clear(pgd)
-#define stage2_pgd_present(pgd) pgd_present(pgd)
-#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud)
-#define stage2_pud_offset(pgd, address) pud_offset(pgd, address)
-#define stage2_pud_free(pud) pud_free(NULL, pud)
-
-#define stage2_pud_none(pud) pud_none(pud)
-#define stage2_pud_clear(pud) pud_clear(pud)
-#define stage2_pud_present(pud) pud_present(pud)
-#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd)
-#define stage2_pmd_offset(pud, address) pmd_offset(pud, address)
-#define stage2_pmd_free(pmd) pmd_free(NULL, pmd)
-
-#define stage2_pud_huge(pud) pud_huge(pud)
+/*
+ * kvm_mmu_cache_min_pages() is the number of pages required
+ * to install a stage-2 translation. We pre-allocate the entry
+ * level table at VM creation. Since we have a 3 level page-table,
+ * we need only two pages to add a new mapping.
+ */
+#define kvm_mmu_cache_min_pages(kvm) 2
+
+#define stage2_pgd_none(kvm, pgd) pgd_none(pgd)
+#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd)
+#define stage2_pgd_present(kvm, pgd) pgd_present(pgd)
+#define stage2_pgd_populate(kvm, pgd, pud) pgd_populate(NULL, pgd, pud)
+#define stage2_pud_offset(kvm, pgd, address) pud_offset(pgd, address)
+#define stage2_pud_free(kvm, pud) pud_free(NULL, pud)
+
+#define stage2_pud_none(kvm, pud) pud_none(pud)
+#define stage2_pud_clear(kvm, pud) pud_clear(pud)
+#define stage2_pud_present(kvm, pud) pud_present(pud)
+#define stage2_pud_populate(kvm, pud, pmd) pud_populate(NULL, pud, pmd)
+#define stage2_pmd_offset(kvm, pud, address) pmd_offset(pud, address)
+#define stage2_pmd_free(kvm, pmd) pmd_free(NULL, pmd)
+
+#define stage2_pud_huge(kvm, pud) pud_huge(pud)
/* Open coded p*d_addr_end that can deal with 64bit addresses */
-static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline phys_addr_t
+stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
{
phys_addr_t boundary = (addr + PGDIR_SIZE) & PGDIR_MASK;
return (boundary - 1 < end - 1) ? boundary : end;
}
-#define stage2_pud_addr_end(addr, end) (end)
+#define stage2_pud_addr_end(kvm, addr, end) (end)
-static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline phys_addr_t
+stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
{
phys_addr_t boundary = (addr + PMD_SIZE) & PMD_MASK;
return (boundary - 1 < end - 1) ? boundary : end;
}
-#define stage2_pgd_index(addr) pgd_index(addr)
+#define stage2_pgd_index(kvm, addr) pgd_index(addr)
-#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep)
-#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
-#define stage2_pud_table_empty(pudp) false
+#define stage2_pte_table_empty(kvm, ptep) kvm_page_empty(ptep)
+#define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp)
+#define stage2_pud_table_empty(kvm, pudp) false
#endif /* __ARM_S2_PGTABLE_H_ */
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 1717ba1db35d..072cc1c970c2 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -530,6 +530,26 @@ void arm64_set_ssbd_mitigation(bool state);
static inline void arm64_set_ssbd_mitigation(bool state) {}
#endif
+static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
+{
+ switch (parange) {
+ case 0: return 32;
+ case 1: return 36;
+ case 2: return 40;
+ case 3: return 42;
+ case 4: return 44;
+ case 5: return 48;
+ case 6: return 52;
+ /*
+ * A future PE could use a value unknown to the kernel.
+ * However, by the "D10.1.4 Principles of the ID scheme
+ * for fields in ID registers", ARM DDI 0487C.a, any new
+ * value is guaranteed to be higher than what we know already.
+ * As a safe limit, we return the limit supported by the kernel.
+ */
+ default: return CONFIG_ARM64_PA_BITS;
+ }
+}
#endif /* __ASSEMBLY__ */
#endif
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index aa45df752a16..6e324d1f1231 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -107,6 +107,7 @@
#define VTCR_EL2_RES1 (1 << 31)
#define VTCR_EL2_HD (1 << 22)
#define VTCR_EL2_HA (1 << 21)
+#define VTCR_EL2_PS_SHIFT TCR_EL2_PS_SHIFT
#define VTCR_EL2_PS_MASK TCR_EL2_PS_MASK
#define VTCR_EL2_TG0_MASK TCR_TG0_MASK
#define VTCR_EL2_TG0_4K TCR_TG0_4K
@@ -120,62 +121,149 @@
#define VTCR_EL2_IRGN0_WBWA TCR_IRGN0_WBWA
#define VTCR_EL2_SL0_SHIFT 6
#define VTCR_EL2_SL0_MASK (3 << VTCR_EL2_SL0_SHIFT)
-#define VTCR_EL2_SL0_LVL1 (1 << VTCR_EL2_SL0_SHIFT)
#define VTCR_EL2_T0SZ_MASK 0x3f
-#define VTCR_EL2_T0SZ_40B 24
#define VTCR_EL2_VS_SHIFT 19
#define VTCR_EL2_VS_8BIT (0 << VTCR_EL2_VS_SHIFT)
#define VTCR_EL2_VS_16BIT (1 << VTCR_EL2_VS_SHIFT)
+#define VTCR_EL2_T0SZ(x) TCR_T0SZ(x)
+
/*
* We configure the Stage-2 page tables to always restrict the IPA space to be
* 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are
* not known to exist and will break with this configuration.
*
- * VTCR_EL2.PS is extracted from ID_AA64MMFR0_EL1.PARange at boot time
- * (see hyp-init.S).
+ * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2().
*
* Note that when using 4K pages, we concatenate two first level page tables
* together. With 16K pages, we concatenate 16 first level page tables.
*
- * The magic numbers used for VTTBR_X in this patch can be found in Tables
- * D4-23 and D4-25 in ARM DDI 0487A.b.
*/
-#define VTCR_EL2_T0SZ_IPA VTCR_EL2_T0SZ_40B
#define VTCR_EL2_COMMON_BITS (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \
VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1)
-#ifdef CONFIG_ARM64_64K_PAGES
/*
- * Stage2 translation configuration:
- * 64kB pages (TG0 = 1)
- * 2 level page tables (SL = 1)
+ * VTCR_EL2:SL0 indicates the entry level for Stage2 translation.
+ * Interestingly, it depends on the page size.
+ * See D.10.2.121, VTCR_EL2, in ARM DDI 0487C.a
+ *
+ * -----------------------------------------
+ * | Entry level | 4K | 16K/64K |
+ * ------------------------------------------
+ * | Level: 0 | 2 | - |
+ * ------------------------------------------
+ * | Level: 1 | 1 | 2 |
+ * ------------------------------------------
+ * | Level: 2 | 0 | 1 |
+ * ------------------------------------------
+ * | Level: 3 | - | 0 |
+ * ------------------------------------------
+ *
+ * The table roughly translates to :
+ *
+ * SL0(PAGE_SIZE, Entry_level) = TGRAN_SL0_BASE - Entry_Level
+ *
+ * Where TGRAN_SL0_BASE is a magic number depending on the page size:
+ * TGRAN_SL0_BASE(4K) = 2
+ * TGRAN_SL0_BASE(16K) = 3
+ * TGRAN_SL0_BASE(64K) = 3
+ * provided we take care of ruling out the unsupported cases and
+ * Entry_Level = 4 - Number_of_levels.
+ *
*/
-#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1)
-#define VTTBR_X_TGRAN_MAGIC 38
+#ifdef CONFIG_ARM64_64K_PAGES
+
+#define VTCR_EL2_TGRAN VTCR_EL2_TG0_64K
+#define VTCR_EL2_TGRAN_SL0_BASE 3UL
+
#elif defined(CONFIG_ARM64_16K_PAGES)
-/*
- * Stage2 translation configuration:
- * 16kB pages (TG0 = 2)
- * 2 level page tables (SL = 1)
- */
-#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1)
-#define VTTBR_X_TGRAN_MAGIC 42
+
+#define VTCR_EL2_TGRAN VTCR_EL2_TG0_16K
+#define VTCR_EL2_TGRAN_SL0_BASE 3UL
+
#else /* 4K */
-/*
- * Stage2 translation configuration:
- * 4kB pages (TG0 = 0)
- * 3 level page tables (SL = 1)
- */
-#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1)
-#define VTTBR_X_TGRAN_MAGIC 37
+
+#define VTCR_EL2_TGRAN VTCR_EL2_TG0_4K
+#define VTCR_EL2_TGRAN_SL0_BASE 2UL
+
#endif
-#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS)
-#define VTTBR_X (VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA)
+#define VTCR_EL2_LVLS_TO_SL0(levels) \
+ ((VTCR_EL2_TGRAN_SL0_BASE - (4 - (levels))) << VTCR_EL2_SL0_SHIFT)
+#define VTCR_EL2_SL0_TO_LVLS(sl0) \
+ ((sl0) + 4 - VTCR_EL2_TGRAN_SL0_BASE)
+#define VTCR_EL2_LVLS(vtcr) \
+ VTCR_EL2_SL0_TO_LVLS(((vtcr) & VTCR_EL2_SL0_MASK) >> VTCR_EL2_SL0_SHIFT)
+
+#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN)
+#define VTCR_EL2_IPA(vtcr) (64 - ((vtcr) & VTCR_EL2_T0SZ_MASK))
+
+/*
+ * ARM VMSAv8-64 defines an algorithm for finding the translation table
+ * descriptors in section D4.2.8 in ARM DDI 0487C.a.
+ *
+ * The algorithm defines the expectations on the translation table
+ * addresses for each level, based on PAGE_SIZE, entry level
+ * and the translation table size (T0SZ). The variable "x" in the
+ * algorithm determines the alignment of a table base address at a given
+ * level and thus determines the alignment of VTTBR:BADDR for stage2
+ * page table entry level.
+ * Since the number of bits resolved at the entry level could vary
+ * depending on the T0SZ, the value of "x" is defined based on a
+ * Magic constant for a given PAGE_SIZE and Entry Level. The
+ * intermediate levels must be always aligned to the PAGE_SIZE (i.e,
+ * x = PAGE_SHIFT).
+ *
+ * The value of "x" for entry level is calculated as :
+ * x = Magic_N - T0SZ
+ *
+ * where Magic_N is an integer depending on the page size and the entry
+ * level of the page table as below:
+ *
+ * --------------------------------------------
+ * | Entry level | 4K 16K 64K |
+ * --------------------------------------------
+ * | Level: 0 (4 levels) | 28 | - | - |
+ * --------------------------------------------
+ * | Level: 1 (3 levels) | 37 | 31 | 25 |
+ * --------------------------------------------
+ * | Level: 2 (2 levels) | 46 | 42 | 38 |
+ * --------------------------------------------
+ * | Level: 3 (1 level) | - | 53 | 51 |
+ * --------------------------------------------
+ *
+ * We have a magic formula for the Magic_N below:
+ *
+ * Magic_N(PAGE_SIZE, Level) = 64 - ((PAGE_SHIFT - 3) * Number_of_levels)
+ *
+ * where Number_of_levels = (4 - Level). We are only interested in the
+ * value for Entry_Level for the stage2 page table.
+ *
+ * So, given that T0SZ = (64 - IPA_SHIFT), we can compute 'x' as follows:
+ *
+ * x = (64 - ((PAGE_SHIFT - 3) * Number_of_levels)) - (64 - IPA_SHIFT)
+ * = IPA_SHIFT - ((PAGE_SHIFT - 3) * Number of levels)
+ *
+ * Here is one way to explain the Magic Formula:
+ *
+ * x = log2(Size_of_Entry_Level_Table)
+ *
+ * Since, we can resolve (PAGE_SHIFT - 3) bits at each level, and another
+ * PAGE_SHIFT bits in the PTE, we have :
+ *
+ * Bits_Entry_level = IPA_SHIFT - ((PAGE_SHIFT - 3) * (n - 1) + PAGE_SHIFT)
+ * = IPA_SHIFT - (PAGE_SHIFT - 3) * n - 3
+ * where n = number of levels, and since each pointer is 8bytes, we have:
+ *
+ * x = Bits_Entry_Level + 3
+ * = IPA_SHIFT - (PAGE_SHIFT - 3) * n
+ *
+ * The only constraint here is that, we have to find the number of page table
+ * levels for a given IPA size (which we do, see stage2_pt_levels())
+ */
+#define ARM64_VTTBR_X(ipa, levels) ((ipa) - ((levels) * (PAGE_SHIFT - 3)))
-#define VTTBR_BADDR_MASK (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_X)
#define VTTBR_VMID_SHIFT (UL(48))
#define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT)
@@ -223,6 +311,13 @@
/* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
#define HPFAR_MASK (~UL(0xf))
+/*
+ * We have
+ * PAR [PA_Shift - 1 : 12] = PA [PA_Shift - 1 : 12]
+ * HPFAR [PA_Shift - 9 : 4] = FIPA [PA_Shift - 1 : 12]
+ */
+#define PAR_TO_HPFAR(par) \
+ (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8)
#define kvm_arm_exception_type \
{0, "IRQ" }, \
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 102b5a5c47b6..aea01a09eb94 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -30,6 +30,7 @@
#define ARM_EXCEPTION_IRQ 0
#define ARM_EXCEPTION_EL1_SERROR 1
#define ARM_EXCEPTION_TRAP 2
+#define ARM_EXCEPTION_IL 3
/* The hyp-stub will return this for any kvm_call_hyp() call */
#define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR
@@ -72,8 +73,6 @@ extern void __vgic_v3_init_lrs(void);
extern u32 __kvm_get_mdcr_el2(void);
-extern u32 __init_stage2_translation(void);
-
/* Home-grown __this_cpu_{ptr,read} variants that always work at HYP */
#define __hyp_this_cpu_ptr(sym) \
({ \
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 3d6d7336f871..f84052f306af 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -53,7 +53,7 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
int __attribute_const__ kvm_target_cpu(void);
int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
-int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext);
+int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext);
void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
struct kvm_arch {
@@ -61,11 +61,13 @@ struct kvm_arch {
u64 vmid_gen;
u32 vmid;
- /* 1-level 2nd stage table, protected by kvm->mmu_lock */
+ /* stage2 entry level table */
pgd_t *pgd;
/* VTTBR value associated with above pgd and vmid */
u64 vttbr;
+ /* VTCR_EL2 value for this VM */
+ u64 vtcr;
/* The last vcpu id that ran on each physical CPU */
int __percpu *last_vcpu_ran;
@@ -440,13 +442,7 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr);
-static inline void __cpu_init_stage2(void)
-{
- u32 parange = kvm_call_hyp(__init_stage2_translation);
-
- WARN_ONCE(parange < 40,
- "PARange is %d bits, unsupported configuration!", parange);
-}
+static inline void __cpu_init_stage2(void) {}
/* Guest/host FPSIMD coordination helpers */
int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
@@ -509,8 +505,12 @@ static inline int kvm_arm_have_ssbd(void)
void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu);
void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu);
+void kvm_set_ipa_limit(void);
+
#define __KVM_HAVE_ARCH_VM_ALLOC
struct kvm *kvm_arch_alloc_vm(void);
void kvm_arch_free_vm(struct kvm *kvm);
+int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
+
#endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 384c34397619..23aca66767f9 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -155,5 +155,15 @@ void deactivate_traps_vhe_put(void);
u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
void __noreturn __hyp_do_panic(unsigned long, ...);
+/*
+ * Must be called from hyp code running at EL2 with an updated VTTBR
+ * and interrupts disabled.
+ */
+static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm)
+{
+ write_sysreg(kvm->arch.vtcr, vtcr_el2);
+ write_sysreg(kvm->arch.vttbr, vttbr_el2);
+}
+
#endif /* __ARM64_KVM_HYP_H__ */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index d6fff7de5539..77b1af9e64db 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -141,8 +141,16 @@ static inline unsigned long __kern_hyp_va(unsigned long v)
* We currently only support a 40bit IPA.
*/
#define KVM_PHYS_SHIFT (40)
-#define KVM_PHYS_SIZE (1UL << KVM_PHYS_SHIFT)
-#define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1UL)
+
+#define kvm_phys_shift(kvm) VTCR_EL2_IPA(kvm->arch.vtcr)
+#define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm))
+#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL))
+
+static inline bool kvm_page_empty(void *ptr)
+{
+ struct page *ptr_page = virt_to_page(ptr);
+ return page_count(ptr_page) == 1;
+}
#include <asm/stage2_pgtable.h>
@@ -238,12 +246,6 @@ static inline bool kvm_s2pmd_exec(pmd_t *pmdp)
return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN);
}
-static inline bool kvm_page_empty(void *ptr)
-{
- struct page *ptr_page = virt_to_page(ptr);
- return page_count(ptr_page) == 1;
-}
-
#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
#ifdef __PAGETABLE_PMD_FOLDED
@@ -517,5 +519,29 @@ static inline int hyp_map_aux_data(void)
#define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr)
+/*
+ * Get the magic number 'x' for VTTBR:BADDR of this KVM instance.
+ * With v8.2 LVA extensions, 'x' should be a minimum of 6 with
+ * 52bit IPS.
+ */
+static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels)
+{
+ int x = ARM64_VTTBR_X(ipa_shift, levels);
+
+ return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x;
+}
+
+static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels)
+{
+ unsigned int x = arm64_vttbr_x(ipa_shift, levels);
+
+ return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x);
+}
+
+static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm)
+{
+ return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm));
+}
+
#endif /* __ASSEMBLY__ */
#endif /* __ARM64_KVM_MMU_H__ */
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index 177b851ca6d9..ff35ac1258eb 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -25,6 +25,9 @@
#define CurrentEL_EL1 (1 << 2)
#define CurrentEL_EL2 (2 << 2)
+/* Additional SPSR bits not exposed in the UABI */
+#define PSR_IL_BIT (1 << 20)
+
/* AArch32-specific ptrace requests */
#define COMPAT_PTRACE_GETREGS 12
#define COMPAT_PTRACE_SETREGS 13
diff --git a/arch/arm64/include/asm/stage2_pgtable-nopmd.h b/arch/arm64/include/asm/stage2_pgtable-nopmd.h
deleted file mode 100644
index 2656a0fd05a6..000000000000
--- a/arch/arm64/include/asm/stage2_pgtable-nopmd.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (C) 2016 - ARM Ltd
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __ARM64_S2_PGTABLE_NOPMD_H_
-#define __ARM64_S2_PGTABLE_NOPMD_H_
-
-#include <asm/stage2_pgtable-nopud.h>
-
-#define __S2_PGTABLE_PMD_FOLDED
-
-#define S2_PMD_SHIFT S2_PUD_SHIFT
-#define S2_PTRS_PER_PMD 1
-#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT)
-#define S2_PMD_MASK (~(S2_PMD_SIZE-1))
-
-#define stage2_pud_none(pud) (0)
-#define stage2_pud_present(pud) (1)
-#define stage2_pud_clear(pud) do { } while (0)
-#define stage2_pud_populate(pud, pmd) do { } while (0)
-#define stage2_pmd_offset(pud, address) ((pmd_t *)(pud))
-
-#define stage2_pmd_free(pmd) do { } while (0)
-
-#define stage2_pmd_addr_end(addr, end) (end)
-
-#define stage2_pud_huge(pud) (0)
-#define stage2_pmd_table_empty(pmdp) (0)
-
-#endif
diff --git a/arch/arm64/include/asm/stage2_pgtable-nopud.h b/arch/arm64/include/asm/stage2_pgtable-nopud.h
deleted file mode 100644
index 5ee87b54ebf3..000000000000
--- a/arch/arm64/include/asm/stage2_pgtable-nopud.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (C) 2016 - ARM Ltd
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __ARM64_S2_PGTABLE_NOPUD_H_
-#define __ARM64_S2_PGTABLE_NOPUD_H_
-
-#define __S2_PGTABLE_PUD_FOLDED
-
-#define S2_PUD_SHIFT S2_PGDIR_SHIFT
-#define S2_PTRS_PER_PUD 1
-#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT)
-#define S2_PUD_MASK (~(S2_PUD_SIZE-1))
-
-#define stage2_pgd_none(pgd) (0)
-#define stage2_pgd_present(pgd) (1)
-#define stage2_pgd_clear(pgd) do { } while (0)
-#define stage2_pgd_populate(pgd, pud) do { } while (0)
-
-#define stage2_pud_offset(pgd, address) ((pud_t *)(pgd))
-
-#define stage2_pud_free(x) do { } while (0)
-
-#define stage2_pud_addr_end(addr, end) (end)
-#define stage2_pud_table_empty(pmdp) (0)
-
-#endif
diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h
index 8b68099348e5..d352f6df8d2c 100644
--- a/arch/arm64/include/asm/stage2_pgtable.h
+++ b/arch/arm64/include/asm/stage2_pgtable.h
@@ -19,9 +19,17 @@
#ifndef __ARM64_S2_PGTABLE_H_
#define __ARM64_S2_PGTABLE_H_
+#include <linux/hugetlb.h>
#include <asm/pgtable.h>
/*
+ * PGDIR_SHIFT determines the size a top-level page table entry can map
+ * and depends on the number of levels in the page table. Compute the
+ * PGDIR_SHIFT for a given number of levels.
+ */
+#define pt_levels_pgdir_shift(lvls) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls))
+
+/*
* The hardware supports concatenation of up to 16 tables at stage2 entry level
* and we use the feature whenever possible.
*
@@ -29,112 +37,208 @@
* On arm64, the smallest PAGE_SIZE supported is 4k, which means
* (PAGE_SHIFT - 3) > 4 holds for all page sizes.
* This implies, the total number of page table levels at stage2 expected
- * by the hardware is actually the number of levels required for (KVM_PHYS_SHIFT - 4)
+ * by the hardware is actually the number of levels required for (IPA_SHIFT - 4)
* in normal translations(e.g, stage1), since we cannot have another level in
- * the range (KVM_PHYS_SHIFT, KVM_PHYS_SHIFT - 4).
+ * the range (IPA_SHIFT, IPA_SHIFT - 4).
*/
-#define STAGE2_PGTABLE_LEVELS ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4)
+#define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4)
+#define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr)
-/*
- * With all the supported VA_BITs and 40bit guest IPA, the following condition
- * is always true:
- *
- * STAGE2_PGTABLE_LEVELS <= CONFIG_PGTABLE_LEVELS
- *
- * We base our stage-2 page table walker helpers on this assumption and
- * fall back to using the host version of the helper wherever possible.
- * i.e, if a particular level is not folded (e.g, PUD) at stage2, we fall back
- * to using the host version, since it is guaranteed it is not folded at host.
- *
- * If the condition breaks in the future, we can rearrange the host level
- * definitions and reuse them for stage2. Till then...
- */
-#if STAGE2_PGTABLE_LEVELS > CONFIG_PGTABLE_LEVELS
-#error "Unsupported combination of guest IPA and host VA_BITS."
-#endif
-
-/* S2_PGDIR_SHIFT is the size mapped by top-level stage2 entry */
-#define S2_PGDIR_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS)
-#define S2_PGDIR_SIZE (_AC(1, UL) << S2_PGDIR_SHIFT)
-#define S2_PGDIR_MASK (~(S2_PGDIR_SIZE - 1))
+/* stage2_pgdir_shift() is the size mapped by top-level stage2 entry for the VM */
+#define stage2_pgdir_shift(kvm) pt_levels_pgdir_shift(kvm_stage2_levels(kvm))
+#define stage2_pgdir_size(kvm) (1ULL << stage2_pgdir_shift(kvm))
+#define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1)
/*
* The number of PTRS across all concatenated stage2 tables given by the
* number of bits resolved at the initial level.
+ * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA),
+ * in which case, stage2_pgd_ptrs will have one entry.
*/
-#define PTRS_PER_S2_PGD (1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT))
+#define pgd_ptrs_shift(ipa, pgdir_shift) \
+ ((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0)
+#define __s2_pgd_ptrs(ipa, lvls) \
+ (1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls))))
+#define __s2_pgd_size(ipa, lvls) (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t))
+
+#define stage2_pgd_ptrs(kvm) __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
+#define stage2_pgd_size(kvm) __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
/*
- * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation
- * levels in addition to the PGD.
+ * kvm_mmmu_cache_min_pages() is the number of pages required to install
+ * a stage-2 translation. We pre-allocate the entry level page table at
+ * the VM creation.
*/
-#define KVM_MMU_CACHE_MIN_PAGES (STAGE2_PGTABLE_LEVELS - 1)
+#define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1)
-
-#if STAGE2_PGTABLE_LEVELS > 3
+/* Stage2 PUD definitions when the level is present */
+static inline bool kvm_stage2_has_pud(struct kvm *kvm)
+{
+ return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3);
+}
#define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
-#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT)
+#define S2_PUD_SIZE (1UL << S2_PUD_SHIFT)
#define S2_PUD_MASK (~(S2_PUD_SIZE - 1))
-#define stage2_pgd_none(pgd) pgd_none(pgd)
-#define stage2_pgd_clear(pgd) pgd_clear(pgd)
-#define stage2_pgd_present(pgd) pgd_present(pgd)
-#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud)
-#define stage2_pud_offset(pgd, address) pud_offset(pgd, address)
-#define stage2_pud_free(pud) pud_free(NULL, pud)
+static inline bool stage2_pgd_none(struct kvm *kvm, pgd_t pgd)
+{
+ if (kvm_stage2_has_pud(kvm))
+ return pgd_none(pgd);
+ else
+ return 0;
+}
-#define stage2_pud_table_empty(pudp) kvm_page_empty(pudp)
+static inline void stage2_pgd_clear(struct kvm *kvm, pgd_t *pgdp)
+{
+ if (kvm_stage2_has_pud(kvm))
+ pgd_clear(pgdp);
+}
-static inline phys_addr_t stage2_pud_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline bool stage2_pgd_present(struct kvm *kvm, pgd_t pgd)
{
- phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
+ if (kvm_stage2_has_pud(kvm))
+ return pgd_present(pgd);
+ else
+ return 1;
+}
- return (boundary - 1 < end - 1) ? boundary : end;
+static inline void stage2_pgd_populate(struct kvm *kvm, pgd_t *pgd, pud_t *pud)
+{
+ if (kvm_stage2_has_pud(kvm))
+ pgd_populate(NULL, pgd, pud);
+}
+
+static inline pud_t *stage2_pud_offset(struct kvm *kvm,
+ pgd_t *pgd, unsigned long address)
+{
+ if (kvm_stage2_has_pud(kvm))
+ return pud_offset(pgd, address);
+ else
+ return (pud_t *)pgd;
}
-#endif /* STAGE2_PGTABLE_LEVELS > 3 */
+static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud)
+{
+ if (kvm_stage2_has_pud(kvm))
+ pud_free(NULL, pud);
+}
+static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp)
+{
+ if (kvm_stage2_has_pud(kvm))
+ return kvm_page_empty(pudp);
+ else
+ return false;
+}
-#if STAGE2_PGTABLE_LEVELS > 2
+static inline phys_addr_t
+stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+{
+ if (kvm_stage2_has_pud(kvm)) {
+ phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
+
+ return (boundary - 1 < end - 1) ? boundary : end;
+ } else {
+ return end;
+ }
+}
+
+/* Stage2 PMD definitions when the level is present */
+static inline bool kvm_stage2_has_pmd(struct kvm *kvm)
+{
+ return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2);
+}
#define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2)
-#define S2_PMD_SIZE (_AC(1, UL) << S2_PMD_SHIFT)
+#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT)
#define S2_PMD_MASK (~(S2_PMD_SIZE - 1))
-#define stage2_pud_none(pud) pud_none(pud)
-#define stage2_pud_clear(pud) pud_clear(pud)
-#define stage2_pud_present(pud) pud_present(pud)
-#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd)
-#define stage2_pmd_offset(pud, address) pmd_offset(pud, address)
-#define stage2_pmd_free(pmd) pmd_free(NULL, pmd)
+static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud)
+{
+ if (kvm_stage2_has_pmd(kvm))
+ return pud_none(pud);
+ else
+ return 0;
+}
+
+static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud)
+{
+ if (kvm_stage2_has_pmd(kvm))
+ pud_clear(pud);
+}
-#define stage2_pud_huge(pud) pud_huge(pud)
-#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
+static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud)
+{
+ if (kvm_stage2_has_pmd(kvm))
+ return pud_present(pud);
+ else
+ return 1;
+}
-static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd)
{
- phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
+ if (kvm_stage2_has_pmd(kvm))
+ pud_populate(NULL, pud, pmd);
+}
- return (boundary - 1 < end - 1) ? boundary : end;
+static inline pmd_t *stage2_pmd_offset(struct kvm *kvm,
+ pud_t *pud, unsigned long address)
+{
+ if (kvm_stage2_has_pmd(kvm))
+ return pmd_offset(pud, address);
+ else
+ return (pmd_t *)pud;
}
-#endif /* STAGE2_PGTABLE_LEVELS > 2 */
+static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd)
+{
+ if (kvm_stage2_has_pmd(kvm))
+ pmd_free(NULL, pmd);
+}
+
+static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud)
+{
+ if (kvm_stage2_has_pmd(kvm))
+ return pud_huge(pud);
+ else
+ return 0;
+}
+
+static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp)
+{
+ if (kvm_stage2_has_pmd(kvm))
+ return kvm_page_empty(pmdp);
+ else
+ return 0;
+}
-#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep)
+static inline phys_addr_t
+stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+{
+ if (kvm_stage2_has_pmd(kvm)) {
+ phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
-#if STAGE2_PGTABLE_LEVELS == 2
-#include <asm/stage2_pgtable-nopmd.h>
-#elif STAGE2_PGTABLE_LEVELS == 3
-#include <asm/stage2_pgtable-nopud.h>
-#endif
+ return (boundary - 1 < end - 1) ? boundary : end;
+ } else {
+ return end;
+ }
+}
+static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep)
+{
+ return kvm_page_empty(ptep);
+}
-#define stage2_pgd_index(addr) (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1))
+static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr)
+{
+ return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1));
+}
-static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline phys_addr_t
+stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
{
- phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK;
+ phys_addr_t boundary = (addr + stage2_pgdir_size(kvm)) & stage2_pgdir_mask(kvm);
return (boundary - 1 < end - 1) ? boundary : end;
}
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 07256b08226c..a74f84d09412 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -338,15 +338,15 @@ int __attribute_const__ kvm_target_cpu(void)
return KVM_ARM_TARGET_CORTEX_A53;
case ARM_CPU_PART_CORTEX_A57:
return KVM_ARM_TARGET_CORTEX_A57;
- };
+ }
break;
case ARM_CPU_IMP_APM:
switch (part_number) {
case APM_CPU_PART_POTENZA:
return KVM_ARM_TARGET_XGENE_POTENZA;
- };
+ }
break;
- };
+ }
/* Return a default generic target */
return KVM_ARM_TARGET_GENERIC_V8;
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index e5e741bfffe1..35a81bebd02b 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -284,6 +284,13 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
*/
run->exit_reason = KVM_EXIT_FAIL_ENTRY;
return 0;
+ case ARM_EXCEPTION_IL:
+ /*
+ * We attempted an illegal exception return. Guest state must
+ * have been corrupted somehow. Give up.
+ */
+ run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ return -EINVAL;
default:
kvm_pr_unimpl("Unsupported exception type: %d",
exception_index);
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index 2fabc2dc1966..82d1904328ad 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -19,7 +19,6 @@ obj-$(CONFIG_KVM_ARM_HOST) += switch.o
obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o
obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o
-obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o
# KVM code is run at a different exception code with a different map, so
# compiler instrumentation that inserts callbacks or checks into the code may
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 24b4fbafe3e4..b1f14f736962 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -162,6 +162,20 @@ el1_error:
mov x0, #ARM_EXCEPTION_EL1_SERROR
b __guest_exit
+el2_sync:
+ /* Check for illegal exception return, otherwise panic */
+ mrs x0, spsr_el2
+
+ /* if this was something else, then panic! */
+ tst x0, #PSR_IL_BIT
+ b.eq __hyp_panic
+
+ /* Let's attempt a recovery from the illegal exception return */
+ get_vcpu_ptr x1, x0
+ mov x0, #ARM_EXCEPTION_IL
+ b __guest_exit
+
+
el2_error:
ldp x0, x1, [sp], #16
@@ -240,7 +254,7 @@ ENTRY(__kvm_hyp_vector)
invalid_vect el2t_fiq_invalid // FIQ EL2t
invalid_vect el2t_error_invalid // Error EL2t
- invalid_vect el2h_sync_invalid // Synchronous EL2h
+ valid_vect el2_sync // Synchronous EL2h
invalid_vect el2h_irq_invalid // IRQ EL2h
invalid_vect el2h_fiq_invalid // FIQ EL2h
valid_vect el2_error // Error EL2h
diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c
deleted file mode 100644
index 603e1ee83e89..000000000000
--- a/arch/arm64/kvm/hyp/s2-setup.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (C) 2016 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/types.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_hyp.h>
-
-u32 __hyp_text __init_stage2_translation(void)
-{
- u64 val = VTCR_EL2_FLAGS;
- u64 parange;
- u64 tmp;
-
- /*
- * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS
- * bits in VTCR_EL2. Amusingly, the PARange is 4 bits, while
- * PS is only 3. Fortunately, bit 19 is RES0 in VTCR_EL2...
- */
- parange = read_sysreg(id_aa64mmfr0_el1) & 7;
- if (parange > ID_AA64MMFR0_PARANGE_MAX)
- parange = ID_AA64MMFR0_PARANGE_MAX;
- val |= parange << 16;
-
- /* Compute the actual PARange... */
- switch (parange) {
- case 0:
- parange = 32;
- break;
- case 1:
- parange = 36;
- break;
- case 2:
- parange = 40;
- break;
- case 3:
- parange = 42;
- break;
- case 4:
- parange = 44;
- break;
- case 5:
- default:
- parange = 48;
- break;
- }
-
- /*
- * ... and clamp it to 40 bits, unless we have some braindead
- * HW that implements less than that. In all cases, we'll
- * return that value for the rest of the kernel to decide what
- * to do.
- */
- val |= 64 - (parange > 40 ? 40 : parange);
-
- /*
- * Check the availability of Hardware Access Flag / Dirty Bit
- * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2.
- */
- tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf;
- if (tmp)
- val |= VTCR_EL2_HA;
-
- /*
- * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS
- * bit in VTCR_EL2.
- */
- tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_VMIDBITS_SHIFT) & 0xf;
- val |= (tmp == ID_AA64MMFR1_VMIDBITS_16) ?
- VTCR_EL2_VS_16BIT :
- VTCR_EL2_VS_8BIT;
-
- write_sysreg(val, vtcr_el2);
-
- return parange;
-}
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index ca46153d7915..7cc175c88a37 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -198,7 +198,7 @@ void deactivate_traps_vhe_put(void)
static void __hyp_text __activate_vm(struct kvm *kvm)
{
- write_sysreg(kvm->arch.vttbr, vttbr_el2);
+ __load_guest_stage2(kvm);
}
static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
@@ -263,7 +263,7 @@ static bool __hyp_text __translate_far_to_hpfar(u64 far, u64 *hpfar)
return false; /* Translation failed, back to guest */
/* Convert PAR to HPFAR format */
- *hpfar = ((tmp >> 12) & ((1UL << 36) - 1)) << 4;
+ *hpfar = PAR_TO_HPFAR(tmp);
return true;
}
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
index 9ce223944983..8dc285318204 100644
--- a/arch/arm64/kvm/hyp/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -152,8 +152,25 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
static void __hyp_text
__sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt)
{
+ u64 pstate = ctxt->gp_regs.regs.pstate;
+ u64 mode = pstate & PSR_AA32_MODE_MASK;
+
+ /*
+ * Safety check to ensure we're setting the CPU up to enter the guest
+ * in a less privileged mode.
+ *
+ * If we are attempting a return to EL2 or higher in AArch64 state,
+ * program SPSR_EL2 with M=EL2h and the IL bit set which ensures that
+ * we'll take an illegal exception state exception immediately after
+ * the ERET to the guest. Attempts to return to AArch32 Hyp will
+ * result in an illegal exception return because EL2's execution state
+ * is determined by SCR_EL3.RW.
+ */
+ if (!(mode & PSR_MODE32_BIT) && mode >= PSR_MODE_EL2t)
+ pstate = PSR_MODE_EL2h | PSR_IL_BIT;
+
write_sysreg_el2(ctxt->gp_regs.regs.pc, elr);
- write_sysreg_el2(ctxt->gp_regs.regs.pstate, spsr);
+ write_sysreg_el2(pstate, spsr);
if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN))
write_sysreg_s(ctxt->sys_regs[DISR_EL1], SYS_VDISR_EL2);
diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c
index 131c7772703c..4dbd9c69a96d 100644
--- a/arch/arm64/kvm/hyp/tlb.c
+++ b/arch/arm64/kvm/hyp/tlb.c
@@ -30,7 +30,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm)
* bits. Changing E2H is impossible (goodbye TTBR1_EL2), so
* let's flip TGE before executing the TLB operation.
*/
- write_sysreg(kvm->arch.vttbr, vttbr_el2);
+ __load_guest_stage2(kvm);
val = read_sysreg(hcr_el2);
val &= ~HCR_TGE;
write_sysreg(val, hcr_el2);
@@ -39,7 +39,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm)
static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm)
{
- write_sysreg(kvm->arch.vttbr, vttbr_el2);
+ __load_guest_stage2(kvm);
isb();
}
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index e37c78bbe1ca..b72a3dd56204 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -26,6 +26,7 @@
#include <kvm/arm_arch_timer.h>
+#include <asm/cpufeature.h>
#include <asm/cputype.h>
#include <asm/ptrace.h>
#include <asm/kvm_arm.h>
@@ -33,6 +34,9 @@
#include <asm/kvm_coproc.h>
#include <asm/kvm_mmu.h>
+/* Maximum phys_shift supported for any VM on this host */
+static u32 kvm_ipa_limit;
+
/*
* ARMv8 Reset Values
*/
@@ -55,12 +59,12 @@ static bool cpu_has_32bit_el1(void)
}
/**
- * kvm_arch_dev_ioctl_check_extension
+ * kvm_arch_vm_ioctl_check_extension
*
* We currently assume that the number of HW registers is uniform
* across all CPUs (see cpuinfo_sanity_check).
*/
-int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
+int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r;
@@ -82,9 +86,11 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
break;
case KVM_CAP_SET_GUEST_DEBUG:
case KVM_CAP_VCPU_ATTRIBUTES:
- case KVM_CAP_VCPU_EVENTS:
r = 1;
break;
+ case KVM_CAP_ARM_VM_IPA_SIZE:
+ r = kvm_ipa_limit;
+ break;
default:
r = 0;
}
@@ -133,3 +139,99 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
/* Reset timer */
return kvm_timer_vcpu_reset(vcpu);
}
+
+void kvm_set_ipa_limit(void)
+{
+ unsigned int ipa_max, pa_max, va_max, parange;
+
+ parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 0x7;
+ pa_max = id_aa64mmfr0_parange_to_phys_shift(parange);
+
+ /* Clamp the IPA limit to the PA size supported by the kernel */
+ ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max;
+ /*
+ * Since our stage2 table is dependent on the stage1 page table code,
+ * we must always honor the following condition:
+ *
+ * Number of levels in Stage1 >= Number of levels in Stage2.
+ *
+ * So clamp the ipa limit further down to limit the number of levels.
+ * Since we can concatenate upto 16 tables at entry level, we could
+ * go upto 4bits above the maximum VA addressible with the current
+ * number of levels.
+ */
+ va_max = PGDIR_SHIFT + PAGE_SHIFT - 3;
+ va_max += 4;
+
+ if (va_max < ipa_max)
+ ipa_max = va_max;
+
+ /*
+ * If the final limit is lower than the real physical address
+ * limit of the CPUs, report the reason.
+ */
+ if (ipa_max < pa_max)
+ pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n",
+ (va_max < pa_max) ? "Virtual" : "Physical");
+
+ WARN(ipa_max < KVM_PHYS_SHIFT,
+ "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max);
+ kvm_ipa_limit = ipa_max;
+ kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit);
+}
+
+/*
+ * Configure the VTCR_EL2 for this VM. The VTCR value is common
+ * across all the physical CPUs on the system. We use system wide
+ * sanitised values to fill in different fields, except for Hardware
+ * Management of Access Flags. HA Flag is set unconditionally on
+ * all CPUs, as it is safe to run with or without the feature and
+ * the bit is RES0 on CPUs that don't support it.
+ */
+int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
+{
+ u64 vtcr = VTCR_EL2_FLAGS;
+ u32 parange, phys_shift;
+ u8 lvls;
+
+ if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
+ return -EINVAL;
+
+ phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
+ if (phys_shift) {
+ if (phys_shift > kvm_ipa_limit ||
+ phys_shift < 32)
+ return -EINVAL;
+ } else {
+ phys_shift = KVM_PHYS_SHIFT;
+ }
+
+ parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 7;
+ if (parange > ID_AA64MMFR0_PARANGE_MAX)
+ parange = ID_AA64MMFR0_PARANGE_MAX;
+ vtcr |= parange << VTCR_EL2_PS_SHIFT;
+
+ vtcr |= VTCR_EL2_T0SZ(phys_shift);
+ /*
+ * Use a minimum 2 level page table to prevent splitting
+ * host PMD huge pages at stage2.
+ */
+ lvls = stage2_pgtable_levels(phys_shift);
+ if (lvls < 2)
+ lvls = 2;
+ vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
+
+ /*
+ * Enable the Hardware Access Flag management, unconditionally
+ * on all CPUs. The features is RES0 on CPUs without the support
+ * and must be ignored by the CPUs.
+ */
+ vtcr |= VTCR_EL2_HA;
+
+ /* Set the vmid bits */
+ vtcr |= (kvm_get_vmid_bits() == 16) ?
+ VTCR_EL2_VS_16BIT :
+ VTCR_EL2_VS_8BIT;
+ kvm->arch.vtcr = vtcr;
+ return 0;
+}
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index e203169931c7..6390bd8c141b 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -14,6 +14,16 @@
#ifndef _ASM_X86_FIXMAP_H
#define _ASM_X86_FIXMAP_H
+/*
+ * Exposed to assembly code for setting up initial page tables. Cannot be
+ * calculated in assembly code (fixmap entries are an enum), but is sanity
+ * checked in the actual fixmap C code to make sure that the fixmap is
+ * covered fully.
+ */
+#define FIXMAP_PMD_NUM 2
+/* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */
+#define FIXMAP_PMD_TOP 507
+
#ifndef __ASSEMBLY__
#include <linux/kernel.h>
#include <asm/acpi.h>
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 09b2e3e2cf1b..55e51ff7e421 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -102,7 +102,15 @@
#define UNMAPPED_GVA (~(gpa_t)0)
/* KVM Hugepage definitions for x86 */
-#define KVM_NR_PAGE_SIZES 3
+enum {
+ PT_PAGE_TABLE_LEVEL = 1,
+ PT_DIRECTORY_LEVEL = 2,
+ PT_PDPE_LEVEL = 3,
+ /* set max level to the biggest one */
+ PT_MAX_HUGEPAGE_LEVEL = PT_PDPE_LEVEL,
+};
+#define KVM_NR_PAGE_SIZES (PT_MAX_HUGEPAGE_LEVEL - \
+ PT_PAGE_TABLE_LEVEL + 1)
#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
@@ -177,6 +185,7 @@ enum {
#define DR6_BD (1 << 13)
#define DR6_BS (1 << 14)
+#define DR6_BT (1 << 15)
#define DR6_RTM (1 << 16)
#define DR6_FIXED_1 0xfffe0ff0
#define DR6_INIT 0xffff0ff0
@@ -247,7 +256,7 @@ struct kvm_mmu_memory_cache {
* @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
*/
union kvm_mmu_page_role {
- unsigned word;
+ u32 word;
struct {
unsigned level:4;
unsigned cr4_pae:1;
@@ -273,6 +282,34 @@ union kvm_mmu_page_role {
};
};
+union kvm_mmu_extended_role {
+/*
+ * This structure complements kvm_mmu_page_role caching everything needed for
+ * MMU configuration. If nothing in both these structures changed, MMU
+ * re-configuration can be skipped. @valid bit is set on first usage so we don't
+ * treat all-zero structure as valid data.
+ */
+ u32 word;
+ struct {
+ unsigned int valid:1;
+ unsigned int execonly:1;
+ unsigned int cr0_pg:1;
+ unsigned int cr4_pse:1;
+ unsigned int cr4_pke:1;
+ unsigned int cr4_smap:1;
+ unsigned int cr4_smep:1;
+ unsigned int cr4_la57:1;
+ };
+};
+
+union kvm_mmu_role {
+ u64 as_u64;
+ struct {
+ union kvm_mmu_page_role base;
+ union kvm_mmu_extended_role ext;
+ };
+};
+
struct kvm_rmap_head {
unsigned long val;
};
@@ -280,18 +317,18 @@ struct kvm_rmap_head {
struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
+ bool unsync;
/*
* The following two entries are used to key the shadow page in the
* hash table.
*/
- gfn_t gfn;
union kvm_mmu_page_role role;
+ gfn_t gfn;
u64 *spt;
/* hold the gfn of each spte inside spt */
gfn_t *gfns;
- bool unsync;
int root_count; /* Currently serving as active root */
unsigned int unsync_children;
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
@@ -360,7 +397,7 @@ struct kvm_mmu {
void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, const void *pte);
hpa_t root_hpa;
- union kvm_mmu_page_role base_role;
+ union kvm_mmu_role mmu_role;
u8 root_level;
u8 shadow_root_level;
u8 ept_ad;
@@ -490,7 +527,7 @@ struct kvm_vcpu_hv {
struct kvm_hyperv_exit exit;
struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT];
DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
- cpumask_t tlb_lush;
+ cpumask_t tlb_flush;
};
struct kvm_vcpu_arch {
@@ -534,7 +571,13 @@ struct kvm_vcpu_arch {
* the paging mode of the l1 guest. This context is always used to
* handle faults.
*/
- struct kvm_mmu mmu;
+ struct kvm_mmu *mmu;
+
+ /* Non-nested MMU for L1 */
+ struct kvm_mmu root_mmu;
+
+ /* L1 MMU when running nested */
+ struct kvm_mmu guest_mmu;
/*
* Paging state of an L2 guest (used for nested npt)
@@ -585,6 +628,8 @@ struct kvm_vcpu_arch {
bool has_error_code;
u8 nr;
u32 error_code;
+ unsigned long payload;
+ bool has_payload;
u8 nested_apf;
} exception;
@@ -781,6 +826,9 @@ struct kvm_hv {
u64 hv_reenlightenment_control;
u64 hv_tsc_emulation_control;
u64 hv_tsc_emulation_status;
+
+ /* How many vCPUs have VP index != vCPU index */
+ atomic_t num_mismatched_vp_indexes;
};
enum kvm_irqchip_mode {
@@ -871,6 +919,7 @@ struct kvm_arch {
bool x2apic_broadcast_quirk_disabled;
bool guest_can_read_msr_platform_info;
+ bool exception_payload_enabled;
};
struct kvm_vm_stat {
@@ -1133,6 +1182,9 @@ struct kvm_x86_ops {
int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*get_msr_feature)(struct kvm_msr_entry *entry);
+
+ int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu,
+ uint16_t *vmcs_version);
};
struct kvm_arch_async_pf {
@@ -1170,7 +1222,6 @@ void kvm_mmu_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
-void kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
@@ -1324,7 +1375,8 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free);
+void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ ulong roots_to_free);
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index c0643831706e..616f8e637bc3 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -48,10 +48,13 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void);
+void __init mem_encrypt_free_decrypted_mem(void);
bool sme_active(void);
bool sev_active(void);
+#define __bss_decrypted __attribute__((__section__(".bss..decrypted")))
+
#else /* !CONFIG_AMD_MEM_ENCRYPT */
#define sme_me_mask 0ULL
@@ -77,6 +80,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0;
static inline int __init
early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
+#define __bss_decrypted
+
#endif /* CONFIG_AMD_MEM_ENCRYPT */
/*
@@ -88,6 +93,8 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0;
#define __sme_pa(x) (__pa(x) | sme_me_mask)
#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask)
+extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[];
+
#endif /* __ASSEMBLY__ */
#endif /* __X86_MEM_ENCRYPT_H__ */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index ce2b59047cb8..9c85b54bf03c 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -14,6 +14,7 @@
#include <asm/processor.h>
#include <linux/bitops.h>
#include <linux/threads.h>
+#include <asm/fixmap.h>
extern p4d_t level4_kernel_pgt[512];
extern p4d_t level4_ident_pgt[512];
@@ -22,7 +23,7 @@ extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
-extern pte_t level1_fixmap_pgt[512];
+extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM];
extern pgd_t init_top_pgt[];
#define swapper_pg_dir init_top_pgt
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 0116b2ee9e64..449c92da2c91 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -40,7 +40,7 @@ static inline int cpu_has_vmx(void)
*/
static inline void cpu_vmxoff(void)
{
- asm volatile (ASM_VMX_VMXOFF : : : "cc");
+ asm volatile ("vmxoff");
cr4_clear_bits(X86_CR4_VMXE);
}
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9527ba5d62da..ade0f153947d 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -503,19 +503,6 @@ enum vmcs_field {
#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
-
-#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
-#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
-#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
-#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
-#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
-#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
-#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
-#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
-
struct vmx_msr_entry {
u32 index;
u32 reserved;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index fd23d5778ea1..dabfcf7c3941 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -288,6 +288,7 @@ struct kvm_reinject_control {
#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004
#define KVM_VCPUEVENT_VALID_SMM 0x00000008
+#define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010
/* Interrupt shadow states */
#define KVM_X86_SHADOW_INT_MOV_SS 0x01
@@ -299,7 +300,7 @@ struct kvm_vcpu_events {
__u8 injected;
__u8 nr;
__u8 has_error_code;
- __u8 pad;
+ __u8 pending;
__u32 error_code;
} exception;
struct {
@@ -322,7 +323,9 @@ struct kvm_vcpu_events {
__u8 smm_inside_nmi;
__u8 latched_init;
} smi;
- __u32 reserved[9];
+ __u8 reserved[27];
+ __u8 exception_has_payload;
+ __u64 exception_payload;
};
/* for KVM_GET/SET_DEBUGREGS */
@@ -381,6 +384,7 @@ struct kvm_sync_regs {
#define KVM_STATE_NESTED_GUEST_MODE 0x00000001
#define KVM_STATE_NESTED_RUN_PENDING 0x00000002
+#define KVM_STATE_NESTED_EVMCS 0x00000004
#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001
#define KVM_STATE_NESTED_SMM_VMXON 0x00000002
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 4e588f36228f..285eb3ec4200 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -382,6 +382,11 @@ static inline bool is_mbm_event(int e)
e <= QOS_L3_MBM_LOCAL_EVENT_ID);
}
+struct rdt_parse_data {
+ struct rdtgroup *rdtgrp;
+ char *buf;
+};
+
/**
* struct rdt_resource - attributes of an RDT resource
* @rid: The index of the resource
@@ -423,16 +428,19 @@ struct rdt_resource {
struct rdt_cache cache;
struct rdt_membw membw;
const char *format_str;
- int (*parse_ctrlval) (void *data, struct rdt_resource *r,
- struct rdt_domain *d);
+ int (*parse_ctrlval)(struct rdt_parse_data *data,
+ struct rdt_resource *r,
+ struct rdt_domain *d);
struct list_head evt_list;
int num_rmid;
unsigned int mon_scale;
unsigned long fflags;
};
-int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d);
-int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d);
+int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d);
+int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d);
extern struct mutex rdtgroup_mutex;
@@ -536,6 +544,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
int update_domains(struct rdt_resource *r, int closid);
+int closids_supported(void);
void closid_free(int closid);
int alloc_rmid(void);
void free_rmid(u32 rmid);
diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
index af358ca05160..0f53049719cd 100644
--- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -64,19 +64,19 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
return true;
}
-int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d)
+int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d)
{
- unsigned long data;
- char *buf = _buf;
+ unsigned long bw_val;
if (d->have_new_ctrl) {
rdt_last_cmd_printf("duplicate domain %d\n", d->id);
return -EINVAL;
}
- if (!bw_validate(buf, &data, r))
+ if (!bw_validate(data->buf, &bw_val, r))
return -EINVAL;
- d->new_ctrl = data;
+ d->new_ctrl = bw_val;
d->have_new_ctrl = true;
return 0;
@@ -123,18 +123,13 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
return true;
}
-struct rdt_cbm_parse_data {
- struct rdtgroup *rdtgrp;
- char *buf;
-};
-
/*
* Read one cache bit mask (hex). Check that it is valid for the current
* resource type.
*/
-int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d)
+int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d)
{
- struct rdt_cbm_parse_data *data = _data;
struct rdtgroup *rdtgrp = data->rdtgrp;
u32 cbm_val;
@@ -195,11 +190,17 @@ int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d)
static int parse_line(char *line, struct rdt_resource *r,
struct rdtgroup *rdtgrp)
{
- struct rdt_cbm_parse_data data;
+ struct rdt_parse_data data;
char *dom = NULL, *id;
struct rdt_domain *d;
unsigned long dom_id;
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
+ r->rid == RDT_RESOURCE_MBA) {
+ rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
+ return -EINVAL;
+ }
+
next:
if (!line || line[0] == '\0')
return 0;
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index b799c00bef09..1b8e86a5d5e1 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -97,6 +97,12 @@ void rdt_last_cmd_printf(const char *fmt, ...)
* limited as the number of resources grows.
*/
static int closid_free_map;
+static int closid_free_map_len;
+
+int closids_supported(void)
+{
+ return closid_free_map_len;
+}
static void closid_init(void)
{
@@ -111,6 +117,7 @@ static void closid_init(void)
/* CLOSID 0 is always reserved for the default group */
closid_free_map &= ~1;
+ closid_free_map_len = rdt_min_closid;
}
static int closid_alloc(void)
@@ -802,7 +809,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
sw_shareable = 0;
exclusive = 0;
seq_printf(seq, "%d=", dom->id);
- for (i = 0; i < r->num_closid; i++, ctrl++) {
+ for (i = 0; i < closids_supported(); i++, ctrl++) {
if (!closid_allocated(i))
continue;
mode = rdtgroup_mode_by_closid(i);
@@ -989,7 +996,7 @@ bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
/* Check for overlap with other resource groups */
ctrl = d->ctrl_val;
- for (i = 0; i < r->num_closid; i++, ctrl++) {
+ for (i = 0; i < closids_supported(); i++, ctrl++) {
ctrl_b = (unsigned long *)ctrl;
mode = rdtgroup_mode_by_closid(i);
if (closid_allocated(i) && i != closid &&
@@ -1024,16 +1031,27 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
{
int closid = rdtgrp->closid;
struct rdt_resource *r;
+ bool has_cache = false;
struct rdt_domain *d;
for_each_alloc_enabled_rdt_resource(r) {
+ if (r->rid == RDT_RESOURCE_MBA)
+ continue;
+ has_cache = true;
list_for_each_entry(d, &r->domains, list) {
if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid],
- rdtgrp->closid, false))
+ rdtgrp->closid, false)) {
+ rdt_last_cmd_puts("schemata overlaps\n");
return false;
+ }
}
}
+ if (!has_cache) {
+ rdt_last_cmd_puts("cannot be exclusive without CAT/CDP\n");
+ return false;
+ }
+
return true;
}
@@ -1085,7 +1103,6 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
rdtgrp->mode = RDT_MODE_SHAREABLE;
} else if (!strcmp(buf, "exclusive")) {
if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
- rdt_last_cmd_printf("schemata overlaps\n");
ret = -EINVAL;
goto out;
}
@@ -1155,8 +1172,8 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
struct rdt_resource *r;
struct rdt_domain *d;
unsigned int size;
- bool sep = false;
- u32 cbm;
+ bool sep;
+ u32 ctrl;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (!rdtgrp) {
@@ -1174,6 +1191,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
}
for_each_alloc_enabled_rdt_resource(r) {
+ sep = false;
seq_printf(s, "%*s:", max_name_width, r->name);
list_for_each_entry(d, &r->domains, list) {
if (sep)
@@ -1181,8 +1199,13 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
size = 0;
} else {
- cbm = d->ctrl_val[rdtgrp->closid];
- size = rdtgroup_cbm_to_size(r, d, cbm);
+ ctrl = (!is_mba_sc(r) ?
+ d->ctrl_val[rdtgrp->closid] :
+ d->mbps_val[rdtgrp->closid]);
+ if (r->rid == RDT_RESOURCE_MBA)
+ size = ctrl;
+ else
+ size = rdtgroup_cbm_to_size(r, d, ctrl);
}
seq_printf(s, "%d=%u", d->id, size);
sep = true;
@@ -2336,12 +2359,18 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
u32 *ctrl;
for_each_alloc_enabled_rdt_resource(r) {
+ /*
+ * Only initialize default allocations for CBM cache
+ * resources
+ */
+ if (r->rid == RDT_RESOURCE_MBA)
+ continue;
list_for_each_entry(d, &r->domains, list) {
d->have_new_ctrl = false;
d->new_ctrl = r->cache.shareable_bits;
used_b = r->cache.shareable_bits;
ctrl = d->ctrl_val;
- for (i = 0; i < r->num_closid; i++, ctrl++) {
+ for (i = 0; i < closids_supported(); i++, ctrl++) {
if (closid_allocated(i) && i != closid) {
mode = rdtgroup_mode_by_closid(i);
if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
@@ -2373,6 +2402,12 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
}
for_each_alloc_enabled_rdt_resource(r) {
+ /*
+ * Only initialize default allocations for CBM cache
+ * resources
+ */
+ if (r->rid == RDT_RESOURCE_MBA)
+ continue;
ret = update_domains(r, rdtgrp->closid);
if (ret < 0) {
rdt_last_cmd_puts("failed to initialize allocations\n");
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 8047379e575a..ddee1f0870c4 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -35,6 +35,7 @@
#include <asm/bootparam_utils.h>
#include <asm/microcode.h>
#include <asm/kasan.h>
+#include <asm/fixmap.h>
/*
* Manage page tables very early on.
@@ -112,6 +113,7 @@ static bool __head check_la57_support(unsigned long physaddr)
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
+ unsigned long vaddr, vaddr_end;
unsigned long load_delta, *p;
unsigned long pgtable_flags;
pgdval_t *pgd;
@@ -165,7 +167,8 @@ unsigned long __head __startup_64(unsigned long physaddr,
pud[511] += load_delta;
pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
- pmd[506] += load_delta;
+ for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
+ pmd[i] += load_delta;
/*
* Set up the identity mapping for the switchover. These
@@ -235,6 +238,21 @@ unsigned long __head __startup_64(unsigned long physaddr,
sme_encrypt_kernel(bp);
/*
+ * Clear the memory encryption mask from the .bss..decrypted section.
+ * The bss section will be memset to zero later in the initialization so
+ * there is no need to zero it after changing the memory encryption
+ * attribute.
+ */
+ if (mem_encrypt_active()) {
+ vaddr = (unsigned long)__start_bss_decrypted;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+ i = pmd_index(vaddr);
+ pmd[i] -= sme_get_me_mask();
+ }
+ }
+
+ /*
* Return the SME encryption mask (if SME is active) to be used as a
* modifier for the initial pgdir entry programmed into CR3.
*/
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 15ebc2fc166e..a3618cf04cf6 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -24,6 +24,7 @@
#include "../entry/calling.h"
#include <asm/export.h>
#include <asm/nospec-branch.h>
+#include <asm/fixmap.h>
#ifdef CONFIG_PARAVIRT
#include <asm/asm-offsets.h>
@@ -445,13 +446,20 @@ NEXT_PAGE(level2_kernel_pgt)
KERNEL_IMAGE_SIZE/PMD_SIZE)
NEXT_PAGE(level2_fixmap_pgt)
- .fill 506,8,0
- .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
- /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
- .fill 5,8,0
+ .fill (512 - 4 - FIXMAP_PMD_NUM),8,0
+ pgtno = 0
+ .rept (FIXMAP_PMD_NUM)
+ .quad level1_fixmap_pgt + (pgtno << PAGE_SHIFT) - __START_KERNEL_map \
+ + _PAGE_TABLE_NOENC;
+ pgtno = pgtno + 1
+ .endr
+ /* 6 MB reserved space + a 2MB hole */
+ .fill 4,8,0
NEXT_PAGE(level1_fixmap_pgt)
+ .rept (FIXMAP_PMD_NUM)
.fill 512,8,0
+ .endr
#undef PMDS
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1e6764648af3..013fe3d21dbb 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -28,6 +28,7 @@
#include <linux/sched/clock.h>
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/set_memory.h>
#include <asm/hypervisor.h>
#include <asm/mem_encrypt.h>
@@ -61,9 +62,10 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
static struct pvclock_vsyscall_time_info
- hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE);
-static struct pvclock_wall_clock wall_clock;
+ hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
+static struct pvclock_wall_clock wall_clock __bss_decrypted;
static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
+static struct pvclock_vsyscall_time_info *hvclock_mem;
static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
{
@@ -236,6 +238,45 @@ static void kvm_shutdown(void)
native_machine_shutdown();
}
+static void __init kvmclock_init_mem(void)
+{
+ unsigned long ncpus;
+ unsigned int order;
+ struct page *p;
+ int r;
+
+ if (HVC_BOOT_ARRAY_SIZE >= num_possible_cpus())
+ return;
+
+ ncpus = num_possible_cpus() - HVC_BOOT_ARRAY_SIZE;
+ order = get_order(ncpus * sizeof(*hvclock_mem));
+
+ p = alloc_pages(GFP_KERNEL, order);
+ if (!p) {
+ pr_warn("%s: failed to alloc %d pages", __func__, (1U << order));
+ return;
+ }
+
+ hvclock_mem = page_address(p);
+
+ /*
+ * hvclock is shared between the guest and the hypervisor, must
+ * be mapped decrypted.
+ */
+ if (sev_active()) {
+ r = set_memory_decrypted((unsigned long) hvclock_mem,
+ 1UL << order);
+ if (r) {
+ __free_pages(p, order);
+ hvclock_mem = NULL;
+ pr_warn("kvmclock: set_memory_decrypted() failed. Disabling\n");
+ return;
+ }
+ }
+
+ memset(hvclock_mem, 0, PAGE_SIZE << order);
+}
+
static int __init kvm_setup_vsyscall_timeinfo(void)
{
#ifdef CONFIG_X86_64
@@ -250,6 +291,9 @@ static int __init kvm_setup_vsyscall_timeinfo(void)
kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
#endif
+
+ kvmclock_init_mem();
+
return 0;
}
early_initcall(kvm_setup_vsyscall_timeinfo);
@@ -269,8 +313,10 @@ static int kvmclock_setup_percpu(unsigned int cpu)
/* Use the static page for the first CPUs, allocate otherwise */
if (cpu < HVC_BOOT_ARRAY_SIZE)
p = &hv_clock_boot[cpu];
+ else if (hvclock_mem)
+ p = hvclock_mem + cpu - HVC_BOOT_ARRAY_SIZE;
else
- p = kzalloc(sizeof(*p), GFP_KERNEL);
+ return -ENOMEM;
per_cpu(hv_clock_per_cpu, cpu) = p;
return p ? 0 : -ENOMEM;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index afdb303285f8..8dc69d82567e 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -91,7 +91,7 @@ unsigned paravirt_patch_call(void *insnbuf,
if (len < 5) {
#ifdef CONFIG_RETPOLINE
- WARN_ONCE("Failing to patch indirect CALL in %ps\n", (void *)addr);
+ WARN_ONCE(1, "Failing to patch indirect CALL in %ps\n", (void *)addr);
#endif
return len; /* call too long for patch site */
}
@@ -111,7 +111,7 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
if (len < 5) {
#ifdef CONFIG_RETPOLINE
- WARN_ONCE("Failing to patch indirect JMP in %ps\n", (void *)addr);
+ WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr);
#endif
return len; /* call too long for patch site */
}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 8bde0a419f86..5dd3317d761f 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -65,6 +65,23 @@ jiffies_64 = jiffies;
#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
+/*
+ * This section contains data which will be mapped as decrypted. Memory
+ * encryption operates on a page basis. Make this section PMD-aligned
+ * to avoid splitting the pages while mapping the section early.
+ *
+ * Note: We use a separate section so that only this section gets
+ * decrypted to avoid exposing more than we wish.
+ */
+#define BSS_DECRYPTED \
+ . = ALIGN(PMD_SIZE); \
+ __start_bss_decrypted = .; \
+ *(.bss..decrypted); \
+ . = ALIGN(PAGE_SIZE); \
+ __start_bss_decrypted_unused = .; \
+ . = ALIGN(PMD_SIZE); \
+ __end_bss_decrypted = .; \
+
#else
#define X86_ALIGN_RODATA_BEGIN
@@ -74,6 +91,7 @@ jiffies_64 = jiffies;
#define ALIGN_ENTRY_TEXT_BEGIN
#define ALIGN_ENTRY_TEXT_END
+#define BSS_DECRYPTED
#endif
@@ -355,6 +373,7 @@ SECTIONS
__bss_start = .;
*(.bss..page_aligned)
*(.bss)
+ BSS_DECRYPTED
. = ALIGN(PAGE_SIZE);
__bss_stop = .;
}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 01d209ab5481..4e80080f277a 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -36,6 +36,8 @@
#include "trace.h"
+#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64)
+
static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint)
{
return atomic64_read(&synic->sint[sint]);
@@ -132,8 +134,10 @@ static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
struct kvm_vcpu *vcpu = NULL;
int i;
- if (vpidx < KVM_MAX_VCPUS)
- vcpu = kvm_get_vcpu(kvm, vpidx);
+ if (vpidx >= KVM_MAX_VCPUS)
+ return NULL;
+
+ vcpu = kvm_get_vcpu(kvm, vpidx);
if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
return vcpu;
kvm_for_each_vcpu(i, vcpu, kvm)
@@ -689,6 +693,24 @@ void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)
stimer_cleanup(&hv_vcpu->stimer[i]);
}
+bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE))
+ return false;
+ return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
+}
+EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled);
+
+bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
+ struct hv_vp_assist_page *assist_page)
+{
+ if (!kvm_hv_assist_page_enabled(vcpu))
+ return false;
+ return !kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+ assist_page, sizeof(*assist_page));
+}
+EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page);
+
static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer)
{
struct hv_message *msg = &stimer->msg;
@@ -1040,21 +1062,41 @@ static u64 current_task_runtime_100ns(void)
static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
- struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
+ struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
switch (msr) {
- case HV_X64_MSR_VP_INDEX:
- if (!host)
+ case HV_X64_MSR_VP_INDEX: {
+ struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ int vcpu_idx = kvm_vcpu_get_idx(vcpu);
+ u32 new_vp_index = (u32)data;
+
+ if (!host || new_vp_index >= KVM_MAX_VCPUS)
return 1;
- hv->vp_index = (u32)data;
+
+ if (new_vp_index == hv_vcpu->vp_index)
+ return 0;
+
+ /*
+ * The VP index is initialized to vcpu_index by
+ * kvm_hv_vcpu_postcreate so they initially match. Now the
+ * VP index is changing, adjust num_mismatched_vp_indexes if
+ * it now matches or no longer matches vcpu_idx.
+ */
+ if (hv_vcpu->vp_index == vcpu_idx)
+ atomic_inc(&hv->num_mismatched_vp_indexes);
+ else if (new_vp_index == vcpu_idx)
+ atomic_dec(&hv->num_mismatched_vp_indexes);
+
+ hv_vcpu->vp_index = new_vp_index;
break;
+ }
case HV_X64_MSR_VP_ASSIST_PAGE: {
u64 gfn;
unsigned long addr;
if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) {
- hv->hv_vapic = data;
- if (kvm_lapic_enable_pv_eoi(vcpu, 0))
+ hv_vcpu->hv_vapic = data;
+ if (kvm_lapic_enable_pv_eoi(vcpu, 0, 0))
return 1;
break;
}
@@ -1062,12 +1104,19 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
if (kvm_is_error_hva(addr))
return 1;
- if (__clear_user((void __user *)addr, PAGE_SIZE))
+
+ /*
+ * Clear apic_assist portion of f(struct hv_vp_assist_page
+ * only, there can be valuable data in the rest which needs
+ * to be preserved e.g. on migration.
+ */
+ if (__clear_user((void __user *)addr, sizeof(u32)))
return 1;
- hv->hv_vapic = data;
+ hv_vcpu->hv_vapic = data;
kvm_vcpu_mark_page_dirty(vcpu, gfn);
if (kvm_lapic_enable_pv_eoi(vcpu,
- gfn_to_gpa(gfn) | KVM_MSR_ENABLED))
+ gfn_to_gpa(gfn) | KVM_MSR_ENABLED,
+ sizeof(struct hv_vp_assist_page)))
return 1;
break;
}
@@ -1080,7 +1129,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
case HV_X64_MSR_VP_RUNTIME:
if (!host)
return 1;
- hv->runtime_offset = data - current_task_runtime_100ns();
+ hv_vcpu->runtime_offset = data - current_task_runtime_100ns();
break;
case HV_X64_MSR_SCONTROL:
case HV_X64_MSR_SVERSION:
@@ -1172,11 +1221,11 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
bool host)
{
u64 data = 0;
- struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
+ struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
switch (msr) {
case HV_X64_MSR_VP_INDEX:
- data = hv->vp_index;
+ data = hv_vcpu->vp_index;
break;
case HV_X64_MSR_EOI:
return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
@@ -1185,10 +1234,10 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
case HV_X64_MSR_TPR:
return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
case HV_X64_MSR_VP_ASSIST_PAGE:
- data = hv->hv_vapic;
+ data = hv_vcpu->hv_vapic;
break;
case HV_X64_MSR_VP_RUNTIME:
- data = current_task_runtime_100ns() + hv->runtime_offset;
+ data = current_task_runtime_100ns() + hv_vcpu->runtime_offset;
break;
case HV_X64_MSR_SCONTROL:
case HV_X64_MSR_SVERSION:
@@ -1255,32 +1304,47 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
return kvm_hv_get_msr(vcpu, msr, pdata, host);
}
-static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no)
+static __always_inline unsigned long *sparse_set_to_vcpu_mask(
+ struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask,
+ u64 *vp_bitmap, unsigned long *vcpu_bitmap)
{
- int i = 0, j;
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_vcpu *vcpu;
+ int i, bank, sbank = 0;
- if (!(valid_bank_mask & BIT_ULL(bank_no)))
- return -1;
+ memset(vp_bitmap, 0,
+ KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap));
+ for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
+ KVM_HV_MAX_SPARSE_VCPU_SET_BITS)
+ vp_bitmap[bank] = sparse_banks[sbank++];
- for (j = 0; j < bank_no; j++)
- if (valid_bank_mask & BIT_ULL(j))
- i++;
+ if (likely(!atomic_read(&hv->num_mismatched_vp_indexes))) {
+ /* for all vcpus vp_index == vcpu_idx */
+ return (unsigned long *)vp_bitmap;
+ }
- return i;
+ bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (test_bit(vcpu_to_hv_vcpu(vcpu)->vp_index,
+ (unsigned long *)vp_bitmap))
+ __set_bit(i, vcpu_bitmap);
+ }
+ return vcpu_bitmap;
}
static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
u16 rep_cnt, bool ex)
{
struct kvm *kvm = current_vcpu->kvm;
- struct kvm_vcpu_hv *hv_current = &current_vcpu->arch.hyperv;
+ struct kvm_vcpu_hv *hv_vcpu = &current_vcpu->arch.hyperv;
struct hv_tlb_flush_ex flush_ex;
struct hv_tlb_flush flush;
- struct kvm_vcpu *vcpu;
- unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)] = {0};
- unsigned long valid_bank_mask = 0;
+ u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
+ DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
+ unsigned long *vcpu_mask;
+ u64 valid_bank_mask;
u64 sparse_banks[64];
- int sparse_banks_len, i;
+ int sparse_banks_len;
bool all_cpus;
if (!ex) {
@@ -1290,6 +1354,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
trace_kvm_hv_flush_tlb(flush.processor_mask,
flush.address_space, flush.flags);
+ valid_bank_mask = BIT_ULL(0);
sparse_banks[0] = flush.processor_mask;
all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS;
} else {
@@ -1306,7 +1371,8 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
all_cpus = flush_ex.hv_vp_set.format !=
HV_GENERIC_SET_SPARSE_4K;
- sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
+ sparse_banks_len =
+ bitmap_weight((unsigned long *)&valid_bank_mask, 64) *
sizeof(sparse_banks[0]);
if (!sparse_banks_len && !all_cpus)
@@ -1321,48 +1387,19 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
- cpumask_clear(&hv_current->tlb_lush);
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
- int bank = hv->vp_index / 64, sbank = 0;
-
- if (!all_cpus) {
- /* Banks >64 can't be represented */
- if (bank >= 64)
- continue;
-
- /* Non-ex hypercalls can only address first 64 vCPUs */
- if (!ex && bank)
- continue;
-
- if (ex) {
- /*
- * Check is the bank of this vCPU is in sparse
- * set and get the sparse bank number.
- */
- sbank = get_sparse_bank_no(valid_bank_mask,
- bank);
-
- if (sbank < 0)
- continue;
- }
-
- if (!(sparse_banks[sbank] & BIT_ULL(hv->vp_index % 64)))
- continue;
- }
+ cpumask_clear(&hv_vcpu->tlb_flush);
- /*
- * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we
- * can't analyze it here, flush TLB regardless of the specified
- * address space.
- */
- __set_bit(i, vcpu_bitmap);
- }
+ vcpu_mask = all_cpus ? NULL :
+ sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
+ vp_bitmap, vcpu_bitmap);
+ /*
+ * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
+ * analyze it here, flush TLB regardless of the specified address space.
+ */
kvm_make_vcpus_request_mask(kvm,
KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
- vcpu_bitmap, &hv_current->tlb_lush);
+ vcpu_mask, &hv_vcpu->tlb_flush);
ret_success:
/* We always do full TLB flush, set rep_done = rep_cnt. */
@@ -1370,6 +1407,99 @@ ret_success:
((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
}
+static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
+ unsigned long *vcpu_bitmap)
+{
+ struct kvm_lapic_irq irq = {
+ .delivery_mode = APIC_DM_FIXED,
+ .vector = vector
+ };
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu_bitmap && !test_bit(i, vcpu_bitmap))
+ continue;
+
+ /* We fail only when APIC is disabled */
+ kvm_apic_set_irq(vcpu, &irq, NULL);
+ }
+}
+
+static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa,
+ bool ex, bool fast)
+{
+ struct kvm *kvm = current_vcpu->kvm;
+ struct hv_send_ipi_ex send_ipi_ex;
+ struct hv_send_ipi send_ipi;
+ u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
+ DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
+ unsigned long *vcpu_mask;
+ unsigned long valid_bank_mask;
+ u64 sparse_banks[64];
+ int sparse_banks_len;
+ u32 vector;
+ bool all_cpus;
+
+ if (!ex) {
+ if (!fast) {
+ if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi,
+ sizeof(send_ipi))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ sparse_banks[0] = send_ipi.cpu_mask;
+ vector = send_ipi.vector;
+ } else {
+ /* 'reserved' part of hv_send_ipi should be 0 */
+ if (unlikely(ingpa >> 32 != 0))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ sparse_banks[0] = outgpa;
+ vector = (u32)ingpa;
+ }
+ all_cpus = false;
+ valid_bank_mask = BIT_ULL(0);
+
+ trace_kvm_hv_send_ipi(vector, sparse_banks[0]);
+ } else {
+ if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi_ex,
+ sizeof(send_ipi_ex))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector,
+ send_ipi_ex.vp_set.format,
+ send_ipi_ex.vp_set.valid_bank_mask);
+
+ vector = send_ipi_ex.vector;
+ valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask;
+ sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
+ sizeof(sparse_banks[0]);
+
+ all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
+
+ if (!sparse_banks_len)
+ goto ret_success;
+
+ if (!all_cpus &&
+ kvm_read_guest(kvm,
+ ingpa + offsetof(struct hv_send_ipi_ex,
+ vp_set.bank_contents),
+ sparse_banks,
+ sparse_banks_len))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+
+ if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ vcpu_mask = all_cpus ? NULL :
+ sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
+ vp_bitmap, vcpu_bitmap);
+
+ kvm_send_ipi_to_many(kvm, vector, vcpu_mask);
+
+ret_success:
+ return HV_STATUS_SUCCESS;
+}
+
bool kvm_hv_hypercall_enabled(struct kvm *kvm)
{
return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE;
@@ -1539,6 +1669,20 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
}
ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true);
break;
+ case HVCALL_SEND_IPI:
+ if (unlikely(rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, false, fast);
+ break;
+ case HVCALL_SEND_IPI_EX:
+ if (unlikely(fast || rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, true, false);
+ break;
default:
ret = HV_STATUS_INVALID_HYPERCALL_CODE;
break;
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index d6aa969e20f1..0e66c12ed2c3 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -62,6 +62,10 @@ void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu);
void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
+bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu);
+bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
+ struct hv_vp_assist_page *assist_page);
+
static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu,
int timer_index)
{
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index fbb0e6df121b..3cd227ff807f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -70,6 +70,11 @@
#define APIC_BROADCAST 0xFF
#define X2APIC_BROADCAST 0xFFFFFFFFul
+static bool lapic_timer_advance_adjust_done = false;
+#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100
+/* step-by-step approximation to mitigate fluctuation */
+#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
+
static inline int apic_test_vector(int vec, void *bitmap)
{
return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -955,14 +960,14 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
map = rcu_dereference(kvm->arch.apic_map);
ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
- if (ret)
+ if (ret) {
+ *r = 0;
for_each_set_bit(i, &bitmap, 16) {
if (!dst[i])
continue;
- if (*r < 0)
- *r = 0;
*r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
}
+ }
rcu_read_unlock();
return ret;
@@ -1472,7 +1477,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
void wait_lapic_expire(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- u64 guest_tsc, tsc_deadline;
+ u64 guest_tsc, tsc_deadline, ns;
if (!lapic_in_kernel(vcpu))
return;
@@ -1492,6 +1497,24 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
if (guest_tsc < tsc_deadline)
__delay(min(tsc_deadline - guest_tsc,
nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
+
+ if (!lapic_timer_advance_adjust_done) {
+ /* too early */
+ if (guest_tsc < tsc_deadline) {
+ ns = (tsc_deadline - guest_tsc) * 1000000ULL;
+ do_div(ns, vcpu->arch.virtual_tsc_khz);
+ lapic_timer_advance_ns -= min((unsigned int)ns,
+ lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ } else {
+ /* too late */
+ ns = (guest_tsc - tsc_deadline) * 1000000ULL;
+ do_div(ns, vcpu->arch.virtual_tsc_khz);
+ lapic_timer_advance_ns += min((unsigned int)ns,
+ lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ }
+ if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
+ lapic_timer_advance_adjust_done = true;
+ }
}
static void start_sw_tscdeadline(struct kvm_lapic *apic)
@@ -2621,17 +2644,25 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
return 0;
}
-int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
+int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
{
u64 addr = data & ~KVM_MSR_ENABLED;
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data;
+ unsigned long new_len;
+
if (!IS_ALIGNED(addr, 4))
return 1;
vcpu->arch.pv_eoi.msr_val = data;
if (!pv_eoi_enabled(vcpu))
return 0;
- return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
- addr, sizeof(u8));
+
+ if (addr == ghc->gpa && len <= ghc->len)
+ new_len = ghc->len;
+ else
+ new_len = len;
+
+ return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len);
}
void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index ed0ed39abd36..ff6ef9c3d760 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -120,7 +120,7 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
}
-int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
+int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
void kvm_lapic_init(void);
void kvm_lapic_exit(void);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 51b953ad9d4e..4cf43ce42959 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -932,7 +932,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
if (!obj)
- return -ENOMEM;
+ return cache->nobjs >= min ? 0 : -ENOMEM;
cache->objects[cache->nobjs++] = obj;
}
return 0;
@@ -960,7 +960,7 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
if (!page)
- return -ENOMEM;
+ return cache->nobjs >= min ? 0 : -ENOMEM;
cache->objects[cache->nobjs++] = page;
}
return 0;
@@ -1265,24 +1265,24 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
mmu_free_pte_list_desc(desc);
}
-static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
+static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
struct pte_list_desc *prev_desc;
int i;
if (!rmap_head->val) {
- printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
+ pr_err("%s: %p 0->BUG\n", __func__, spte);
BUG();
} else if (!(rmap_head->val & 1)) {
- rmap_printk("pte_list_remove: %p 1->0\n", spte);
+ rmap_printk("%s: %p 1->0\n", __func__, spte);
if ((u64 *)rmap_head->val != spte) {
- printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
+ pr_err("%s: %p 1->BUG\n", __func__, spte);
BUG();
}
rmap_head->val = 0;
} else {
- rmap_printk("pte_list_remove: %p many->many\n", spte);
+ rmap_printk("%s: %p many->many\n", __func__, spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
prev_desc = NULL;
while (desc) {
@@ -1296,11 +1296,17 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
prev_desc = desc;
desc = desc->more;
}
- pr_err("pte_list_remove: %p many->many\n", spte);
+ pr_err("%s: %p many->many\n", __func__, spte);
BUG();
}
}
+static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
+{
+ mmu_spte_clear_track_bits(sptep);
+ __pte_list_remove(sptep, rmap_head);
+}
+
static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
{
@@ -1349,7 +1355,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
sp = page_header(__pa(spte));
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
rmap_head = gfn_to_rmap(kvm, gfn, sp);
- pte_list_remove(spte, rmap_head);
+ __pte_list_remove(spte, rmap_head);
}
/*
@@ -1685,7 +1691,7 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
while ((sptep = rmap_get_first(rmap_head, &iter))) {
rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
- drop_spte(kvm, sptep);
+ pte_list_remove(rmap_head, sptep);
flush = true;
}
@@ -1721,7 +1727,7 @@ restart:
need_flush = 1;
if (pte_write(*ptep)) {
- drop_spte(kvm, sptep);
+ pte_list_remove(rmap_head, sptep);
goto restart;
} else {
new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
@@ -1988,7 +1994,7 @@ static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
u64 *parent_pte)
{
- pte_list_remove(parent_pte, &sp->parent_ptes);
+ __pte_list_remove(parent_pte, &sp->parent_ptes);
}
static void drop_parent_pte(struct kvm_mmu_page *sp,
@@ -2181,7 +2187,7 @@ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
if (sp->role.cr4_pae != !!is_pae(vcpu)
- || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
+ || vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return false;
}
@@ -2375,14 +2381,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
int collisions = 0;
LIST_HEAD(invalid_list);
- role = vcpu->arch.mmu.base_role;
+ role = vcpu->arch.mmu->mmu_role.base;
role.level = level;
role.direct = direct;
if (role.direct)
role.cr4_pae = 0;
role.access = access;
- if (!vcpu->arch.mmu.direct_map
- && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+ if (!vcpu->arch.mmu->direct_map
+ && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
@@ -2457,11 +2463,11 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
{
iterator->addr = addr;
iterator->shadow_addr = root;
- iterator->level = vcpu->arch.mmu.shadow_root_level;
+ iterator->level = vcpu->arch.mmu->shadow_root_level;
if (iterator->level == PT64_ROOT_4LEVEL &&
- vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL &&
- !vcpu->arch.mmu.direct_map)
+ vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
+ !vcpu->arch.mmu->direct_map)
--iterator->level;
if (iterator->level == PT32E_ROOT_LEVEL) {
@@ -2469,10 +2475,10 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
* prev_root is currently only used for 64-bit hosts. So only
* the active root_hpa is valid here.
*/
- BUG_ON(root != vcpu->arch.mmu.root_hpa);
+ BUG_ON(root != vcpu->arch.mmu->root_hpa);
iterator->shadow_addr
- = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+ = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
--iterator->level;
if (!iterator->shadow_addr)
@@ -2483,7 +2489,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
struct kvm_vcpu *vcpu, u64 addr)
{
- shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa,
+ shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
addr);
}
@@ -3095,7 +3101,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
int emulate = 0;
gfn_t pseudo_gfn;
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
return 0;
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
@@ -3310,7 +3316,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
u64 spte = 0ull;
uint retry_count = 0;
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
return false;
if (!page_fault_can_be_fast(error_code))
@@ -3480,11 +3486,11 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
}
/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
-void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free)
+void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ ulong roots_to_free)
{
int i;
LIST_HEAD(invalid_list);
- struct kvm_mmu *mmu = &vcpu->arch.mmu;
bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
@@ -3544,20 +3550,20 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
struct kvm_mmu_page *sp;
unsigned i;
- if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) {
+ if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
spin_lock(&vcpu->kvm->mmu_lock);
if(make_mmu_pages_available(vcpu) < 0) {
spin_unlock(&vcpu->kvm->mmu_lock);
return -ENOSPC;
}
sp = kvm_mmu_get_page(vcpu, 0, 0,
- vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
+ vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.root_hpa = __pa(sp->spt);
- } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
+ vcpu->arch.mmu->root_hpa = __pa(sp->spt);
+ } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) {
for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
+ hpa_t root = vcpu->arch.mmu->pae_root[i];
MMU_WARN_ON(VALID_PAGE(root));
spin_lock(&vcpu->kvm->mmu_lock);
@@ -3570,9 +3576,9 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+ vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
}
- vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
} else
BUG();
@@ -3586,7 +3592,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
gfn_t root_gfn;
int i;
- root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
+ root_gfn = vcpu->arch.mmu->get_cr3(vcpu) >> PAGE_SHIFT;
if (mmu_check_root(vcpu, root_gfn))
return 1;
@@ -3595,8 +3601,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* Do we shadow a long mode page table? If so we need to
* write-protect the guests page table root.
*/
- if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
+ if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
+ hpa_t root = vcpu->arch.mmu->root_hpa;
MMU_WARN_ON(VALID_PAGE(root));
@@ -3606,11 +3612,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
return -ENOSPC;
}
sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
- vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
+ vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL);
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.root_hpa = root;
+ vcpu->arch.mmu->root_hpa = root;
return 0;
}
@@ -3620,17 +3626,17 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* the shadow page table may be a PAE or a long mode page table.
*/
pm_mask = PT_PRESENT_MASK;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
+ if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
+ hpa_t root = vcpu->arch.mmu->pae_root[i];
MMU_WARN_ON(VALID_PAGE(root));
- if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
- pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
+ if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
+ pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
if (!(pdptr & PT_PRESENT_MASK)) {
- vcpu->arch.mmu.pae_root[i] = 0;
+ vcpu->arch.mmu->pae_root[i] = 0;
continue;
}
root_gfn = pdptr >> PAGE_SHIFT;
@@ -3648,16 +3654,16 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.pae_root[i] = root | pm_mask;
+ vcpu->arch.mmu->pae_root[i] = root | pm_mask;
}
- vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
/*
* If we shadow a 32 bit page table with a long mode page
* table we enter this path.
*/
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
- if (vcpu->arch.mmu.lm_root == NULL) {
+ if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
+ if (vcpu->arch.mmu->lm_root == NULL) {
/*
* The additional page necessary for this is only
* allocated on demand.
@@ -3669,12 +3675,12 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
if (lm_root == NULL)
return 1;
- lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
+ lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
- vcpu->arch.mmu.lm_root = lm_root;
+ vcpu->arch.mmu->lm_root = lm_root;
}
- vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
+ vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
}
return 0;
@@ -3682,7 +3688,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
- if (vcpu->arch.mmu.direct_map)
+ if (vcpu->arch.mmu->direct_map)
return mmu_alloc_direct_roots(vcpu);
else
return mmu_alloc_shadow_roots(vcpu);
@@ -3693,17 +3699,16 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
int i;
struct kvm_mmu_page *sp;
- if (vcpu->arch.mmu.direct_map)
+ if (vcpu->arch.mmu->direct_map)
return;
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
return;
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
- if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
-
+ if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
+ hpa_t root = vcpu->arch.mmu->root_hpa;
sp = page_header(root);
/*
@@ -3734,7 +3739,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
+ hpa_t root = vcpu->arch.mmu->pae_root[i];
if (root && VALID_PAGE(root)) {
root &= PT64_BASE_ADDR_MASK;
@@ -3808,7 +3813,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
int root, leaf;
bool reserved = false;
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
goto exit;
walk_shadow_page_lockless_begin(vcpu);
@@ -3825,7 +3830,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
if (!is_shadow_present_pte(spte))
break;
- reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte,
+ reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
iterator.level);
}
@@ -3904,7 +3909,7 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
struct kvm_shadow_walk_iterator iterator;
u64 spte;
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
return;
walk_shadow_page_lockless_begin(vcpu);
@@ -3931,7 +3936,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
if (r)
return r;
- MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
return nonpaging_map(vcpu, gva & PAGE_MASK,
@@ -3944,8 +3949,8 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
arch.gfn = gfn;
- arch.direct_map = vcpu->arch.mmu.direct_map;
- arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
+ arch.direct_map = vcpu->arch.mmu->direct_map;
+ arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
}
@@ -4051,7 +4056,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
int write = error_code & PFERR_WRITE_MASK;
bool map_writable;
- MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
if (page_fault_handle_page_track(vcpu, error_code, gfn))
return RET_PF_EMULATE;
@@ -4127,7 +4132,7 @@ static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
{
uint i;
struct kvm_mmu_root_info root;
- struct kvm_mmu *mmu = &vcpu->arch.mmu;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
root.cr3 = mmu->get_cr3(vcpu);
root.hpa = mmu->root_hpa;
@@ -4150,7 +4155,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
union kvm_mmu_page_role new_role,
bool skip_tlb_flush)
{
- struct kvm_mmu *mmu = &vcpu->arch.mmu;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
/*
* For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
@@ -4201,7 +4206,8 @@ static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
bool skip_tlb_flush)
{
if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
- kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
+ kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
+ KVM_MMU_ROOT_CURRENT);
}
void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
@@ -4219,7 +4225,7 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu)
static void inject_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
- vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+ vcpu->arch.mmu->inject_page_fault(vcpu, fault);
}
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
@@ -4423,7 +4429,8 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
{
- bool uses_nx = context->nx || context->base_role.smep_andnot_wp;
+ bool uses_nx = context->nx ||
+ context->mmu_role.base.smep_andnot_wp;
struct rsvd_bits_validate *shadow_zero_check;
int i;
@@ -4562,7 +4569,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
* SMAP:kernel-mode data accesses from user-mode
* mappings should fault. A fault is considered
* as a SMAP violation if all of the following
- * conditions are ture:
+ * conditions are true:
* - X86_CR4_SMAP is set in CR4
* - A user page is accessed
* - The access is not a fetch
@@ -4723,27 +4730,65 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu,
paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
}
-static union kvm_mmu_page_role
-kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu)
+static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
+{
+ union kvm_mmu_extended_role ext = {0};
+
+ ext.cr0_pg = !!is_paging(vcpu);
+ ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+ ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
+ ext.cr4_pse = !!is_pse(vcpu);
+ ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
+ ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
+
+ ext.valid = 1;
+
+ return ext;
+}
+
+static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
+ bool base_only)
+{
+ union kvm_mmu_role role = {0};
+
+ role.base.access = ACC_ALL;
+ role.base.nxe = !!is_nx(vcpu);
+ role.base.cr4_pae = !!is_pae(vcpu);
+ role.base.cr0_wp = is_write_protection(vcpu);
+ role.base.smm = is_smm(vcpu);
+ role.base.guest_mode = is_guest_mode(vcpu);
+
+ if (base_only)
+ return role;
+
+ role.ext = kvm_calc_mmu_role_ext(vcpu);
+
+ return role;
+}
+
+static union kvm_mmu_role
+kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
{
- union kvm_mmu_page_role role = {0};
+ union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
- role.guest_mode = is_guest_mode(vcpu);
- role.smm = is_smm(vcpu);
- role.ad_disabled = (shadow_accessed_mask == 0);
- role.level = kvm_x86_ops->get_tdp_level(vcpu);
- role.direct = true;
- role.access = ACC_ALL;
+ role.base.ad_disabled = (shadow_accessed_mask == 0);
+ role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
+ role.base.direct = true;
return role;
}
static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu *context = &vcpu->arch.mmu;
+ struct kvm_mmu *context = vcpu->arch.mmu;
+ union kvm_mmu_role new_role =
+ kvm_calc_tdp_mmu_root_page_role(vcpu, false);
- context->base_role.word = mmu_base_role_mask.word &
- kvm_calc_tdp_mmu_root_page_role(vcpu).word;
+ new_role.base.word &= mmu_base_role_mask.word;
+ if (new_role.as_u64 == context->mmu_role.as_u64)
+ return;
+
+ context->mmu_role.as_u64 = new_role.as_u64;
context->page_fault = tdp_page_fault;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
@@ -4783,36 +4828,36 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
reset_tdp_shadow_zero_bits_mask(vcpu, context);
}
-static union kvm_mmu_page_role
-kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu)
-{
- union kvm_mmu_page_role role = {0};
- bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
- bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
-
- role.nxe = is_nx(vcpu);
- role.cr4_pae = !!is_pae(vcpu);
- role.cr0_wp = is_write_protection(vcpu);
- role.smep_andnot_wp = smep && !is_write_protection(vcpu);
- role.smap_andnot_wp = smap && !is_write_protection(vcpu);
- role.guest_mode = is_guest_mode(vcpu);
- role.smm = is_smm(vcpu);
- role.direct = !is_paging(vcpu);
- role.access = ACC_ALL;
+static union kvm_mmu_role
+kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+{
+ union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
+
+ role.base.smep_andnot_wp = role.ext.cr4_smep &&
+ !is_write_protection(vcpu);
+ role.base.smap_andnot_wp = role.ext.cr4_smap &&
+ !is_write_protection(vcpu);
+ role.base.direct = !is_paging(vcpu);
if (!is_long_mode(vcpu))
- role.level = PT32E_ROOT_LEVEL;
+ role.base.level = PT32E_ROOT_LEVEL;
else if (is_la57_mode(vcpu))
- role.level = PT64_ROOT_5LEVEL;
+ role.base.level = PT64_ROOT_5LEVEL;
else
- role.level = PT64_ROOT_4LEVEL;
+ role.base.level = PT64_ROOT_4LEVEL;
return role;
}
void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu *context = &vcpu->arch.mmu;
+ struct kvm_mmu *context = vcpu->arch.mmu;
+ union kvm_mmu_role new_role =
+ kvm_calc_shadow_mmu_root_page_role(vcpu, false);
+
+ new_role.base.word &= mmu_base_role_mask.word;
+ if (new_role.as_u64 == context->mmu_role.as_u64)
+ return;
if (!is_paging(vcpu))
nonpaging_init_context(vcpu, context);
@@ -4823,22 +4868,28 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
else
paging32_init_context(vcpu, context);
- context->base_role.word = mmu_base_role_mask.word &
- kvm_calc_shadow_mmu_root_page_role(vcpu).word;
+ context->mmu_role.as_u64 = new_role.as_u64;
reset_shadow_zero_bits_mask(vcpu, context);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
-static union kvm_mmu_page_role
-kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty)
+static union kvm_mmu_role
+kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
+ bool execonly)
{
- union kvm_mmu_page_role role = vcpu->arch.mmu.base_role;
+ union kvm_mmu_role role;
+
+ /* Base role is inherited from root_mmu */
+ role.base.word = vcpu->arch.root_mmu.mmu_role.base.word;
+ role.ext = kvm_calc_mmu_role_ext(vcpu);
+
+ role.base.level = PT64_ROOT_4LEVEL;
+ role.base.direct = false;
+ role.base.ad_disabled = !accessed_dirty;
+ role.base.guest_mode = true;
+ role.base.access = ACC_ALL;
- role.level = PT64_ROOT_4LEVEL;
- role.direct = false;
- role.ad_disabled = !accessed_dirty;
- role.guest_mode = true;
- role.access = ACC_ALL;
+ role.ext.execonly = execonly;
return role;
}
@@ -4846,11 +4897,17 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty)
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
bool accessed_dirty, gpa_t new_eptp)
{
- struct kvm_mmu *context = &vcpu->arch.mmu;
- union kvm_mmu_page_role root_page_role =
- kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty);
+ struct kvm_mmu *context = vcpu->arch.mmu;
+ union kvm_mmu_role new_role =
+ kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
+ execonly);
+
+ __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);
+
+ new_role.base.word &= mmu_base_role_mask.word;
+ if (new_role.as_u64 == context->mmu_role.as_u64)
+ return;
- __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false);
context->shadow_root_level = PT64_ROOT_4LEVEL;
context->nx = true;
@@ -4862,7 +4919,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->update_pte = ept_update_pte;
context->root_level = PT64_ROOT_4LEVEL;
context->direct_map = false;
- context->base_role.word = root_page_role.word & mmu_base_role_mask.word;
+ context->mmu_role.as_u64 = new_role.as_u64;
+
update_permission_bitmask(vcpu, context, true);
update_pkru_bitmask(vcpu, context, true);
update_last_nonleaf_level(vcpu, context);
@@ -4873,7 +4931,7 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu *context = &vcpu->arch.mmu;
+ struct kvm_mmu *context = vcpu->arch.mmu;
kvm_init_shadow_mmu(vcpu);
context->set_cr3 = kvm_x86_ops->set_cr3;
@@ -4884,14 +4942,20 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
{
+ union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
+ new_role.base.word &= mmu_base_role_mask.word;
+ if (new_role.as_u64 == g_context->mmu_role.as_u64)
+ return;
+
+ g_context->mmu_role.as_u64 = new_role.as_u64;
g_context->get_cr3 = get_cr3;
g_context->get_pdptr = kvm_pdptr_read;
g_context->inject_page_fault = kvm_inject_page_fault;
/*
- * Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using
+ * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
* L1's nested page tables (e.g. EPT12). The nested translation
* of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
* L2's page tables as the first level of translation and L1's
@@ -4930,10 +4994,10 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
if (reset_roots) {
uint i;
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.mmu->root_hpa = INVALID_PAGE;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+ vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
}
if (mmu_is_nested(vcpu))
@@ -4948,10 +5012,14 @@ EXPORT_SYMBOL_GPL(kvm_init_mmu);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
{
+ union kvm_mmu_role role;
+
if (tdp_enabled)
- return kvm_calc_tdp_mmu_root_page_role(vcpu);
+ role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
else
- return kvm_calc_shadow_mmu_root_page_role(vcpu);
+ role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
+
+ return role.base;
}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -4981,8 +5049,10 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
- kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL);
- WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
+ WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
+ kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+ WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
}
EXPORT_SYMBOL_GPL(kvm_mmu_unload);
@@ -4996,7 +5066,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
}
++vcpu->kvm->stat.mmu_pte_updated;
- vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
+ vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
}
static bool need_remote_flush(u64 old, u64 new)
@@ -5173,10 +5243,12 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
local_flush = true;
while (npte--) {
+ u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
+
entry = *spte;
mmu_page_zap_pte(vcpu->kvm, sp, spte);
if (gentry &&
- !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
+ !((sp->role.word ^ base_role)
& mmu_base_role_mask.word) && rmap_can_add(vcpu))
mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
if (need_remote_flush(entry, *spte))
@@ -5194,7 +5266,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
gpa_t gpa;
int r;
- if (vcpu->arch.mmu.direct_map)
+ if (vcpu->arch.mmu->direct_map)
return 0;
gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@ -5230,10 +5302,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
{
int r, emulation_type = 0;
enum emulation_result er;
- bool direct = vcpu->arch.mmu.direct_map;
+ bool direct = vcpu->arch.mmu->direct_map;
/* With shadow page tables, fault_address contains a GVA or nGPA. */
- if (vcpu->arch.mmu.direct_map) {
+ if (vcpu->arch.mmu->direct_map) {
vcpu->arch.gpa_available = true;
vcpu->arch.gpa_val = cr2;
}
@@ -5246,8 +5318,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
}
if (r == RET_PF_INVALID) {
- r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
- false);
+ r = vcpu->arch.mmu->page_fault(vcpu, cr2,
+ lower_32_bits(error_code),
+ false);
WARN_ON(r == RET_PF_INVALID);
}
@@ -5263,7 +5336,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
* paging in both guests. If true, we simply unprotect the page
* and resume the guest.
*/
- if (vcpu->arch.mmu.direct_map &&
+ if (vcpu->arch.mmu->direct_map &&
(error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
return 1;
@@ -5311,7 +5384,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
- struct kvm_mmu *mmu = &vcpu->arch.mmu;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
int i;
/* INVLPG on a * non-canonical address is a NOP according to the SDM. */
@@ -5342,7 +5415,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
{
- struct kvm_mmu *mmu = &vcpu->arch.mmu;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
bool tlb_flush = false;
uint i;
@@ -5386,8 +5459,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
- free_page((unsigned long)vcpu->arch.mmu.pae_root);
- free_page((unsigned long)vcpu->arch.mmu.lm_root);
+ free_page((unsigned long)vcpu->arch.mmu->pae_root);
+ free_page((unsigned long)vcpu->arch.mmu->lm_root);
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -5407,9 +5480,9 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
if (!page)
return -ENOMEM;
- vcpu->arch.mmu.pae_root = page_address(page);
+ vcpu->arch.mmu->pae_root = page_address(page);
for (i = 0; i < 4; ++i)
- vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+ vcpu->arch.mmu->pae_root[i] = INVALID_PAGE;
return 0;
}
@@ -5418,27 +5491,21 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
uint i;
- vcpu->arch.walk_mmu = &vcpu->arch.mmu;
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- vcpu->arch.mmu.translate_gpa = translate_gpa;
- vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
+ vcpu->arch.mmu = &vcpu->arch.root_mmu;
+ vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
+ vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.root_mmu.translate_gpa = translate_gpa;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
-
- return alloc_mmu_pages(vcpu);
-}
+ vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
-void kvm_mmu_setup(struct kvm_vcpu *vcpu)
-{
- MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
- /*
- * kvm_mmu_setup() is called only on vCPU initialization.
- * Therefore, no need to reset mmu roots as they are not yet
- * initialized.
- */
- kvm_init_mmu(vcpu, false);
+ vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
+ return alloc_mmu_pages(vcpu);
}
static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
@@ -5621,7 +5688,7 @@ restart:
if (sp->role.direct &&
!kvm_is_reserved_pfn(pfn) &&
PageTransCompoundMap(pfn_to_page(pfn))) {
- drop_spte(kvm, sptep);
+ pte_list_remove(rmap_head, sptep);
need_tlb_flush = 1;
goto restart;
}
@@ -5878,6 +5945,16 @@ int kvm_mmu_module_init(void)
{
int ret = -ENOMEM;
+ /*
+ * MMU roles use union aliasing which is, generally speaking, an
+ * undefined behavior. However, we supposedly know how compilers behave
+ * and the current status quo is unlikely to change. Guardians below are
+ * supposed to let us know if the assumption becomes false.
+ */
+ BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
+ BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
+ BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
+
kvm_mmu_reset_all_pte_masks();
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
@@ -5907,7 +5984,7 @@ out:
}
/*
- * Caculate mmu pages needed for kvm.
+ * Calculate mmu pages needed for kvm.
*/
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 1fab69c0b2f3..c7b333147c4a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -43,11 +43,6 @@
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
-#define PT_PDPE_LEVEL 3
-#define PT_DIRECTORY_LEVEL 2
-#define PT_PAGE_TABLE_LEVEL 1
-#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1)
-
static inline u64 rsvd_bits(int s, int e)
{
if (e < s)
@@ -80,7 +75,7 @@ static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
- if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+ if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE))
return 0;
return kvm_mmu_load(vcpu);
@@ -102,9 +97,9 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu)
{
- if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
- vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa |
- kvm_get_active_pcid(vcpu));
+ if (VALID_PAGE(vcpu->arch.mmu->root_hpa))
+ vcpu->arch.mmu->set_cr3(vcpu, vcpu->arch.mmu->root_hpa |
+ kvm_get_active_pcid(vcpu));
}
/*
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 1272861e77b9..abac7e208853 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -59,19 +59,19 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
int i;
struct kvm_mmu_page *sp;
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
return;
- if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
+ if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
+ hpa_t root = vcpu->arch.mmu->root_hpa;
sp = page_header(root);
- __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level);
+ __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu->root_level);
return;
}
for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
+ hpa_t root = vcpu->arch.mmu->pae_root[i];
if (root && VALID_PAGE(root)) {
root &= PT64_BASE_ADDR_MASK;
@@ -122,7 +122,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
hpa = pfn << PAGE_SHIFT;
if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
- "ent %llxn", vcpu->arch.mmu.root_level, pfn,
+ "ent %llxn", vcpu->arch.mmu->root_level, pfn,
hpa, *sptep);
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 14ffd973df54..7cf2185b7eb5 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -158,14 +158,15 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
u64 gpte)
{
- if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ if (is_rsvd_bits_set(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
goto no_present;
if (!FNAME(is_present_gpte)(gpte))
goto no_present;
/* if accessed bit is not supported prefetch non accessed gpte */
- if (PT_HAVE_ACCESSED_DIRTY(&vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK))
+ if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
+ !(gpte & PT_GUEST_ACCESSED_MASK))
goto no_present;
return false;
@@ -480,7 +481,7 @@ error:
static int FNAME(walk_addr)(struct guest_walker *walker,
struct kvm_vcpu *vcpu, gva_t addr, u32 access)
{
- return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
+ return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
access);
}
@@ -509,7 +510,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access & FNAME(gpte_access)(gpte);
- FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
+ FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
no_dirty_log && (pte_access & ACC_WRITE_MASK));
if (is_error_pfn(pfn))
@@ -604,7 +605,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
direct_access = gw->pte_access;
- top_level = vcpu->arch.mmu.root_level;
+ top_level = vcpu->arch.mmu->root_level;
if (top_level == PT32E_ROOT_LEVEL)
top_level = PT32_ROOT_LEVEL;
/*
@@ -616,7 +617,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
if (FNAME(gpte_changed)(vcpu, gw, top_level))
goto out_gpte_changed;
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
goto out_gpte_changed;
for (shadow_walk_init(&it, vcpu, addr);
@@ -1004,7 +1005,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access;
pte_access &= FNAME(gpte_access)(gpte);
- FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
+ FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
&nr_present))
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d96092b35936..f416f5c7f2ae 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -805,6 +805,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
nested_svm_check_exception(svm, nr, has_error_code, error_code))
return;
+ kvm_deliver_exception_payload(&svm->vcpu);
+
if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
@@ -2918,18 +2920,18 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
WARN_ON(mmu_is_nested(vcpu));
kvm_init_shadow_mmu(vcpu);
- vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
- vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
- vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
- vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
- vcpu->arch.mmu.shadow_root_level = get_npt_level(vcpu);
- reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu);
+ vcpu->arch.mmu->set_cr3 = nested_svm_set_tdp_cr3;
+ vcpu->arch.mmu->get_cr3 = nested_svm_get_tdp_cr3;
+ vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
+ vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
+ vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu);
+ reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}
static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
- vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+ vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}
static int nested_svm_check_permissions(struct vcpu_svm *svm)
@@ -2965,16 +2967,13 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
svm->vmcb->control.exit_info_1 = error_code;
/*
- * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
- * The fix is to add the ancillary datum (CR2 or DR6) to structs
- * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
- * written only when inject_pending_event runs (DR6 would written here
- * too). This should be conditional on a new capability---if the
- * capability is disabled, kvm_multiple_exception would write the
- * ancillary information to CR2 or DR6, for backwards ABI-compatibility.
+ * EXITINFO2 is undefined for all exception intercepts other
+ * than #PF.
*/
if (svm->vcpu.arch.exception.nested_apf)
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
+ else if (svm->vcpu.arch.exception.has_payload)
+ svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
else
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
@@ -5638,26 +5637,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
"mov %%r13, %c[r13](%[svm]) \n\t"
"mov %%r14, %c[r14](%[svm]) \n\t"
"mov %%r15, %c[r15](%[svm]) \n\t"
-#endif
/*
* Clear host registers marked as clobbered to prevent
* speculative use.
*/
- "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t"
- "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t"
- "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t"
- "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t"
- "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t"
-#ifdef CONFIG_X86_64
- "xor %%r8, %%r8 \n\t"
- "xor %%r9, %%r9 \n\t"
- "xor %%r10, %%r10 \n\t"
- "xor %%r11, %%r11 \n\t"
- "xor %%r12, %%r12 \n\t"
- "xor %%r13, %%r13 \n\t"
- "xor %%r14, %%r14 \n\t"
- "xor %%r15, %%r15 \n\t"
+ "xor %%r8d, %%r8d \n\t"
+ "xor %%r9d, %%r9d \n\t"
+ "xor %%r10d, %%r10d \n\t"
+ "xor %%r11d, %%r11d \n\t"
+ "xor %%r12d, %%r12d \n\t"
+ "xor %%r13d, %%r13d \n\t"
+ "xor %%r14d, %%r14d \n\t"
+ "xor %%r15d, %%r15d \n\t"
#endif
+ "xor %%ebx, %%ebx \n\t"
+ "xor %%ecx, %%ecx \n\t"
+ "xor %%edx, %%edx \n\t"
+ "xor %%esi, %%esi \n\t"
+ "xor %%edi, %%edi \n\t"
"pop %%" _ASM_BP
:
: [svm]"a"(svm),
@@ -7036,6 +7033,13 @@ failed:
return ret;
}
+static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
+ uint16_t *vmcs_version)
+{
+ /* Intel-only feature */
+ return -ENODEV;
+}
+
static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -7165,6 +7169,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.mem_enc_op = svm_mem_enc_op,
.mem_enc_reg_region = svm_register_enc_region,
.mem_enc_unreg_region = svm_unregister_enc_region,
+
+ .nested_enable_evmcs = nested_enable_evmcs,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0f997683404f..0659465a745c 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1418,6 +1418,48 @@ TRACE_EVENT(kvm_hv_flush_tlb_ex,
__entry->valid_bank_mask, __entry->format,
__entry->address_space, __entry->flags)
);
+
+/*
+ * Tracepoints for kvm_hv_send_ipi.
+ */
+TRACE_EVENT(kvm_hv_send_ipi,
+ TP_PROTO(u32 vector, u64 processor_mask),
+ TP_ARGS(vector, processor_mask),
+
+ TP_STRUCT__entry(
+ __field(u32, vector)
+ __field(u64, processor_mask)
+ ),
+
+ TP_fast_assign(
+ __entry->vector = vector;
+ __entry->processor_mask = processor_mask;
+ ),
+
+ TP_printk("vector %x processor_mask 0x%llx",
+ __entry->vector, __entry->processor_mask)
+);
+
+TRACE_EVENT(kvm_hv_send_ipi_ex,
+ TP_PROTO(u32 vector, u64 format, u64 valid_bank_mask),
+ TP_ARGS(vector, format, valid_bank_mask),
+
+ TP_STRUCT__entry(
+ __field(u32, vector)
+ __field(u64, format)
+ __field(u64, valid_bank_mask)
+ ),
+
+ TP_fast_assign(
+ __entry->vector = vector;
+ __entry->format = format;
+ __entry->valid_bank_mask = valid_bank_mask;
+ ),
+
+ TP_printk("vector %x format %llx valid_bank_mask 0x%llx",
+ __entry->vector, __entry->format,
+ __entry->valid_bank_mask)
+);
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 612fd17be635..641a65b30685 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -20,6 +20,7 @@
#include "mmu.h"
#include "cpuid.h"
#include "lapic.h"
+#include "hyperv.h"
#include <linux/kvm_host.h>
#include <linux/module.h>
@@ -61,7 +62,7 @@
#define __ex(x) __kvm_handle_fault_on_reboot(x)
#define __ex_clear(x, reg) \
- ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
+ ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg)
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
@@ -107,9 +108,12 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
* VMX and be a hypervisor for its own guests. If nested=0, guests may not
* use VMX instructions.
*/
-static bool __read_mostly nested = 0;
+static bool __read_mostly nested = 1;
module_param(nested, bool, S_IRUGO);
+static bool __read_mostly nested_early_check = 0;
+module_param(nested_early_check, bool, S_IRUGO);
+
static u64 __read_mostly host_xss;
static bool __read_mostly enable_pml = 1;
@@ -131,7 +135,7 @@ static bool __read_mostly enable_preemption_timer = 1;
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif
-#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
+#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON \
(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
@@ -187,6 +191,7 @@ static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);
extern const ulong vmx_return;
+extern const ulong vmx_early_consistency_check_return;
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
@@ -827,14 +832,28 @@ struct nested_vmx {
*/
struct vmcs12 *cached_shadow_vmcs12;
/*
- * Indicates if the shadow vmcs must be updated with the
- * data hold by vmcs12
+ * Indicates if the shadow vmcs or enlightened vmcs must be updated
+ * with the data held by struct vmcs12.
*/
- bool sync_shadow_vmcs;
+ bool need_vmcs12_sync;
bool dirty_vmcs12;
+ /*
+ * vmcs02 has been initialized, i.e. state that is constant for
+ * vmcs02 has been written to the backing VMCS. Initialization
+ * is delayed until L1 actually attempts to run a nested VM.
+ */
+ bool vmcs02_initialized;
+
bool change_vmcs01_virtual_apic_mode;
+ /*
+ * Enlightened VMCS has been enabled. It does not mean that L1 has to
+ * use it. However, VMX features available to L1 will be limited based
+ * on what the enlightened VMCS supports.
+ */
+ bool enlightened_vmcs_enabled;
+
/* L2 must run next, and mustn't decide to exit to L1. */
bool nested_run_pending;
@@ -870,6 +889,10 @@ struct nested_vmx {
/* in guest mode on SMM entry? */
bool guest_mode;
} smm;
+
+ gpa_t hv_evmcs_vmptr;
+ struct page *hv_evmcs_page;
+ struct hv_enlightened_vmcs *hv_evmcs;
};
#define POSTED_INTR_ON 0
@@ -1381,6 +1404,49 @@ DEFINE_STATIC_KEY_FALSE(enable_evmcs);
#define KVM_EVMCS_VERSION 1
+/*
+ * Enlightened VMCSv1 doesn't support these:
+ *
+ * POSTED_INTR_NV = 0x00000002,
+ * GUEST_INTR_STATUS = 0x00000810,
+ * APIC_ACCESS_ADDR = 0x00002014,
+ * POSTED_INTR_DESC_ADDR = 0x00002016,
+ * EOI_EXIT_BITMAP0 = 0x0000201c,
+ * EOI_EXIT_BITMAP1 = 0x0000201e,
+ * EOI_EXIT_BITMAP2 = 0x00002020,
+ * EOI_EXIT_BITMAP3 = 0x00002022,
+ * GUEST_PML_INDEX = 0x00000812,
+ * PML_ADDRESS = 0x0000200e,
+ * VM_FUNCTION_CONTROL = 0x00002018,
+ * EPTP_LIST_ADDRESS = 0x00002024,
+ * VMREAD_BITMAP = 0x00002026,
+ * VMWRITE_BITMAP = 0x00002028,
+ *
+ * TSC_MULTIPLIER = 0x00002032,
+ * PLE_GAP = 0x00004020,
+ * PLE_WINDOW = 0x00004022,
+ * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
+ * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
+ * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
+ *
+ * Currently unsupported in KVM:
+ * GUEST_IA32_RTIT_CTL = 0x00002814,
+ */
+#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \
+ PIN_BASED_VMX_PREEMPTION_TIMER)
+#define EVMCS1_UNSUPPORTED_2NDEXEC \
+ (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \
+ SECONDARY_EXEC_APIC_REGISTER_VIRT | \
+ SECONDARY_EXEC_ENABLE_PML | \
+ SECONDARY_EXEC_ENABLE_VMFUNC | \
+ SECONDARY_EXEC_SHADOW_VMCS | \
+ SECONDARY_EXEC_TSC_SCALING | \
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING)
+#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
+
#if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
@@ -1473,69 +1539,12 @@ static void evmcs_load(u64 phys_addr)
static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
{
- /*
- * Enlightened VMCSv1 doesn't support these:
- *
- * POSTED_INTR_NV = 0x00000002,
- * GUEST_INTR_STATUS = 0x00000810,
- * APIC_ACCESS_ADDR = 0x00002014,
- * POSTED_INTR_DESC_ADDR = 0x00002016,
- * EOI_EXIT_BITMAP0 = 0x0000201c,
- * EOI_EXIT_BITMAP1 = 0x0000201e,
- * EOI_EXIT_BITMAP2 = 0x00002020,
- * EOI_EXIT_BITMAP3 = 0x00002022,
- */
- vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
- vmcs_conf->cpu_based_2nd_exec_ctrl &=
- ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
- vmcs_conf->cpu_based_2nd_exec_ctrl &=
- ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- vmcs_conf->cpu_based_2nd_exec_ctrl &=
- ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
-
- /*
- * GUEST_PML_INDEX = 0x00000812,
- * PML_ADDRESS = 0x0000200e,
- */
- vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
-
- /* VM_FUNCTION_CONTROL = 0x00002018, */
- vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
-
- /*
- * EPTP_LIST_ADDRESS = 0x00002024,
- * VMREAD_BITMAP = 0x00002026,
- * VMWRITE_BITMAP = 0x00002028,
- */
- vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
-
- /*
- * TSC_MULTIPLIER = 0x00002032,
- */
- vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
+ vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
- /*
- * PLE_GAP = 0x00004020,
- * PLE_WINDOW = 0x00004022,
- */
- vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
-
- /*
- * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
- */
- vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
-
- /*
- * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
- * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
- */
- vmcs_conf->vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
- vmcs_conf->vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
+ vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
- /*
- * Currently unsupported in KVM:
- * GUEST_IA32_RTIT_CTL = 0x00002814,
- */
}
/* check_ept_pointer() should be under protection of ept_pointer_lock. */
@@ -1560,7 +1569,8 @@ static void check_ept_pointer_match(struct kvm *kvm)
static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
{
- int ret;
+ struct kvm_vcpu *vcpu;
+ int ret = -ENOTSUPP, i;
spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
@@ -1568,14 +1578,14 @@ static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
check_ept_pointer_match(kvm);
if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
- ret = -ENOTSUPP;
- goto out;
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ ret |= hyperv_flush_guest_mapping(
+ to_vmx(kvm_get_vcpu(kvm, i))->ept_pointer);
+ } else {
+ ret = hyperv_flush_guest_mapping(
+ to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer);
}
- ret = hyperv_flush_guest_mapping(
- to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer);
-
-out:
spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
return ret;
}
@@ -1591,6 +1601,35 @@ static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
static inline void evmcs_touch_msr_bitmap(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
+static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
+ uint16_t *vmcs_version)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /* We don't support disabling the feature for simplicity. */
+ if (vmx->nested.enlightened_vmcs_enabled)
+ return 0;
+
+ vmx->nested.enlightened_vmcs_enabled = true;
+
+ /*
+ * vmcs_version represents the range of supported Enlightened VMCS
+ * versions: lower 8 bits is the minimal version, higher 8 bits is the
+ * maximum supported version. KVM supports versions from 1 to
+ * KVM_EVMCS_VERSION.
+ */
+ if (vmcs_version)
+ *vmcs_version = (KVM_EVMCS_VERSION << 8) | 1;
+
+ vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
+ vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
+ vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
+ vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
+ vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC;
+
+ return 0;
+}
+
static inline bool is_exception_n(u32 intr_info, u8 vector)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -1613,11 +1652,6 @@ static inline bool is_page_fault(u32 intr_info)
return is_exception_n(intr_info, PF_VECTOR);
}
-static inline bool is_no_device(u32 intr_info)
-{
- return is_exception_n(intr_info, NM_VECTOR);
-}
-
static inline bool is_invalid_opcode(u32 intr_info)
{
return is_exception_n(intr_info, UD_VECTOR);
@@ -1628,12 +1662,6 @@ static inline bool is_gp_fault(u32 intr_info)
return is_exception_n(intr_info, GP_VECTOR);
}
-static inline bool is_external_interrupt(u32 intr_info)
-{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
- == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
-}
-
static inline bool is_machine_check(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -2059,9 +2087,6 @@ static inline bool is_nmi(u32 intr_info)
static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
u32 exit_intr_info,
unsigned long exit_qualification);
-static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12,
- u32 reason, unsigned long qualification);
static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
@@ -2073,7 +2098,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
return -1;
}
-static inline void __invvpid(int ext, u16 vpid, gva_t gva)
+static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
{
struct {
u64 vpid : 16;
@@ -2082,22 +2107,20 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
} operand = { vpid, 0, gva };
bool error;
- asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na)
- : CC_OUT(na) (error) : "a"(&operand), "c"(ext)
- : "memory");
+ asm volatile (__ex("invvpid %2, %1") CC_SET(na)
+ : CC_OUT(na) (error) : "r"(ext), "m"(operand));
BUG_ON(error);
}
-static inline void __invept(int ext, u64 eptp, gpa_t gpa)
+static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
{
struct {
u64 eptp, gpa;
} operand = {eptp, gpa};
bool error;
- asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na)
- : CC_OUT(na) (error) : "a" (&operand), "c" (ext)
- : "memory");
+ asm volatile (__ex("invept %2, %1") CC_SET(na)
+ : CC_OUT(na) (error) : "r"(ext), "m"(operand));
BUG_ON(error);
}
@@ -2116,9 +2139,8 @@ static void vmcs_clear(struct vmcs *vmcs)
u64 phys_addr = __pa(vmcs);
bool error;
- asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na)
- : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
- : "memory");
+ asm volatile (__ex("vmclear %1") CC_SET(na)
+ : CC_OUT(na) (error) : "m"(phys_addr));
if (unlikely(error))
printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
vmcs, phys_addr);
@@ -2141,9 +2163,8 @@ static void vmcs_load(struct vmcs *vmcs)
if (static_branch_unlikely(&enable_evmcs))
return evmcs_load(phys_addr);
- asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na)
- : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
- : "memory");
+ asm volatile (__ex("vmptrld %1") CC_SET(na)
+ : CC_OUT(na) (error) : "m"(phys_addr));
if (unlikely(error))
printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
vmcs, phys_addr);
@@ -2319,8 +2340,8 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
{
unsigned long value;
- asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
- : "=a"(value) : "d"(field) : "cc");
+ asm volatile (__ex_clear("vmread %1, %0", "%k0")
+ : "=r"(value) : "r"(field));
return value;
}
@@ -2371,8 +2392,8 @@ static __always_inline void __vmcs_writel(unsigned long field, unsigned long val
{
bool error;
- asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na)
- : CC_OUT(na) (error) : "a"(value), "d"(field));
+ asm volatile (__ex("vmwrite %2, %1") CC_SET(na)
+ : CC_OUT(na) (error) : "r"(field), "rm"(value));
if (unlikely(error))
vmwrite_error(field, value);
}
@@ -2703,7 +2724,8 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
u64 guest_val, u64 host_val)
{
vmcs_write64(guest_val_vmcs, guest_val);
- vmcs_write64(host_val_vmcs, host_val);
+ if (host_val_vmcs != HOST_IA32_EFER)
+ vmcs_write64(host_val_vmcs, host_val);
vm_entry_controls_setbit(vmx, entry);
vm_exit_controls_setbit(vmx, exit);
}
@@ -2801,8 +2823,6 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
ignore_bits &= ~(u64)EFER_SCE;
#endif
- clear_atomic_switch_msr(vmx, MSR_EFER);
-
/*
* On EPT, we can't emulate NX, so we must switch EFER atomically.
* On CPUs that support "load IA32_EFER", always switch EFER
@@ -2815,8 +2835,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
if (guest_efer != host_efer)
add_atomic_switch_msr(vmx, MSR_EFER,
guest_efer, host_efer, false);
+ else
+ clear_atomic_switch_msr(vmx, MSR_EFER);
return false;
} else {
+ clear_atomic_switch_msr(vmx, MSR_EFER);
+
guest_efer &= ~ignore_bits;
guest_efer |= host_efer & ignore_bits;
@@ -3268,34 +3292,30 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
unsigned int nr = vcpu->arch.exception.nr;
+ bool has_payload = vcpu->arch.exception.has_payload;
+ unsigned long payload = vcpu->arch.exception.payload;
if (nr == PF_VECTOR) {
if (vcpu->arch.exception.nested_apf) {
*exit_qual = vcpu->arch.apf.nested_apf_token;
return 1;
}
- /*
- * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
- * The fix is to add the ancillary datum (CR2 or DR6) to structs
- * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
- * can be written only when inject_pending_event runs. This should be
- * conditional on a new capability---if the capability is disabled,
- * kvm_multiple_exception would write the ancillary information to
- * CR2 or DR6, for backwards ABI-compatibility.
- */
if (nested_vmx_is_page_fault_vmexit(vmcs12,
vcpu->arch.exception.error_code)) {
- *exit_qual = vcpu->arch.cr2;
- return 1;
- }
- } else {
- if (vmcs12->exception_bitmap & (1u << nr)) {
- if (nr == DB_VECTOR)
- *exit_qual = vcpu->arch.dr6;
- else
- *exit_qual = 0;
+ *exit_qual = has_payload ? payload : vcpu->arch.cr2;
return 1;
}
+ } else if (vmcs12->exception_bitmap & (1u << nr)) {
+ if (nr == DB_VECTOR) {
+ if (!has_payload) {
+ payload = vcpu->arch.dr6;
+ payload &= ~(DR6_FIXED_1 | DR6_BT);
+ payload ^= DR6_RTM;
+ }
+ *exit_qual = payload;
+ } else
+ *exit_qual = 0;
+ return 1;
}
return 0;
@@ -3322,6 +3342,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
u32 error_code = vcpu->arch.exception.error_code;
u32 intr_info = nr | INTR_INFO_VALID_MASK;
+ kvm_deliver_exception_payload(vcpu);
+
if (has_error_code) {
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -4393,9 +4415,7 @@ static void kvm_cpu_vmxon(u64 addr)
cr4_set_bits(X86_CR4_VMXE);
intel_pt_handle_vmx(1);
- asm volatile (ASM_VMX_VMXON_RAX
- : : "a"(&addr), "m"(addr)
- : "memory", "cc");
+ asm volatile ("vmxon %0" : : "m"(addr));
}
static int hardware_enable(void)
@@ -4464,7 +4484,7 @@ static void vmclear_local_loaded_vmcss(void)
*/
static void kvm_cpu_vmxoff(void)
{
- asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
+ asm volatile (__ex("vmxoff"));
intel_pt_handle_vmx(0);
cr4_clear_bits(X86_CR4_VMXE);
@@ -5108,9 +5128,10 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
bool invalidate_gpa)
{
if (enable_ept && (invalidate_gpa || !enable_vpid)) {
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
return;
- ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
+ ept_sync_context(construct_eptp(vcpu,
+ vcpu->arch.mmu->root_hpa));
} else {
vpid_sync_context(vpid);
}
@@ -5260,7 +5281,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long hw_cr0;
- hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
+ hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
if (enable_unrestricted_guest)
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
else {
@@ -6335,6 +6356,9 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
rdmsr(MSR_IA32_CR_PAT, low32, high32);
vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
}
+
+ if (cpu_has_load_ia32_efer)
+ vmcs_write64(HOST_IA32_EFER, host_efer);
}
static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
@@ -6662,7 +6686,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
if (enable_pml) {
- ASSERT(vmx->pml_pg);
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}
@@ -8063,35 +8086,39 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
/*
* The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
- * set the success or error code of an emulated VMX instruction, as specified
- * by Vol 2B, VMX Instruction Reference, "Conventions".
+ * set the success or error code of an emulated VMX instruction (as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
+ * instruction.
*/
-static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
+static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+ return kvm_skip_emulated_instruction(vcpu);
}
-static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
X86_EFLAGS_SF | X86_EFLAGS_OF))
| X86_EFLAGS_CF);
+ return kvm_skip_emulated_instruction(vcpu);
}
-static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
- u32 vm_instruction_error)
+static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
+ u32 vm_instruction_error)
{
- if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
- /*
- * failValid writes the error number to the current VMCS, which
- * can't be done there isn't a current VMCS.
- */
- nested_vmx_failInvalid(vcpu);
- return;
- }
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * failValid writes the error number to the current VMCS, which
+ * can't be done if there isn't a current VMCS.
+ */
+ if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
+ return nested_vmx_failInvalid(vcpu);
+
vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
X86_EFLAGS_SF | X86_EFLAGS_OF))
@@ -8101,6 +8128,7 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
* We don't need to force a shadow sync because
* VM_INSTRUCTION_ERROR is not shadowed
*/
+ return kvm_skip_emulated_instruction(vcpu);
}
static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
@@ -8288,6 +8316,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
vmx->nested.vpid02 = allocate_vpid();
+ vmx->nested.vmcs02_initialized = false;
vmx->nested.vmxon = true;
return 0;
@@ -8341,10 +8370,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return 1;
}
- if (vmx->nested.vmxon) {
- nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
- return kvm_skip_emulated_instruction(vcpu);
- }
+ if (vmx->nested.vmxon)
+ return nested_vmx_failValid(vcpu,
+ VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
!= VMXON_NEEDED_FEATURES) {
@@ -8363,21 +8391,17 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
* Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
* which replaces physical address width with 32
*/
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
- nested_vmx_failInvalid(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
- }
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+ return nested_vmx_failInvalid(vcpu);
page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
- if (is_error_page(page)) {
- nested_vmx_failInvalid(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
- }
+ if (is_error_page(page))
+ return nested_vmx_failInvalid(vcpu);
+
if (*(u32 *)kmap(page) != VMCS12_REVISION) {
kunmap(page);
kvm_release_page_clean(page);
- nested_vmx_failInvalid(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
+ return nested_vmx_failInvalid(vcpu);
}
kunmap(page);
kvm_release_page_clean(page);
@@ -8387,8 +8411,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
if (ret)
return ret;
- nested_vmx_succeed(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
+ return nested_vmx_succeed(vcpu);
}
/*
@@ -8419,8 +8442,24 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
vmcs_write64(VMCS_LINK_POINTER, -1ull);
}
-static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
+static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!vmx->nested.hv_evmcs)
+ return;
+
+ kunmap(vmx->nested.hv_evmcs_page);
+ kvm_release_page_dirty(vmx->nested.hv_evmcs_page);
+ vmx->nested.hv_evmcs_vmptr = -1ull;
+ vmx->nested.hv_evmcs_page = NULL;
+ vmx->nested.hv_evmcs = NULL;
+}
+
+static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
if (vmx->nested.current_vmptr == -1ull)
return;
@@ -8428,16 +8467,18 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
/* copy to memory all shadowed fields in case
they were modified */
copy_shadow_to_vmcs12(vmx);
- vmx->nested.sync_shadow_vmcs = false;
+ vmx->nested.need_vmcs12_sync = false;
vmx_disable_shadow_vmcs(vmx);
}
vmx->nested.posted_intr_nv = -1;
/* Flush VMCS12 to guest memory */
- kvm_vcpu_write_guest_page(&vmx->vcpu,
+ kvm_vcpu_write_guest_page(vcpu,
vmx->nested.current_vmptr >> PAGE_SHIFT,
vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
+ kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+
vmx->nested.current_vmptr = -1ull;
}
@@ -8445,8 +8486,10 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
* Free whatever needs to be freed from vmx->nested when L1 goes down, or
* just stops using VMX.
*/
-static void free_nested(struct vcpu_vmx *vmx)
+static void free_nested(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
return;
@@ -8479,6 +8522,10 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->nested.pi_desc = NULL;
}
+ kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+
+ nested_release_evmcs(vcpu);
+
free_loaded_vmcs(&vmx->nested.vmcs02);
}
@@ -8487,9 +8534,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
{
if (!nested_vmx_check_permission(vcpu))
return 1;
- free_nested(to_vmx(vcpu));
- nested_vmx_succeed(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
+ free_nested(vcpu);
+ return nested_vmx_succeed(vcpu);
}
/* Emulate the VMCLEAR instruction */
@@ -8505,25 +8551,28 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
- nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
- return kvm_skip_emulated_instruction(vcpu);
- }
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+ return nested_vmx_failValid(vcpu,
+ VMXERR_VMCLEAR_INVALID_ADDRESS);
- if (vmptr == vmx->nested.vmxon_ptr) {
- nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
- return kvm_skip_emulated_instruction(vcpu);
- }
+ if (vmptr == vmx->nested.vmxon_ptr)
+ return nested_vmx_failValid(vcpu,
+ VMXERR_VMCLEAR_VMXON_POINTER);
- if (vmptr == vmx->nested.current_vmptr)
- nested_release_vmcs12(vmx);
+ if (vmx->nested.hv_evmcs_page) {
+ if (vmptr == vmx->nested.hv_evmcs_vmptr)
+ nested_release_evmcs(vcpu);
+ } else {
+ if (vmptr == vmx->nested.current_vmptr)
+ nested_release_vmcs12(vcpu);
- kvm_vcpu_write_guest(vcpu,
- vmptr + offsetof(struct vmcs12, launch_state),
- &zero, sizeof(zero));
+ kvm_vcpu_write_guest(vcpu,
+ vmptr + offsetof(struct vmcs12,
+ launch_state),
+ &zero, sizeof(zero));
+ }
- nested_vmx_succeed(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
+ return nested_vmx_succeed(vcpu);
}
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
@@ -8606,6 +8655,395 @@ static inline int vmcs12_write_any(struct vmcs12 *vmcs12,
}
+static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
+{
+ struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
+ struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+
+ vmcs12->hdr.revision_id = evmcs->revision_id;
+
+ /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
+ vmcs12->tpr_threshold = evmcs->tpr_threshold;
+ vmcs12->guest_rip = evmcs->guest_rip;
+
+ if (unlikely(!(evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
+ vmcs12->guest_rsp = evmcs->guest_rsp;
+ vmcs12->guest_rflags = evmcs->guest_rflags;
+ vmcs12->guest_interruptibility_info =
+ evmcs->guest_interruptibility_info;
+ }
+
+ if (unlikely(!(evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
+ vmcs12->cpu_based_vm_exec_control =
+ evmcs->cpu_based_vm_exec_control;
+ }
+
+ if (unlikely(!(evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
+ vmcs12->exception_bitmap = evmcs->exception_bitmap;
+ }
+
+ if (unlikely(!(evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
+ vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
+ }
+
+ if (unlikely(!(evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
+ vmcs12->vm_entry_intr_info_field =
+ evmcs->vm_entry_intr_info_field;
+ vmcs12->vm_entry_exception_error_code =
+ evmcs->vm_entry_exception_error_code;
+ vmcs12->vm_entry_instruction_len =
+ evmcs->vm_entry_instruction_len;
+ }
+
+ if (unlikely(!(evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
+ vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
+ vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
+ vmcs12->host_cr0 = evmcs->host_cr0;
+ vmcs12->host_cr3 = evmcs->host_cr3;
+ vmcs12->host_cr4 = evmcs->host_cr4;
+ vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
+ vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
+ vmcs12->host_rip = evmcs->host_rip;
+ vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
+ vmcs12->host_es_selector = evmcs->host_es_selector;
+ vmcs12->host_cs_selector = evmcs->host_cs_selector;
+ vmcs12->host_ss_selector = evmcs->host_ss_selector;
+ vmcs12->host_ds_selector = evmcs->host_ds_selector;
+ vmcs12->host_fs_selector = evmcs->host_fs_selector;
+ vmcs12->host_gs_selector = evmcs->host_gs_selector;
+ vmcs12->host_tr_selector = evmcs->host_tr_selector;
+ }
+
+ if (unlikely(!(evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
+ vmcs12->pin_based_vm_exec_control =
+ evmcs->pin_based_vm_exec_control;
+ vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
+ vmcs12->secondary_vm_exec_control =
+ evmcs->secondary_vm_exec_control;
+ }
+
+ if (unlikely(!(evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
+ vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
+ vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
+ }
+
+ if (unlikely(!(evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
+ vmcs12->msr_bitmap = evmcs->msr_bitmap;
+ }
+
+ if (unlikely(!(evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
+ vmcs12->guest_es_base = evmcs->guest_es_base;
+ vmcs12->guest_cs_base = evmcs->guest_cs_base;
+ vmcs12->guest_ss_base = evmcs->guest_ss_base;
+ vmcs12->guest_ds_base = evmcs->guest_ds_base;
+ vmcs12->guest_fs_base = evmcs->guest_fs_base;
+ vmcs12->guest_gs_base = evmcs->guest_gs_base;
+ vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
+ vmcs12->guest_tr_base = evmcs->guest_tr_base;
+ vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
+ vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
+ vmcs12->guest_es_limit = evmcs->guest_es_limit;
+ vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
+ vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
+ vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
+ vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
+ vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
+ vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
+ vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
+ vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
+ vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
+ vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
+ vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
+ vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
+ vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
+ vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
+ vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
+ vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
+ vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
+ vmcs12->guest_es_selector = evmcs->guest_es_selector;
+ vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
+ vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
+ vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
+ vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
+ vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
+ vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
+ vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
+ }
+
+ if (unlikely(!(evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
+ vmcs12->tsc_offset = evmcs->tsc_offset;
+ vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
+ vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
+ }
+
+ if (unlikely(!(evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
+ vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
+ vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
+ vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
+ vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
+ vmcs12->guest_cr0 = evmcs->guest_cr0;
+ vmcs12->guest_cr3 = evmcs->guest_cr3;
+ vmcs12->guest_cr4 = evmcs->guest_cr4;
+ vmcs12->guest_dr7 = evmcs->guest_dr7;
+ }
+
+ if (unlikely(!(evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
+ vmcs12->host_fs_base = evmcs->host_fs_base;
+ vmcs12->host_gs_base = evmcs->host_gs_base;
+ vmcs12->host_tr_base = evmcs->host_tr_base;
+ vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
+ vmcs12->host_idtr_base = evmcs->host_idtr_base;
+ vmcs12->host_rsp = evmcs->host_rsp;
+ }
+
+ if (unlikely(!(evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
+ vmcs12->ept_pointer = evmcs->ept_pointer;
+ vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
+ }
+
+ if (unlikely(!(evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
+ vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
+ vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
+ vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
+ vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
+ vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
+ vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
+ vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
+ vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
+ vmcs12->guest_pending_dbg_exceptions =
+ evmcs->guest_pending_dbg_exceptions;
+ vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
+ vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
+ vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
+ vmcs12->guest_activity_state = evmcs->guest_activity_state;
+ vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
+ }
+
+ /*
+ * Not used?
+ * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
+ * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
+ * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
+ * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
+ * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
+ * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
+ * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
+ * vmcs12->page_fault_error_code_mask =
+ * evmcs->page_fault_error_code_mask;
+ * vmcs12->page_fault_error_code_match =
+ * evmcs->page_fault_error_code_match;
+ * vmcs12->cr3_target_count = evmcs->cr3_target_count;
+ * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
+ * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
+ * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
+ */
+
+ /*
+ * Read only fields:
+ * vmcs12->guest_physical_address = evmcs->guest_physical_address;
+ * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
+ * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
+ * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
+ * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
+ * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
+ * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
+ * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
+ * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
+ * vmcs12->exit_qualification = evmcs->exit_qualification;
+ * vmcs12->guest_linear_address = evmcs->guest_linear_address;
+ *
+ * Not present in struct vmcs12:
+ * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
+ * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
+ * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
+ * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
+ */
+
+ return 0;
+}
+
+static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
+{
+ struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
+ struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+
+ /*
+ * Should not be changed by KVM:
+ *
+ * evmcs->host_es_selector = vmcs12->host_es_selector;
+ * evmcs->host_cs_selector = vmcs12->host_cs_selector;
+ * evmcs->host_ss_selector = vmcs12->host_ss_selector;
+ * evmcs->host_ds_selector = vmcs12->host_ds_selector;
+ * evmcs->host_fs_selector = vmcs12->host_fs_selector;
+ * evmcs->host_gs_selector = vmcs12->host_gs_selector;
+ * evmcs->host_tr_selector = vmcs12->host_tr_selector;
+ * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
+ * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
+ * evmcs->host_cr0 = vmcs12->host_cr0;
+ * evmcs->host_cr3 = vmcs12->host_cr3;
+ * evmcs->host_cr4 = vmcs12->host_cr4;
+ * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
+ * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
+ * evmcs->host_rip = vmcs12->host_rip;
+ * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
+ * evmcs->host_fs_base = vmcs12->host_fs_base;
+ * evmcs->host_gs_base = vmcs12->host_gs_base;
+ * evmcs->host_tr_base = vmcs12->host_tr_base;
+ * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
+ * evmcs->host_idtr_base = vmcs12->host_idtr_base;
+ * evmcs->host_rsp = vmcs12->host_rsp;
+ * sync_vmcs12() doesn't read these:
+ * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
+ * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
+ * evmcs->msr_bitmap = vmcs12->msr_bitmap;
+ * evmcs->ept_pointer = vmcs12->ept_pointer;
+ * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
+ * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
+ * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
+ * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
+ * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
+ * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
+ * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
+ * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
+ * evmcs->tpr_threshold = vmcs12->tpr_threshold;
+ * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
+ * evmcs->exception_bitmap = vmcs12->exception_bitmap;
+ * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
+ * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
+ * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
+ * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
+ * evmcs->page_fault_error_code_mask =
+ * vmcs12->page_fault_error_code_mask;
+ * evmcs->page_fault_error_code_match =
+ * vmcs12->page_fault_error_code_match;
+ * evmcs->cr3_target_count = vmcs12->cr3_target_count;
+ * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
+ * evmcs->tsc_offset = vmcs12->tsc_offset;
+ * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
+ * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
+ * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
+ * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
+ * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
+ * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
+ * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
+ * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
+ *
+ * Not present in struct vmcs12:
+ * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
+ * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
+ * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
+ * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
+ */
+
+ evmcs->guest_es_selector = vmcs12->guest_es_selector;
+ evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
+ evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
+ evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
+ evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
+ evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
+ evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
+ evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
+
+ evmcs->guest_es_limit = vmcs12->guest_es_limit;
+ evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
+ evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
+ evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
+ evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
+ evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
+ evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
+ evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
+ evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
+ evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
+
+ evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
+ evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
+ evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
+ evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
+ evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
+ evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
+ evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
+ evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
+
+ evmcs->guest_es_base = vmcs12->guest_es_base;
+ evmcs->guest_cs_base = vmcs12->guest_cs_base;
+ evmcs->guest_ss_base = vmcs12->guest_ss_base;
+ evmcs->guest_ds_base = vmcs12->guest_ds_base;
+ evmcs->guest_fs_base = vmcs12->guest_fs_base;
+ evmcs->guest_gs_base = vmcs12->guest_gs_base;
+ evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
+ evmcs->guest_tr_base = vmcs12->guest_tr_base;
+ evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
+ evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
+
+ evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
+ evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
+
+ evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
+ evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
+ evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
+ evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
+
+ evmcs->guest_pending_dbg_exceptions =
+ vmcs12->guest_pending_dbg_exceptions;
+ evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
+ evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
+
+ evmcs->guest_activity_state = vmcs12->guest_activity_state;
+ evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
+
+ evmcs->guest_cr0 = vmcs12->guest_cr0;
+ evmcs->guest_cr3 = vmcs12->guest_cr3;
+ evmcs->guest_cr4 = vmcs12->guest_cr4;
+ evmcs->guest_dr7 = vmcs12->guest_dr7;
+
+ evmcs->guest_physical_address = vmcs12->guest_physical_address;
+
+ evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
+ evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
+ evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
+ evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
+ evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
+ evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
+ evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
+ evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
+
+ evmcs->exit_qualification = vmcs12->exit_qualification;
+
+ evmcs->guest_linear_address = vmcs12->guest_linear_address;
+ evmcs->guest_rsp = vmcs12->guest_rsp;
+ evmcs->guest_rflags = vmcs12->guest_rflags;
+
+ evmcs->guest_interruptibility_info =
+ vmcs12->guest_interruptibility_info;
+ evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
+ evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
+ evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
+ evmcs->vm_entry_exception_error_code =
+ vmcs12->vm_entry_exception_error_code;
+ evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
+
+ evmcs->guest_rip = vmcs12->guest_rip;
+
+ evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
+
+ return 0;
+}
+
/*
* Copy the writable VMCS shadow fields back to the VMCS12, in case
* they have been modified by the L1 guest. Note that the "read-only"
@@ -8679,20 +9117,6 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
vmcs_load(vmx->loaded_vmcs->vmcs);
}
-/*
- * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
- * used before) all generate the same failure when it is missing.
- */
-static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (vmx->nested.current_vmptr == -1ull) {
- nested_vmx_failInvalid(vcpu);
- return 0;
- }
- return 1;
-}
-
static int handle_vmread(struct kvm_vcpu *vcpu)
{
unsigned long field;
@@ -8705,8 +9129,8 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (!nested_vmx_check_vmcs12(vcpu))
- return kvm_skip_emulated_instruction(vcpu);
+ if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
+ return nested_vmx_failInvalid(vcpu);
if (!is_guest_mode(vcpu))
vmcs12 = get_vmcs12(vcpu);
@@ -8715,20 +9139,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
* When vmcs->vmcs_link_pointer is -1ull, any VMREAD
* to shadowed-field sets the ALU flags for VMfailInvalid.
*/
- if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
- nested_vmx_failInvalid(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
- }
+ if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
+ return nested_vmx_failInvalid(vcpu);
vmcs12 = get_shadow_vmcs12(vcpu);
}
/* Decode instruction info and find the field to read */
field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
/* Read the field, zero-extended to a u64 field_value */
- if (vmcs12_read_any(vmcs12, field, &field_value) < 0) {
- nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
- return kvm_skip_emulated_instruction(vcpu);
- }
+ if (vmcs12_read_any(vmcs12, field, &field_value) < 0)
+ return nested_vmx_failValid(vcpu,
+ VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
/*
* Now copy part of this value to register or memory, as requested.
* Note that the number of bits actually copied is 32 or 64 depending
@@ -8746,8 +9168,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
(is_long_mode(vcpu) ? 8 : 4), NULL);
}
- nested_vmx_succeed(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
+ return nested_vmx_succeed(vcpu);
}
@@ -8772,8 +9193,8 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (!nested_vmx_check_vmcs12(vcpu))
- return kvm_skip_emulated_instruction(vcpu);
+ if (vmx->nested.current_vmptr == -1ull)
+ return nested_vmx_failInvalid(vcpu);
if (vmx_instruction_info & (1u << 10))
field_value = kvm_register_readl(vcpu,
@@ -8796,11 +9217,9 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
* VMCS," then the "read-only" fields are actually read/write.
*/
if (vmcs_field_readonly(field) &&
- !nested_cpu_has_vmwrite_any_field(vcpu)) {
- nested_vmx_failValid(vcpu,
+ !nested_cpu_has_vmwrite_any_field(vcpu))
+ return nested_vmx_failValid(vcpu,
VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
- return kvm_skip_emulated_instruction(vcpu);
- }
if (!is_guest_mode(vcpu))
vmcs12 = get_vmcs12(vcpu);
@@ -8809,18 +9228,14 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
* When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
* to shadowed-field sets the ALU flags for VMfailInvalid.
*/
- if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
- nested_vmx_failInvalid(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
- }
+ if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
+ return nested_vmx_failInvalid(vcpu);
vmcs12 = get_shadow_vmcs12(vcpu);
-
}
- if (vmcs12_write_any(vmcs12, field, field_value) < 0) {
- nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
- return kvm_skip_emulated_instruction(vcpu);
- }
+ if (vmcs12_write_any(vmcs12, field, field_value) < 0)
+ return nested_vmx_failValid(vcpu,
+ VMXERR_UNSUPPORTED_VMCS_COMPONENT);
/*
* Do not track vmcs12 dirty-state if in guest-mode
@@ -8842,8 +9257,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
}
}
- nested_vmx_succeed(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
+ return nested_vmx_succeed(vcpu);
}
static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
@@ -8854,7 +9268,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
SECONDARY_EXEC_SHADOW_VMCS);
vmcs_write64(VMCS_LINK_POINTER,
__pa(vmx->vmcs01.shadow_vmcs));
- vmx->nested.sync_shadow_vmcs = true;
+ vmx->nested.need_vmcs12_sync = true;
}
vmx->nested.dirty_vmcs12 = true;
}
@@ -8871,36 +9285,37 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
- nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
- return kvm_skip_emulated_instruction(vcpu);
- }
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+ return nested_vmx_failValid(vcpu,
+ VMXERR_VMPTRLD_INVALID_ADDRESS);
- if (vmptr == vmx->nested.vmxon_ptr) {
- nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
- return kvm_skip_emulated_instruction(vcpu);
- }
+ if (vmptr == vmx->nested.vmxon_ptr)
+ return nested_vmx_failValid(vcpu,
+ VMXERR_VMPTRLD_VMXON_POINTER);
+
+ /* Forbid normal VMPTRLD if Enlightened version was used */
+ if (vmx->nested.hv_evmcs)
+ return 1;
if (vmx->nested.current_vmptr != vmptr) {
struct vmcs12 *new_vmcs12;
struct page *page;
page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
- if (is_error_page(page)) {
- nested_vmx_failInvalid(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
- }
+ if (is_error_page(page))
+ return nested_vmx_failInvalid(vcpu);
+
new_vmcs12 = kmap(page);
if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
(new_vmcs12->hdr.shadow_vmcs &&
!nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
kunmap(page);
kvm_release_page_clean(page);
- nested_vmx_failValid(vcpu,
+ return nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
- return kvm_skip_emulated_instruction(vcpu);
}
- nested_release_vmcs12(vmx);
+ nested_release_vmcs12(vcpu);
+
/*
* Load VMCS12 from guest memory since it is not already
* cached.
@@ -8912,8 +9327,71 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
set_current_vmptr(vmx, vmptr);
}
- nested_vmx_succeed(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
+ return nested_vmx_succeed(vcpu);
+}
+
+/*
+ * This is an equivalent of the nested hypervisor executing the vmptrld
+ * instruction.
+ */
+static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
+ bool from_launch)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct hv_vp_assist_page assist_page;
+
+ if (likely(!vmx->nested.enlightened_vmcs_enabled))
+ return 1;
+
+ if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
+ return 1;
+
+ if (unlikely(!assist_page.enlighten_vmentry))
+ return 1;
+
+ if (unlikely(assist_page.current_nested_vmcs !=
+ vmx->nested.hv_evmcs_vmptr)) {
+
+ if (!vmx->nested.hv_evmcs)
+ vmx->nested.current_vmptr = -1ull;
+
+ nested_release_evmcs(vcpu);
+
+ vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page(
+ vcpu, assist_page.current_nested_vmcs);
+
+ if (unlikely(is_error_page(vmx->nested.hv_evmcs_page)))
+ return 0;
+
+ vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
+
+ if (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION) {
+ nested_release_evmcs(vcpu);
+ return 0;
+ }
+
+ vmx->nested.dirty_vmcs12 = true;
+ /*
+ * As we keep L2 state for one guest only 'hv_clean_fields' mask
+ * can't be used when we switch between them. Reset it here for
+ * simplicity.
+ */
+ vmx->nested.hv_evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+ vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs;
+
+ /*
+ * Unlike normal vmcs12, enlightened vmcs12 is not fully
+ * reloaded from guest's memory (read only fields, fields not
+ * present in struct hv_enlightened_vmcs, ...). Make sure there
+ * are no leftovers.
+ */
+ if (from_launch)
+ memset(vmx->nested.cached_vmcs12, 0,
+ sizeof(*vmx->nested.cached_vmcs12));
+
+ }
+ return 1;
}
/* Emulate the VMPTRST instruction */
@@ -8928,6 +9406,9 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
+ if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
+ return 1;
+
if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
return 1;
/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
@@ -8936,8 +9417,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
kvm_inject_page_fault(vcpu, &e);
return 1;
}
- nested_vmx_succeed(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
+ return nested_vmx_succeed(vcpu);
}
/* Emulate the INVEPT instruction */
@@ -8967,11 +9447,9 @@ static int handle_invept(struct kvm_vcpu *vcpu)
types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
- if (type >= 32 || !(types & (1 << type))) {
- nested_vmx_failValid(vcpu,
+ if (type >= 32 || !(types & (1 << type)))
+ return nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
- return kvm_skip_emulated_instruction(vcpu);
- }
/* According to the Intel VMX instruction reference, the memory
* operand is read even if it isn't needed (e.g., for type==global)
@@ -8993,14 +9471,20 @@ static int handle_invept(struct kvm_vcpu *vcpu)
case VMX_EPT_EXTENT_CONTEXT:
kvm_mmu_sync_roots(vcpu);
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- nested_vmx_succeed(vcpu);
break;
default:
BUG_ON(1);
break;
}
- return kvm_skip_emulated_instruction(vcpu);
+ return nested_vmx_succeed(vcpu);
+}
+
+static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
}
static int handle_invvpid(struct kvm_vcpu *vcpu)
@@ -9014,6 +9498,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
u64 vpid;
u64 gla;
} operand;
+ u16 vpid02;
if (!(vmx->nested.msrs.secondary_ctls_high &
SECONDARY_EXEC_ENABLE_VPID) ||
@@ -9031,11 +9516,9 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
types = (vmx->nested.msrs.vpid_caps &
VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
- if (type >= 32 || !(types & (1 << type))) {
- nested_vmx_failValid(vcpu,
+ if (type >= 32 || !(types & (1 << type)))
+ return nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
- return kvm_skip_emulated_instruction(vcpu);
- }
/* according to the intel vmx instruction reference, the memory
* operand is read even if it isn't needed (e.g., for type==global)
@@ -9047,47 +9530,39 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
kvm_inject_page_fault(vcpu, &e);
return 1;
}
- if (operand.vpid >> 16) {
- nested_vmx_failValid(vcpu,
+ if (operand.vpid >> 16)
+ return nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
- return kvm_skip_emulated_instruction(vcpu);
- }
+ vpid02 = nested_get_vpid02(vcpu);
switch (type) {
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
if (!operand.vpid ||
- is_noncanonical_address(operand.gla, vcpu)) {
- nested_vmx_failValid(vcpu,
+ is_noncanonical_address(operand.gla, vcpu))
+ return nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
- return kvm_skip_emulated_instruction(vcpu);
- }
- if (cpu_has_vmx_invvpid_individual_addr() &&
- vmx->nested.vpid02) {
+ if (cpu_has_vmx_invvpid_individual_addr()) {
__invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
- vmx->nested.vpid02, operand.gla);
+ vpid02, operand.gla);
} else
- __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+ __vmx_flush_tlb(vcpu, vpid02, false);
break;
case VMX_VPID_EXTENT_SINGLE_CONTEXT:
case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
- if (!operand.vpid) {
- nested_vmx_failValid(vcpu,
+ if (!operand.vpid)
+ return nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
- return kvm_skip_emulated_instruction(vcpu);
- }
- __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+ __vmx_flush_tlb(vcpu, vpid02, false);
break;
case VMX_VPID_EXTENT_ALL_CONTEXT:
- __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+ __vmx_flush_tlb(vcpu, vpid02, false);
break;
default:
WARN_ON_ONCE(1);
return kvm_skip_emulated_instruction(vcpu);
}
- nested_vmx_succeed(vcpu);
-
- return kvm_skip_emulated_instruction(vcpu);
+ return nested_vmx_succeed(vcpu);
}
static int handle_invpcid(struct kvm_vcpu *vcpu)
@@ -9158,11 +9633,11 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
}
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3)
+ if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
== operand.pcid)
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
- kvm_mmu_free_roots(vcpu, roots_to_free);
+ kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
/*
* If neither the current cr3 nor any of the prev_roots use the
* given PCID, then nothing needs to be done here because a
@@ -9289,7 +9764,7 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
kvm_mmu_unload(vcpu);
mmu->ept_ad = accessed_dirty;
- mmu->base_role.ad_disabled = !accessed_dirty;
+ mmu->mmu_role.base.ad_disabled = !accessed_dirty;
vmcs12->ept_pointer = address;
/*
* TODO: Check what's the correct approach in case
@@ -9648,9 +10123,6 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
return false;
else if (is_page_fault(intr_info))
return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
- else if (is_no_device(intr_info) &&
- !(vmcs12->guest_cr0 & X86_CR0_TS))
- return false;
else if (is_debug(intr_info) &&
vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
@@ -10672,9 +11144,25 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmcs_write32(PLE_WINDOW, vmx->ple_window);
}
- if (vmx->nested.sync_shadow_vmcs) {
- copy_vmcs12_to_shadow(vmx);
- vmx->nested.sync_shadow_vmcs = false;
+ if (vmx->nested.need_vmcs12_sync) {
+ /*
+ * hv_evmcs may end up being not mapped after migration (when
+ * L2 was running), map it here to make sure vmcs12 changes are
+ * properly reflected.
+ */
+ if (vmx->nested.enlightened_vmcs_enabled &&
+ !vmx->nested.hv_evmcs)
+ nested_vmx_handle_enlightened_vmptrld(vcpu, false);
+
+ if (vmx->nested.hv_evmcs) {
+ copy_vmcs12_to_enlightened(vmx);
+ /* All fields are clean */
+ vmx->nested.hv_evmcs->hv_clean_fields |=
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+ } else {
+ copy_vmcs12_to_shadow(vmx);
+ }
+ vmx->nested.need_vmcs12_sync = false;
}
if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
@@ -10741,7 +11229,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
"mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
"jmp 1f \n\t"
"2: \n\t"
- __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
+ __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
"1: \n\t"
/* Reload cr2 if changed */
"mov %c[cr2](%0), %%" _ASM_AX " \n\t"
@@ -10773,9 +11261,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
/* Enter guest mode */
"jne 1f \n\t"
- __ex(ASM_VMX_VMLAUNCH) "\n\t"
+ __ex("vmlaunch") "\n\t"
"jmp 2f \n\t"
- "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
+ "1: " __ex("vmresume") "\n\t"
"2: "
/* Save guest registers, load host registers, keep flags */
"mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
@@ -10797,6 +11285,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
"mov %%r13, %c[r13](%0) \n\t"
"mov %%r14, %c[r14](%0) \n\t"
"mov %%r15, %c[r15](%0) \n\t"
+ /*
+ * Clear host registers marked as clobbered to prevent
+ * speculative use.
+ */
"xor %%r8d, %%r8d \n\t"
"xor %%r9d, %%r9d \n\t"
"xor %%r10d, %%r10d \n\t"
@@ -10954,6 +11446,10 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
vmx->loaded_vmcs = vmcs;
vmx_vcpu_load(vcpu, cpu);
put_cpu();
+
+ vm_entry_controls_reset_shadow(vmx);
+ vm_exit_controls_reset_shadow(vmx);
+ vmx_segment_cache_clear(vmx);
}
/*
@@ -10962,12 +11458,10 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
*/
static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- vcpu_load(vcpu);
- vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- free_nested(vmx);
- vcpu_put(vcpu);
+ vcpu_load(vcpu);
+ vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
+ free_nested(vcpu);
+ vcpu_put(vcpu);
}
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
@@ -11330,28 +11824,28 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
return get_vmcs12(vcpu)->ept_pointer;
}
-static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
WARN_ON(mmu_is_nested(vcpu));
- if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
- return 1;
+ vcpu->arch.mmu = &vcpu->arch.guest_mmu;
kvm_init_shadow_ept_mmu(vcpu,
to_vmx(vcpu)->nested.msrs.ept_caps &
VMX_EPT_EXECUTE_ONLY_BIT,
nested_ept_ad_enabled(vcpu),
nested_ept_get_cr3(vcpu));
- vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
- vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
- vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+ vcpu->arch.mmu->set_cr3 = vmx_set_cr3;
+ vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3;
+ vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
+ vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
- return 0;
}
static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
- vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+ vcpu->arch.mmu = &vcpu->arch.root_mmu;
+ vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
@@ -11768,15 +12262,12 @@ static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- u64 address = vmcs12->pml_address;
- int maxphyaddr = cpuid_maxphyaddr(vcpu);
+ if (!nested_cpu_has_pml(vmcs12))
+ return 0;
- if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
- if (!nested_cpu_has_ept(vmcs12) ||
- !IS_ALIGNED(address, 4096) ||
- address >> maxphyaddr)
- return -EINVAL;
- }
+ if (!nested_cpu_has_ept(vmcs12) ||
+ !page_address_valid(vcpu, vmcs12->pml_address))
+ return -EINVAL;
return 0;
}
@@ -11956,112 +12447,87 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
return 0;
}
-static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+/*
+ * Returns if KVM is able to config CPU to tag TLB entries
+ * populated by L2 differently than TLB entries populated
+ * by L1.
+ *
+ * If L1 uses EPT, then TLB entries are tagged with different EPTP.
+ *
+ * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
+ * with different VPID (L1 entries are tagged with vmx->vpid
+ * while L2 entries are tagged with vmx->nested.vpid02).
+ */
+static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
- vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
- vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
- vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
- vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
- vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
- vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
- vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
- vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
- vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
- vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
- vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
- vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
- vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
- vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
- vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
- vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
- vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
- vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
- vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
- vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
- vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
- vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
- vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
- vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
- vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
- vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
- vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
- vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
- vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
- vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
-
- vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
- vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
- vmcs12->guest_pending_dbg_exceptions);
- vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
- vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+ return nested_cpu_has_ept(vmcs12) ||
+ (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
+}
- if (nested_cpu_has_xsaves(vmcs12))
- vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
- vmcs_write64(VMCS_LINK_POINTER, -1ull);
+static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+{
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
+ return vmcs12->guest_ia32_efer;
+ else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+ return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
+ else
+ return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
+}
- if (cpu_has_vmx_posted_intr())
- vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
+static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
+{
+ /*
+ * If vmcs02 hasn't been initialized, set the constant vmcs02 state
+ * according to L0's settings (vmcs12 is irrelevant here). Host
+ * fields that come from L0 and are not constant, e.g. HOST_CR3,
+ * will be set as needed prior to VMLAUNCH/VMRESUME.
+ */
+ if (vmx->nested.vmcs02_initialized)
+ return;
+ vmx->nested.vmcs02_initialized = true;
/*
- * Whether page-faults are trapped is determined by a combination of
- * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
- * If enable_ept, L0 doesn't care about page faults and we should
- * set all of these to L1's desires. However, if !enable_ept, L0 does
- * care about (at least some) page faults, and because it is not easy
- * (if at all possible?) to merge L0 and L1's desires, we simply ask
- * to exit on each and every L2 page fault. This is done by setting
- * MASK=MATCH=0 and (see below) EB.PF=1.
- * Note that below we don't need special code to set EB.PF beyond the
- * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
- * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
- * !enable_ept, EB.PF is 1, so the "or" will always be 1.
+ * We don't care what the EPTP value is we just need to guarantee
+ * it's valid so we don't get a false positive when doing early
+ * consistency checks.
*/
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
- enable_ept ? vmcs12->page_fault_error_code_mask : 0);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
- enable_ept ? vmcs12->page_fault_error_code_match : 0);
+ if (enable_ept && nested_early_check)
+ vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
/* All VMFUNCs are currently emulated through L0 vmexits. */
if (cpu_has_vmx_vmfunc())
vmcs_write64(VM_FUNCTION_CONTROL, 0);
- if (cpu_has_vmx_apicv()) {
- vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
- vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
- vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
- vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
- }
+ if (cpu_has_vmx_posted_intr())
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
- /*
- * Set host-state according to L0's settings (vmcs12 is irrelevant here)
- * Some constant fields are set here by vmx_set_constant_host_state().
- * Other fields are different per CPU, and will be set later when
- * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest()
- * is called.
- */
- vmx_set_constant_host_state(vmx);
+ if (cpu_has_vmx_msr_bitmap())
+ vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
+
+ if (enable_pml)
+ vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
/*
- * Set the MSR load/store lists to match L0's settings.
+ * Set the MSR load/store lists to match L0's settings. Only the
+ * addresses are constant (for vmcs02), the counts can change based
+ * on L2's behavior, e.g. switching to/from long mode.
*/
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
- set_cr4_guest_host_mask(vmx);
+ vmx_set_constant_host_state(vmx);
+}
- if (kvm_mpx_supported()) {
- if (vmx->nested.nested_run_pending &&
- (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
- vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
- else
- vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
- }
+static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx,
+ struct vmcs12 *vmcs12)
+{
+ prepare_vmcs02_constant_state(vmx);
+
+ vmcs_write64(VMCS_LINK_POINTER, -1ull);
if (enable_vpid) {
if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
@@ -12069,78 +12535,30 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
else
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
}
-
- /*
- * L1 may access the L2's PDPTR, so save them to construct vmcs12
- */
- if (enable_ept) {
- vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
- vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
- vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
- vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
- }
-
- if (cpu_has_vmx_msr_bitmap())
- vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
}
-/*
- * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
- * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
- * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
- * guest in a way that will both be appropriate to L1's requests, and our
- * needs. In addition to modifying the active vmcs (which is vmcs02), this
- * function also has additional necessary side-effects, like setting various
- * vcpu->arch fields.
- * Returns 0 on success, 1 on failure. Invalid state exit qualification code
- * is assigned to entry_failure_code on failure.
- */
-static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
- u32 *entry_failure_code)
+static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 exec_control, vmcs12_exec_ctrl;
+ u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
- if (vmx->nested.dirty_vmcs12) {
- prepare_vmcs02_full(vcpu, vmcs12);
- vmx->nested.dirty_vmcs12 = false;
- }
+ if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
+ prepare_vmcs02_early_full(vmx, vmcs12);
/*
- * First, the fields that are shadowed. This must be kept in sync
- * with vmx_shadow_fields.h.
+ * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
+ * entry, but only if the current (host) sp changed from the value
+ * we wrote last (vmx->host_rsp). This cache is no longer relevant
+ * if we switch vmcs, and rather than hold a separate cache per vmcs,
+ * here we just force the write to happen on entry. host_rsp will
+ * also be written unconditionally by nested_vmx_check_vmentry_hw()
+ * if we are doing early consistency checks via hardware.
*/
+ vmx->host_rsp = 0;
- vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
- vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
- vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
- vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
- vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
-
- if (vmx->nested.nested_run_pending &&
- (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
- kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
- } else {
- kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
- vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
- }
- if (vmx->nested.nested_run_pending) {
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- vmcs12->vm_entry_intr_info_field);
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
- vmcs12->vm_entry_exception_error_code);
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
- vmcs12->vm_entry_instruction_len);
- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
- vmcs12->guest_interruptibility_info);
- vmx->loaded_vmcs->nmi_known_unmasked =
- !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
- } else {
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
- }
- vmx_set_rflags(vcpu, vmcs12->guest_rflags);
-
+ /*
+ * PIN CONTROLS
+ */
exec_control = vmcs12->pin_based_vm_exec_control;
/* Preemption timer setting is computed directly in vmx_vcpu_run. */
@@ -12155,13 +12573,43 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
} else {
exec_control &= ~PIN_BASED_POSTED_INTR;
}
-
vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
- vmx->nested.preemption_timer_expired = false;
- if (nested_cpu_has_preemption_timer(vmcs12))
- vmx_start_preemption_timer(vcpu);
+ /*
+ * EXEC CONTROLS
+ */
+ exec_control = vmx_exec_control(vmx); /* L0's desires */
+ exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+ exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+ exec_control &= ~CPU_BASED_TPR_SHADOW;
+ exec_control |= vmcs12->cpu_based_vm_exec_control;
+ /*
+ * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
+ * nested_get_vmcs12_pages can't fix it up, the illegal value
+ * will result in a VM entry failure.
+ */
+ if (exec_control & CPU_BASED_TPR_SHADOW) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
+ vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+ } else {
+#ifdef CONFIG_X86_64
+ exec_control |= CPU_BASED_CR8_LOAD_EXITING |
+ CPU_BASED_CR8_STORE_EXITING;
+#endif
+ }
+
+ /*
+ * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
+ * for I/O port accesses.
+ */
+ exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
+ exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+ /*
+ * SECONDARY EXEC CONTROLS
+ */
if (cpu_has_secondary_exec_ctrls()) {
exec_control = vmx->secondary_exec_control;
@@ -12202,43 +12650,214 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
}
/*
- * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
- * entry, but only if the current (host) sp changed from the value
- * we wrote last (vmx->host_rsp). This cache is no longer relevant
- * if we switch vmcs, and rather than hold a separate cache per vmcs,
- * here we just force the write to happen on entry.
+ * ENTRY CONTROLS
+ *
+ * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
+ * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
+ * on the related bits (if supported by the CPU) in the hope that
+ * we can avoid VMWrites during vmx_set_efer().
+ */
+ exec_control = (vmcs12->vm_entry_controls | vmcs_config.vmentry_ctrl) &
+ ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
+ if (cpu_has_load_ia32_efer) {
+ if (guest_efer & EFER_LMA)
+ exec_control |= VM_ENTRY_IA32E_MODE;
+ if (guest_efer != host_efer)
+ exec_control |= VM_ENTRY_LOAD_IA32_EFER;
+ }
+ vm_entry_controls_init(vmx, exec_control);
+
+ /*
+ * EXIT CONTROLS
+ *
+ * L2->L1 exit controls are emulated - the hardware exit is to L0 so
+ * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
+ * bits may be modified by vmx_set_efer() in prepare_vmcs02().
*/
- vmx->host_rsp = 0;
+ exec_control = vmcs_config.vmexit_ctrl;
+ if (cpu_has_load_ia32_efer && guest_efer != host_efer)
+ exec_control |= VM_EXIT_LOAD_IA32_EFER;
+ vm_exit_controls_init(vmx, exec_control);
- exec_control = vmx_exec_control(vmx); /* L0's desires */
- exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
- exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
- exec_control &= ~CPU_BASED_TPR_SHADOW;
- exec_control |= vmcs12->cpu_based_vm_exec_control;
+ /*
+ * Conceptually we want to copy the PML address and index from
+ * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
+ * since we always flush the log on each vmexit and never change
+ * the PML address (once set), this happens to be equivalent to
+ * simply resetting the index in vmcs02.
+ */
+ if (enable_pml)
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
/*
- * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
- * nested_get_vmcs12_pages can't fix it up, the illegal value
- * will result in a VM entry failure.
+ * Interrupt/Exception Fields
*/
- if (exec_control & CPU_BASED_TPR_SHADOW) {
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
- vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+ if (vmx->nested.nested_run_pending) {
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ vmcs12->vm_entry_intr_info_field);
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vmcs12->vm_entry_exception_error_code);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmcs12->vm_entry_instruction_len);
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+ vmcs12->guest_interruptibility_info);
+ vmx->loaded_vmcs->nmi_known_unmasked =
+ !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
} else {
-#ifdef CONFIG_X86_64
- exec_control |= CPU_BASED_CR8_LOAD_EXITING |
- CPU_BASED_CR8_STORE_EXITING;
-#endif
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+ }
+}
+
+static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+{
+ struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
+
+ if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
+ vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
+ vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
+ vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
+ vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
+ vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
+ vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
+ vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
+ vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
+ vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
+ vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
+ vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
+ vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
+ vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
+ vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
+ vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
+ vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
+ vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
+ vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
+ vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
+ vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
+ vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
+ vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
+ vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
+ vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
+ vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
+ vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
+ vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
+ vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
+ vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
+ vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
+ vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
+ vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
+ vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
+ vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+ }
+
+ if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
+ vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vmcs12->guest_pending_dbg_exceptions);
+ vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
+ vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+
+ /*
+ * L1 may access the L2's PDPTR, so save them to construct
+ * vmcs12
+ */
+ if (enable_ept) {
+ vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+ vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+ vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+ vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+ }
}
+ if (nested_cpu_has_xsaves(vmcs12))
+ vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
+
/*
- * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
- * for I/O port accesses.
+ * Whether page-faults are trapped is determined by a combination of
+ * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
+ * If enable_ept, L0 doesn't care about page faults and we should
+ * set all of these to L1's desires. However, if !enable_ept, L0 does
+ * care about (at least some) page faults, and because it is not easy
+ * (if at all possible?) to merge L0 and L1's desires, we simply ask
+ * to exit on each and every L2 page fault. This is done by setting
+ * MASK=MATCH=0 and (see below) EB.PF=1.
+ * Note that below we don't need special code to set EB.PF beyond the
+ * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
+ * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
+ * !enable_ept, EB.PF is 1, so the "or" will always be 1.
*/
- exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
- exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+ enable_ept ? vmcs12->page_fault_error_code_mask : 0);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+ enable_ept ? vmcs12->page_fault_error_code_match : 0);
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+ if (cpu_has_vmx_apicv()) {
+ vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
+ vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
+ vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
+ vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
+ }
+
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+
+ set_cr4_guest_host_mask(vmx);
+
+ if (kvm_mpx_supported()) {
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+ vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+ else
+ vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
+ }
+}
+
+/*
+ * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
+ * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
+ * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
+ * guest in a way that will both be appropriate to L1's requests, and our
+ * needs. In addition to modifying the active vmcs (which is vmcs02), this
+ * function also has additional necessary side-effects, like setting various
+ * vcpu->arch fields.
+ * Returns 0 on success, 1 on failure. Invalid state exit qualification code
+ * is assigned to entry_failure_code on failure.
+ */
+static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ u32 *entry_failure_code)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
+
+ if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) {
+ prepare_vmcs02_full(vmx, vmcs12);
+ vmx->nested.dirty_vmcs12 = false;
+ }
+
+ /*
+ * First, the fields that are shadowed. This must be kept in sync
+ * with vmx_shadow_fields.h.
+ */
+ if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
+ vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+ vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
+ }
+
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+ kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+ } else {
+ kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
+ }
+ vmx_set_rflags(vcpu, vmcs12->guest_rflags);
+
+ vmx->nested.preemption_timer_expired = false;
+ if (nested_cpu_has_preemption_timer(vmcs12))
+ vmx_start_preemption_timer(vcpu);
/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
* bitwise-or of what L1 wants to trap for L2, and what we want to
@@ -12248,20 +12867,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
- /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
- * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
- * bits are further modified by vmx_set_efer() below.
- */
- vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
-
- /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
- * emulated by vmx_set_efer(), below.
- */
- vm_entry_controls_init(vmx,
- (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
- ~VM_ENTRY_IA32E_MODE) |
- (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
-
if (vmx->nested.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
@@ -12284,37 +12889,29 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* influence global bitmap(for vpid01 and vpid02 allocation)
* even if spawn a lot of nested vCPUs.
*/
- if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
+ if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
vmx->nested.last_vpid = vmcs12->virtual_processor_id;
- __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+ __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
}
} else {
- vmx_flush_tlb(vcpu, true);
+ /*
+ * If L1 use EPT, then L0 needs to execute INVEPT on
+ * EPTP02 instead of EPTP01. Therefore, delay TLB
+ * flush until vmcs02->eptp is fully updated by
+ * KVM_REQ_LOAD_CR3. Note that this assumes
+ * KVM_REQ_TLB_FLUSH is evaluated after
+ * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
+ */
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
}
- if (enable_pml) {
- /*
- * Conceptually we want to copy the PML address and index from
- * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
- * since we always flush the log on each vmexit, this happens
- * to be equivalent to simply resetting the fields in vmcs02.
- */
- ASSERT(vmx->pml_pg);
- vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
- }
-
- if (nested_cpu_has_ept(vmcs12)) {
- if (nested_ept_init_mmu_context(vcpu)) {
- *entry_failure_code = ENTRY_FAIL_DEFAULT;
- return 1;
- }
- } else if (nested_cpu_has2(vmcs12,
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ if (nested_cpu_has_ept(vmcs12))
+ nested_ept_init_mmu_context(vcpu);
+ else if (nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
vmx_flush_tlb(vcpu, true);
- }
/*
* This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
@@ -12330,14 +12927,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmx_set_cr4(vcpu, vmcs12->guest_cr4);
vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
- if (vmx->nested.nested_run_pending &&
- (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
- vcpu->arch.efer = vmcs12->guest_ia32_efer;
- else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
- vcpu->arch.efer |= (EFER_LMA | EFER_LME);
- else
- vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
- /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
+ vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
+ /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
vmx_set_efer(vcpu, vcpu->arch.efer);
/*
@@ -12379,6 +12970,7 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool ia32e;
if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
@@ -12453,6 +13045,21 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
/*
+ * If the load IA32_EFER VM-exit control is 1, bits reserved in the
+ * IA32_EFER MSR must be 0 in the field for that register. In addition,
+ * the values of the LMA and LME bits in the field must each be that of
+ * the host address-space size VM-exit control.
+ */
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
+ ia32e = (vmcs12->vm_exit_controls &
+ VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
+ if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
+ ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
+ ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
+ return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
+ }
+
+ /*
* From the Intel SDM, volume 3:
* Fields relevant to VM-entry event injection must be set properly.
* These fields are the VM-entry interruption-information field, the
@@ -12508,6 +13115,10 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
}
}
+ if (nested_cpu_has_ept(vmcs12) &&
+ !valid_ept_address(vcpu, vmcs12->ept_pointer))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
return 0;
}
@@ -12573,21 +13184,6 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
return 1;
}
- /*
- * If the load IA32_EFER VM-exit control is 1, bits reserved in the
- * IA32_EFER MSR must be 0 in the field for that register. In addition,
- * the values of the LMA and LME bits in the field must each be that of
- * the host address-space size VM-exit control.
- */
- if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
- ia32e = (vmcs12->vm_exit_controls &
- VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
- if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
- ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
- ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
- return 1;
- }
-
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
(vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
@@ -12596,26 +13192,139 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
return 0;
}
+static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long cr3, cr4;
+
+ if (!nested_early_check)
+ return 0;
+
+ if (vmx->msr_autoload.host.nr)
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+ if (vmx->msr_autoload.guest.nr)
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+
+ preempt_disable();
+
+ vmx_prepare_switch_to_guest(vcpu);
+
+ /*
+ * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
+ * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
+ * be written (by preparve_vmcs02()) before the "real" VMEnter, i.e.
+ * there is no need to preserve other bits or save/restore the field.
+ */
+ vmcs_writel(GUEST_RFLAGS, 0);
+
+ vmcs_writel(HOST_RIP, vmx_early_consistency_check_return);
+
+ cr3 = __get_current_cr3_fast();
+ if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ vmx->loaded_vmcs->host_state.cr3 = cr3;
+ }
+
+ cr4 = cr4_read_shadow();
+ if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
+ vmcs_writel(HOST_CR4, cr4);
+ vmx->loaded_vmcs->host_state.cr4 = cr4;
+ }
+
+ vmx->__launched = vmx->loaded_vmcs->launched;
+
+ asm(
+ /* Set HOST_RSP */
+ __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
+ "mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t"
+
+ /* Check if vmlaunch of vmresume is needed */
+ "cmpl $0, %c[launched](%0)\n\t"
+ "je 1f\n\t"
+ __ex("vmresume") "\n\t"
+ "jmp 2f\n\t"
+ "1: " __ex("vmlaunch") "\n\t"
+ "jmp 2f\n\t"
+ "2: "
+
+ /* Set vmx->fail accordingly */
+ "setbe %c[fail](%0)\n\t"
+
+ ".pushsection .rodata\n\t"
+ ".global vmx_early_consistency_check_return\n\t"
+ "vmx_early_consistency_check_return: " _ASM_PTR " 2b\n\t"
+ ".popsection"
+ :
+ : "c"(vmx), "d"((unsigned long)HOST_RSP),
+ [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
+ [fail]"i"(offsetof(struct vcpu_vmx, fail)),
+ [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp))
+ : "rax", "cc", "memory"
+ );
+
+ vmcs_writel(HOST_RIP, vmx_return);
+
+ preempt_enable();
+
+ if (vmx->msr_autoload.host.nr)
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+ if (vmx->msr_autoload.guest.nr)
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+
+ if (vmx->fail) {
+ WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
+ VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ vmx->fail = 0;
+ return 1;
+ }
+
+ /*
+ * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
+ */
+ local_irq_enable();
+ if (hw_breakpoint_active())
+ set_debugreg(__this_cpu_read(cpu_dr7), 7);
+
+ /*
+ * A non-failing VMEntry means we somehow entered guest mode with
+ * an illegal RIP, and that's just the tip of the iceberg. There
+ * is no telling what memory has been modified or what state has
+ * been exposed to unknown code. Hitting this all but guarantees
+ * a (very critical) hardware issue.
+ */
+ WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
+ VMX_EXIT_REASONS_FAILED_VMENTRY));
+
+ return 0;
+}
+STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
+
+static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12);
+
/*
- * If exit_qual is NULL, this is being called from state restore (either RSM
+ * If from_vmentry is false, this is being called from state restore (either RSM
* or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
++ *
++ * Returns:
++ * 0 - success, i.e. proceed with actual VMEnter
++ * 1 - consistency check VMExit
++ * -1 - consistency check VMFail
*/
-static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
+static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+ bool from_vmentry)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- bool from_vmentry = !!exit_qual;
- u32 dummy_exit_qual;
bool evaluate_pending_interrupts;
- int r = 0;
+ u32 exit_reason = EXIT_REASON_INVALID_STATE;
+ u32 exit_qual;
evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
(CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
- enter_guest_mode(vcpu);
-
if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
if (kvm_mpx_supported() &&
@@ -12623,24 +13332,35 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
- vmx_segment_cache_clear(vmx);
+ prepare_vmcs02_early(vmx, vmcs12);
+
+ if (from_vmentry) {
+ nested_get_vmcs12_pages(vcpu);
+
+ if (nested_vmx_check_vmentry_hw(vcpu)) {
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ return -1;
+ }
+
+ if (check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
+ goto vmentry_fail_vmexit;
+ }
+
+ enter_guest_mode(vcpu);
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
vcpu->arch.tsc_offset += vmcs12->tsc_offset;
- r = EXIT_REASON_INVALID_STATE;
- if (prepare_vmcs02(vcpu, vmcs12, from_vmentry ? exit_qual : &dummy_exit_qual))
- goto fail;
+ if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
+ goto vmentry_fail_vmexit_guest_mode;
if (from_vmentry) {
- nested_get_vmcs12_pages(vcpu);
-
- r = EXIT_REASON_MSR_LOAD_FAIL;
- *exit_qual = nested_vmx_load_msr(vcpu,
- vmcs12->vm_entry_msr_load_addr,
- vmcs12->vm_entry_msr_load_count);
- if (*exit_qual)
- goto fail;
+ exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
+ exit_qual = nested_vmx_load_msr(vcpu,
+ vmcs12->vm_entry_msr_load_addr,
+ vmcs12->vm_entry_msr_load_count);
+ if (exit_qual)
+ goto vmentry_fail_vmexit_guest_mode;
} else {
/*
* The MMU is not initialized to point at the right entities yet and
@@ -12677,12 +13397,28 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
*/
return 0;
-fail:
+ /*
+ * A failed consistency check that leads to a VMExit during L1's
+ * VMEnter to L2 is a variation of a normal VMexit, as explained in
+ * 26.7 "VM-entry failures during or after loading guest state".
+ */
+vmentry_fail_vmexit_guest_mode:
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
leave_guest_mode(vcpu);
+
+vmentry_fail_vmexit:
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- return r;
+
+ if (!from_vmentry)
+ return 1;
+
+ load_vmcs12_host_state(vcpu, vmcs12);
+ vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
+ vmcs12->exit_qualification = exit_qual;
+ if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
+ vmx->nested.need_vmcs12_sync = true;
+ return 1;
}
/*
@@ -12694,14 +13430,16 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
struct vmcs12 *vmcs12;
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
- u32 exit_qual;
int ret;
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (!nested_vmx_check_vmcs12(vcpu))
- goto out;
+ if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true))
+ return 1;
+
+ if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
+ return nested_vmx_failInvalid(vcpu);
vmcs12 = get_vmcs12(vcpu);
@@ -12711,13 +13449,16 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
* rather than RFLAGS.ZF, and no error number is stored to the
* VM-instruction error field.
*/
- if (vmcs12->hdr.shadow_vmcs) {
- nested_vmx_failInvalid(vcpu);
- goto out;
- }
+ if (vmcs12->hdr.shadow_vmcs)
+ return nested_vmx_failInvalid(vcpu);
- if (enable_shadow_vmcs)
+ if (vmx->nested.hv_evmcs) {
+ copy_enlightened_to_vmcs12(vmx);
+ /* Enlightened VMCS doesn't have launch state */
+ vmcs12->launch_state = !launch;
+ } else if (enable_shadow_vmcs) {
copy_shadow_to_vmcs12(vmx);
+ }
/*
* The nested entry process starts with enforcing various prerequisites
@@ -12729,59 +13470,37 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
* for misconfigurations which will anyway be caught by the processor
* when using the merged vmcs02.
*/
- if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) {
- nested_vmx_failValid(vcpu,
- VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
- goto out;
- }
+ if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
+ return nested_vmx_failValid(vcpu,
+ VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
- if (vmcs12->launch_state == launch) {
- nested_vmx_failValid(vcpu,
+ if (vmcs12->launch_state == launch)
+ return nested_vmx_failValid(vcpu,
launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
: VMXERR_VMRESUME_NONLAUNCHED_VMCS);
- goto out;
- }
ret = check_vmentry_prereqs(vcpu, vmcs12);
- if (ret) {
- nested_vmx_failValid(vcpu, ret);
- goto out;
- }
-
- /*
- * After this point, the trap flag no longer triggers a singlestep trap
- * on the vm entry instructions; don't call kvm_skip_emulated_instruction.
- * This is not 100% correct; for performance reasons, we delegate most
- * of the checks on host state to the processor. If those fail,
- * the singlestep trap is missed.
- */
- skip_emulated_instruction(vcpu);
-
- ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual);
- if (ret) {
- nested_vmx_entry_failure(vcpu, vmcs12,
- EXIT_REASON_INVALID_STATE, exit_qual);
- return 1;
- }
+ if (ret)
+ return nested_vmx_failValid(vcpu, ret);
/*
* We're finally done with prerequisite checking, and can start with
* the nested entry.
*/
-
vmx->nested.nested_run_pending = 1;
- ret = enter_vmx_non_root_mode(vcpu, &exit_qual);
- if (ret) {
- nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual);
- vmx->nested.nested_run_pending = 0;
+ ret = nested_vmx_enter_non_root_mode(vcpu, true);
+ vmx->nested.nested_run_pending = !ret;
+ if (ret > 0)
return 1;
- }
+ else if (ret)
+ return nested_vmx_failValid(vcpu,
+ VMXERR_ENTRY_INVALID_CONTROL_FIELD);
/* Hide L1D cache contents from the nested guest. */
vmx->vcpu.arch.l1tf_flush_l1d = true;
/*
- * Must happen outside of enter_vmx_non_root_mode() as it will
+ * Must happen outside of nested_vmx_enter_non_root_mode() as it will
* also be used as part of restoring nVMX state for
* snapshot restore (migration).
*
@@ -12802,9 +13521,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
return kvm_vcpu_halt(vcpu);
}
return 1;
-
-out:
- return kvm_skip_emulated_instruction(vcpu);
}
/*
@@ -13118,24 +13834,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
kvm_clear_interrupt_queue(vcpu);
}
-static void load_vmcs12_mmu_host_state(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
-{
- u32 entry_failure_code;
-
- nested_ept_uninit_mmu_context(vcpu);
-
- /*
- * Only PDPTE load can fail as the value of cr3 was checked on entry and
- * couldn't have changed.
- */
- if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
- nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
-
- if (!enable_ept)
- vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
-}
-
/*
* A part of what we need to when the nested L2 guest exits and we want to
* run its L1 parent, is to reset L1's guest state to the host state specified
@@ -13149,6 +13847,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
struct kvm_segment seg;
+ u32 entry_failure_code;
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -13161,6 +13860,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ vmx_set_interrupt_shadow(vcpu, 0);
+
/*
* Note that calling vmx_set_cr0 is important, even if cr0 hasn't
* actually changed, because vmx_set_cr0 refers to efer set above.
@@ -13175,23 +13876,35 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
vmx_set_cr4(vcpu, vmcs12->host_cr4);
- load_vmcs12_mmu_host_state(vcpu, vmcs12);
+ nested_ept_uninit_mmu_context(vcpu);
+
+ /*
+ * Only PDPTE load can fail as the value of cr3 was checked on entry and
+ * couldn't have changed.
+ */
+ if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
+
+ if (!enable_ept)
+ vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
/*
- * If vmcs01 don't use VPID, CPU flushes TLB on every
+ * If vmcs01 doesn't use VPID, CPU flushes TLB on every
* VMEntry/VMExit. Thus, no need to flush TLB.
*
- * If vmcs12 uses VPID, TLB entries populated by L2 are
- * tagged with vmx->nested.vpid02 while L1 entries are tagged
- * with vmx->vpid. Thus, no need to flush TLB.
+ * If vmcs12 doesn't use VPID, L1 expects TLB to be
+ * flushed on every VMEntry/VMExit.
+ *
+ * Otherwise, we can preserve TLB entries as long as we are
+ * able to tag L1 TLB entries differently than L2 TLB entries.
*
- * Therefore, flush TLB only in case vmcs01 uses VPID and
- * vmcs12 don't use VPID as in this case L1 & L2 TLB entries
- * are both tagged with vmx->vpid.
+ * If vmcs12 uses EPT, we need to execute this flush on EPTP01
+ * and therefore we request the TLB flush to happen only after VMCS EPTP
+ * has been set by KVM_REQ_LOAD_CR3.
*/
if (enable_vpid &&
- !(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02)) {
- vmx_flush_tlb(vcpu, true);
+ (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
@@ -13271,6 +13984,140 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
}
+static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
+{
+ struct shared_msr_entry *efer_msr;
+ unsigned int i;
+
+ if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
+ return vmcs_read64(GUEST_IA32_EFER);
+
+ if (cpu_has_load_ia32_efer)
+ return host_efer;
+
+ for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
+ if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
+ return vmx->msr_autoload.guest.val[i].value;
+ }
+
+ efer_msr = find_msr_entry(vmx, MSR_EFER);
+ if (efer_msr)
+ return efer_msr->data;
+
+ return host_efer;
+}
+
+static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmx_msr_entry g, h;
+ struct msr_data msr;
+ gpa_t gpa;
+ u32 i, j;
+
+ vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
+
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+ /*
+ * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
+ * as vmcs01.GUEST_DR7 contains a userspace defined value
+ * and vcpu->arch.dr7 is not squirreled away before the
+ * nested VMENTER (not worth adding a variable in nested_vmx).
+ */
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+ else
+ WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
+ }
+
+ /*
+ * Note that calling vmx_set_{efer,cr0,cr4} is important as they
+ * handle a variety of side effects to KVM's software model.
+ */
+ vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
+
+ vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
+
+ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+ vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
+
+ nested_ept_uninit_mmu_context(vcpu);
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+ /*
+ * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
+ * from vmcs01 (if necessary). The PDPTRs are not loaded on
+ * VMFail, like everything else we just need to ensure our
+ * software model is up-to-date.
+ */
+ ept_save_pdptrs(vcpu);
+
+ kvm_mmu_reset_context(vcpu);
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmx_update_msr_bitmap(vcpu);
+
+ /*
+ * This nasty bit of open coding is a compromise between blindly
+ * loading L1's MSRs using the exit load lists (incorrect emulation
+ * of VMFail), leaving the nested VM's MSRs in the software model
+ * (incorrect behavior) and snapshotting the modified MSRs (too
+ * expensive since the lists are unbound by hardware). For each
+ * MSR that was (prematurely) loaded from the nested VMEntry load
+ * list, reload it from the exit load list if it exists and differs
+ * from the guest value. The intent is to stuff host state as
+ * silently as possible, not to fully process the exit load list.
+ */
+ msr.host_initiated = false;
+ for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
+ gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
+ if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
+ pr_debug_ratelimited(
+ "%s read MSR index failed (%u, 0x%08llx)\n",
+ __func__, i, gpa);
+ goto vmabort;
+ }
+
+ for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
+ gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
+ if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
+ pr_debug_ratelimited(
+ "%s read MSR failed (%u, 0x%08llx)\n",
+ __func__, j, gpa);
+ goto vmabort;
+ }
+ if (h.index != g.index)
+ continue;
+ if (h.value == g.value)
+ break;
+
+ if (nested_vmx_load_msr_check(vcpu, &h)) {
+ pr_debug_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, j, h.index, h.reserved);
+ goto vmabort;
+ }
+
+ msr.index = h.index;
+ msr.data = h.value;
+ if (kvm_set_msr(vcpu, &msr)) {
+ pr_debug_ratelimited(
+ "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
+ __func__, j, h.index, h.value);
+ goto vmabort;
+ }
+ }
+ }
+
+ return;
+
+vmabort:
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
+}
+
/*
* Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
* and modify vmcs12 to make it see what it would expect to see there if
@@ -13286,14 +14133,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
/* trying to cancel vmlaunch/vmresume is a bug */
WARN_ON_ONCE(vmx->nested.nested_run_pending);
- /*
- * The only expected VM-instruction error is "VM entry with
- * invalid control field(s)." Anything else indicates a
- * problem with L0.
- */
- WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) !=
- VMXERR_ENTRY_INVALID_CONTROL_FIELD));
-
leave_guest_mode(vcpu);
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
@@ -13320,12 +14159,19 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
vmcs12->vm_exit_msr_store_count))
nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+ } else {
+ /*
+ * The only expected VM-instruction error is "VM entry with
+ * invalid control field(s)." Anything else indicates a
+ * problem with L0. And we should never get here with a
+ * VMFail of any type if early consistency checks are enabled.
+ */
+ WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
+ VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ WARN_ON_ONCE(nested_early_check);
}
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- vm_entry_controls_reset_shadow(vmx);
- vm_exit_controls_reset_shadow(vmx);
- vmx_segment_cache_clear(vmx);
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
@@ -13369,8 +14215,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
*/
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- if (enable_shadow_vmcs && exit_reason != -1)
- vmx->nested.sync_shadow_vmcs = true;
+ if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
+ vmx->nested.need_vmcs12_sync = true;
/* in case we halted in L2 */
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -13405,24 +14251,24 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
return;
}
-
+
/*
* After an early L2 VM-entry failure, we're now back
* in L1 which thinks it just finished a VMLAUNCH or
* VMRESUME instruction, so we need to set the failure
* flag and the VM-instruction error field of the VMCS
- * accordingly.
+ * accordingly, and skip the emulated instruction.
*/
- nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-
- load_vmcs12_mmu_host_state(vcpu, vmcs12);
+ (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
/*
- * The emulated instruction was already skipped in
- * nested_vmx_run, but the updated RIP was never
- * written back to the vmcs01.
+ * Restore L1's host state to KVM's software model. We're here
+ * because a consistency check was caught by hardware, which
+ * means some amount of guest state has been propagated to KVM's
+ * model and needs to be unwound to the host's state.
*/
- skip_emulated_instruction(vcpu);
+ nested_vmx_restore_host_state(vcpu);
+
vmx->fail = 0;
}
@@ -13435,26 +14281,7 @@ static void vmx_leave_nested(struct kvm_vcpu *vcpu)
to_vmx(vcpu)->nested.nested_run_pending = 0;
nested_vmx_vmexit(vcpu, -1, 0, 0);
}
- free_nested(to_vmx(vcpu));
-}
-
-/*
- * L1's failure to enter L2 is a subset of a normal exit, as explained in
- * 23.7 "VM-entry failures during or after loading guest state" (this also
- * lists the acceptable exit-reason and exit-qualification parameters).
- * It should only be called before L2 actually succeeded to run, and when
- * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss).
- */
-static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12,
- u32 reason, unsigned long qualification)
-{
- load_vmcs12_host_state(vcpu, vmcs12);
- vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
- vmcs12->exit_qualification = qualification;
- nested_vmx_succeed(vcpu);
- if (enable_shadow_vmcs)
- to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
+ free_nested(vcpu);
}
static int vmx_check_intercept(struct kvm_vcpu *vcpu,
@@ -13880,7 +14707,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
if (vmx->nested.smm.guest_mode) {
vcpu->arch.hflags &= ~HF_SMM_MASK;
- ret = enter_vmx_non_root_mode(vcpu, NULL);
+ ret = nested_vmx_enter_non_root_mode(vcpu, false);
vcpu->arch.hflags |= HF_SMM_MASK;
if (ret)
return ret;
@@ -13895,6 +14722,20 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
return 0;
}
+static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * In case we do two consecutive get/set_nested_state()s while L2 was
+ * running hv_evmcs may end up not being mapped (we map it from
+ * nested_vmx_run()/vmx_vcpu_run()). Check is_guest_mode() as we always
+ * have vmcs12 if it is true.
+ */
+ return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull ||
+ vmx->nested.hv_evmcs;
+}
+
static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
u32 user_data_size)
@@ -13914,12 +14755,16 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
vmx = to_vmx(vcpu);
vmcs12 = get_vmcs12(vcpu);
+
+ if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled)
+ kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
+
if (nested_vmx_allowed(vcpu) &&
(vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
- if (vmx->nested.current_vmptr != -1ull) {
+ if (vmx_has_valid_vmcs12(vcpu)) {
kvm_state.size += VMCS12_SIZE;
if (is_guest_mode(vcpu) &&
@@ -13948,20 +14793,24 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
return -EFAULT;
- if (vmx->nested.current_vmptr == -1ull)
+ if (!vmx_has_valid_vmcs12(vcpu))
goto out;
/*
* When running L2, the authoritative vmcs12 state is in the
* vmcs02. When running L1, the authoritative vmcs12 state is
- * in the shadow vmcs linked to vmcs01, unless
- * sync_shadow_vmcs is set, in which case, the authoritative
+ * in the shadow or enlightened vmcs linked to vmcs01, unless
+ * need_vmcs12_sync is set, in which case, the authoritative
* vmcs12 state is in the vmcs12 already.
*/
- if (is_guest_mode(vcpu))
+ if (is_guest_mode(vcpu)) {
sync_vmcs12(vcpu, vmcs12);
- else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
- copy_shadow_to_vmcs12(vmx);
+ } else if (!vmx->nested.need_vmcs12_sync) {
+ if (vmx->nested.hv_evmcs)
+ copy_enlightened_to_vmcs12(vmx);
+ else if (enable_shadow_vmcs)
+ copy_shadow_to_vmcs12(vmx);
+ }
if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12)))
return -EFAULT;
@@ -13989,6 +14838,9 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (kvm_state->format != 0)
return -EINVAL;
+ if (kvm_state->flags & KVM_STATE_NESTED_EVMCS)
+ nested_enable_evmcs(vcpu, NULL);
+
if (!nested_vmx_allowed(vcpu))
return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
@@ -14006,13 +14858,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
return -EINVAL;
- if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
- return -EINVAL;
-
- if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
- !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
- return -EINVAL;
-
if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
return -EINVAL;
@@ -14042,7 +14887,25 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (ret)
return ret;
- set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
+ /* Empty 'VMXON' state is permitted */
+ if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
+ return 0;
+
+ if (kvm_state->vmx.vmcs_pa != -1ull) {
+ if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
+ !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
+ return -EINVAL;
+
+ set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
+ } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
+ /*
+ * Sync eVMCS upon entry as we may not have
+ * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
+ */
+ vmx->nested.need_vmcs12_sync = true;
+ } else {
+ return -EINVAL;
+ }
if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
vmx->nested.smm.vmxon = true;
@@ -14086,7 +14949,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL;
vmx->nested.dirty_vmcs12 = true;
- ret = enter_vmx_non_root_mode(vcpu, NULL);
+ ret = nested_vmx_enter_non_root_mode(vcpu, false);
if (ret)
return -EINVAL;
@@ -14238,6 +15101,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.pre_enter_smm = vmx_pre_enter_smm,
.pre_leave_smm = vmx_pre_leave_smm,
.enable_smi_window = enable_smi_window,
+
+ .nested_enable_evmcs = nested_enable_evmcs,
};
static void vmx_cleanup_l1d_flush(void)
diff --git a/arch/x86/kvm/vmx_shadow_fields.h b/arch/x86/kvm/vmx_shadow_fields.h
index cd0c75f6d037..132432f375c2 100644
--- a/arch/x86/kvm/vmx_shadow_fields.h
+++ b/arch/x86/kvm/vmx_shadow_fields.h
@@ -28,7 +28,6 @@
*/
/* 16-bits */
-SHADOW_FIELD_RW(GUEST_CS_SELECTOR)
SHADOW_FIELD_RW(GUEST_INTR_STATUS)
SHADOW_FIELD_RW(GUEST_PML_INDEX)
SHADOW_FIELD_RW(HOST_FS_SELECTOR)
@@ -47,8 +46,8 @@ SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE)
SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD)
SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN)
SHADOW_FIELD_RW(TPR_THRESHOLD)
-SHADOW_FIELD_RW(GUEST_CS_LIMIT)
SHADOW_FIELD_RW(GUEST_CS_AR_BYTES)
+SHADOW_FIELD_RW(GUEST_SS_AR_BYTES)
SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO)
SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE)
@@ -61,8 +60,6 @@ SHADOW_FIELD_RW(GUEST_CR0)
SHADOW_FIELD_RW(GUEST_CR3)
SHADOW_FIELD_RW(GUEST_CR4)
SHADOW_FIELD_RW(GUEST_RFLAGS)
-SHADOW_FIELD_RW(GUEST_CS_BASE)
-SHADOW_FIELD_RW(GUEST_ES_BASE)
SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK)
SHADOW_FIELD_RW(CR0_READ_SHADOW)
SHADOW_FIELD_RW(CR4_READ_SHADOW)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ca717737347e..bdcb5babfb68 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -136,7 +136,7 @@ static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
/* lapic timer advance (tscdeadline mode only) in nanoseconds */
-unsigned int __read_mostly lapic_timer_advance_ns = 0;
+unsigned int __read_mostly lapic_timer_advance_ns = 1000;
module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
@@ -400,9 +400,51 @@ static int exception_type(int vector)
return EXCPT_FAULT;
}
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
+{
+ unsigned nr = vcpu->arch.exception.nr;
+ bool has_payload = vcpu->arch.exception.has_payload;
+ unsigned long payload = vcpu->arch.exception.payload;
+
+ if (!has_payload)
+ return;
+
+ switch (nr) {
+ case DB_VECTOR:
+ /*
+ * "Certain debug exceptions may clear bit 0-3. The
+ * remaining contents of the DR6 register are never
+ * cleared by the processor".
+ */
+ vcpu->arch.dr6 &= ~DR_TRAP_BITS;
+ /*
+ * DR6.RTM is set by all #DB exceptions that don't clear it.
+ */
+ vcpu->arch.dr6 |= DR6_RTM;
+ vcpu->arch.dr6 |= payload;
+ /*
+ * Bit 16 should be set in the payload whenever the #DB
+ * exception should clear DR6.RTM. This makes the payload
+ * compatible with the pending debug exceptions under VMX.
+ * Though not currently documented in the SDM, this also
+ * makes the payload compatible with the exit qualification
+ * for #DB exceptions under VMX.
+ */
+ vcpu->arch.dr6 ^= payload & DR6_RTM;
+ break;
+ case PF_VECTOR:
+ vcpu->arch.cr2 = payload;
+ break;
+ }
+
+ vcpu->arch.exception.has_payload = false;
+ vcpu->arch.exception.payload = 0;
+}
+EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
+
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
unsigned nr, bool has_error, u32 error_code,
- bool reinject)
+ bool has_payload, unsigned long payload, bool reinject)
{
u32 prev_nr;
int class1, class2;
@@ -424,6 +466,14 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
*/
WARN_ON_ONCE(vcpu->arch.exception.pending);
vcpu->arch.exception.injected = true;
+ if (WARN_ON_ONCE(has_payload)) {
+ /*
+ * A reinjected event has already
+ * delivered its payload.
+ */
+ has_payload = false;
+ payload = 0;
+ }
} else {
vcpu->arch.exception.pending = true;
vcpu->arch.exception.injected = false;
@@ -431,6 +481,22 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
vcpu->arch.exception.has_error_code = has_error;
vcpu->arch.exception.nr = nr;
vcpu->arch.exception.error_code = error_code;
+ vcpu->arch.exception.has_payload = has_payload;
+ vcpu->arch.exception.payload = payload;
+ /*
+ * In guest mode, payload delivery should be deferred,
+ * so that the L1 hypervisor can intercept #PF before
+ * CR2 is modified (or intercept #DB before DR6 is
+ * modified under nVMX). However, for ABI
+ * compatibility with KVM_GET_VCPU_EVENTS and
+ * KVM_SET_VCPU_EVENTS, we can't delay payload
+ * delivery unless userspace has enabled this
+ * functionality via the per-VM capability,
+ * KVM_CAP_EXCEPTION_PAYLOAD.
+ */
+ if (!vcpu->kvm->arch.exception_payload_enabled ||
+ !is_guest_mode(vcpu))
+ kvm_deliver_exception_payload(vcpu);
return;
}
@@ -455,6 +521,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
vcpu->arch.exception.has_error_code = true;
vcpu->arch.exception.nr = DF_VECTOR;
vcpu->arch.exception.error_code = 0;
+ vcpu->arch.exception.has_payload = false;
+ vcpu->arch.exception.payload = 0;
} else
/* replace previous exception with a new one in a hope
that instruction re-execution will regenerate lost
@@ -464,16 +532,29 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
- kvm_multiple_exception(vcpu, nr, false, 0, false);
+ kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);
void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
- kvm_multiple_exception(vcpu, nr, false, 0, true);
+ kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);
+static void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
+ unsigned long payload)
+{
+ kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
+}
+
+static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
+ u32 error_code, unsigned long payload)
+{
+ kvm_multiple_exception(vcpu, nr, true, error_code,
+ true, payload, false);
+}
+
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
if (err)
@@ -490,11 +571,13 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
++vcpu->stat.pf_guest;
vcpu->arch.exception.nested_apf =
is_guest_mode(vcpu) && fault->async_page_fault;
- if (vcpu->arch.exception.nested_apf)
+ if (vcpu->arch.exception.nested_apf) {
vcpu->arch.apf.nested_apf_token = fault->address;
- else
- vcpu->arch.cr2 = fault->address;
- kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
+ kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
+ } else {
+ kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
+ fault->address);
+ }
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
@@ -503,7 +586,7 @@ static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fau
if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
else
- vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+ vcpu->arch.mmu->inject_page_fault(vcpu, fault);
return fault->nested_page_fault;
}
@@ -517,13 +600,13 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
- kvm_multiple_exception(vcpu, nr, true, error_code, false);
+ kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
- kvm_multiple_exception(vcpu, nr, true, error_code, true);
+ kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
@@ -602,7 +685,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
if ((pdpte[i] & PT_PRESENT_MASK) &&
(pdpte[i] &
- vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
+ vcpu->arch.mmu->guest_rsvd_check.rsvd_bits_mask[0][2])) {
ret = 0;
goto out;
}
@@ -2477,7 +2560,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_KVM_PV_EOI_EN:
- if (kvm_lapic_enable_pv_eoi(vcpu, data))
+ if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
return 1;
break;
@@ -2912,6 +2995,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV_VP_INDEX:
case KVM_CAP_HYPERV_EVENTFD:
case KVM_CAP_HYPERV_TLBFLUSH:
+ case KVM_CAP_HYPERV_SEND_IPI:
+ case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
@@ -2930,6 +3015,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_IMMEDIATE_EXIT:
case KVM_CAP_GET_MSR_FEATURES:
case KVM_CAP_MSR_PLATFORM_INFO:
+ case KVM_CAP_EXCEPTION_PAYLOAD:
r = 1;
break;
case KVM_CAP_SYNC_REGS:
@@ -3185,11 +3271,16 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
kvm_x86_ops->vcpu_put(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
/*
- * If userspace has set any breakpoints or watchpoints, dr6 is restored
- * on every vmexit, but if not, we might have a stale dr6 from the
- * guest. do_debug expects dr6 to be cleared after it runs, do the same.
+ * Here dr6 is either zero or, if the guest has run and userspace
+ * has not set any breakpoints or watchpoints, it can be set to
+ * the guest dr6 (stored in vcpu->arch.dr6). do_debug expects dr6
+ * to be cleared after it runs, so clear the host register. However,
+ * MOV to DR can be expensive when running nested, omit it if
+ * vcpu->arch.dr6 is already zero: in that case, the host dr6 cannot
+ * currently be nonzero.
*/
- set_debugreg(0, 6);
+ if (vcpu->arch.dr6)
+ set_debugreg(0, 6);
}
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -3362,19 +3453,33 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
process_nmi(vcpu);
+
/*
- * FIXME: pass injected and pending separately. This is only
- * needed for nested virtualization, whose state cannot be
- * migrated yet. For now we can combine them.
+ * The API doesn't provide the instruction length for software
+ * exceptions, so don't report them. As long as the guest RIP
+ * isn't advanced, we should expect to encounter the exception
+ * again.
*/
- events->exception.injected =
- (vcpu->arch.exception.pending ||
- vcpu->arch.exception.injected) &&
- !kvm_exception_is_soft(vcpu->arch.exception.nr);
+ if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
+ events->exception.injected = 0;
+ events->exception.pending = 0;
+ } else {
+ events->exception.injected = vcpu->arch.exception.injected;
+ events->exception.pending = vcpu->arch.exception.pending;
+ /*
+ * For ABI compatibility, deliberately conflate
+ * pending and injected exceptions when
+ * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
+ */
+ if (!vcpu->kvm->arch.exception_payload_enabled)
+ events->exception.injected |=
+ vcpu->arch.exception.pending;
+ }
events->exception.nr = vcpu->arch.exception.nr;
events->exception.has_error_code = vcpu->arch.exception.has_error_code;
- events->exception.pad = 0;
events->exception.error_code = vcpu->arch.exception.error_code;
+ events->exception_has_payload = vcpu->arch.exception.has_payload;
+ events->exception_payload = vcpu->arch.exception.payload;
events->interrupt.injected =
vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
@@ -3398,6 +3503,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
| KVM_VCPUEVENT_VALID_SHADOW
| KVM_VCPUEVENT_VALID_SMM);
+ if (vcpu->kvm->arch.exception_payload_enabled)
+ events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
+
memset(&events->reserved, 0, sizeof(events->reserved));
}
@@ -3409,12 +3517,24 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
| KVM_VCPUEVENT_VALID_SIPI_VECTOR
| KVM_VCPUEVENT_VALID_SHADOW
- | KVM_VCPUEVENT_VALID_SMM))
+ | KVM_VCPUEVENT_VALID_SMM
+ | KVM_VCPUEVENT_VALID_PAYLOAD))
return -EINVAL;
- if (events->exception.injected &&
- (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
- is_guest_mode(vcpu)))
+ if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
+ if (!vcpu->kvm->arch.exception_payload_enabled)
+ return -EINVAL;
+ if (events->exception.pending)
+ events->exception.injected = 0;
+ else
+ events->exception_has_payload = 0;
+ } else {
+ events->exception.pending = 0;
+ events->exception_has_payload = 0;
+ }
+
+ if ((events->exception.injected || events->exception.pending) &&
+ (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
return -EINVAL;
/* INITs are latched while in SMM */
@@ -3424,11 +3544,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
return -EINVAL;
process_nmi(vcpu);
- vcpu->arch.exception.injected = false;
- vcpu->arch.exception.pending = events->exception.injected;
+ vcpu->arch.exception.injected = events->exception.injected;
+ vcpu->arch.exception.pending = events->exception.pending;
vcpu->arch.exception.nr = events->exception.nr;
vcpu->arch.exception.has_error_code = events->exception.has_error_code;
vcpu->arch.exception.error_code = events->exception.error_code;
+ vcpu->arch.exception.has_payload = events->exception_has_payload;
+ vcpu->arch.exception.payload = events->exception_payload;
vcpu->arch.interrupt.injected = events->interrupt.injected;
vcpu->arch.interrupt.nr = events->interrupt.nr;
@@ -3694,6 +3816,10 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
struct kvm_enable_cap *cap)
{
+ int r;
+ uint16_t vmcs_version;
+ void __user *user_ptr;
+
if (cap->flags)
return -EINVAL;
@@ -3706,6 +3832,16 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
return -EINVAL;
return kvm_hv_activate_synic(vcpu, cap->cap ==
KVM_CAP_HYPERV_SYNIC2);
+ case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
+ r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version);
+ if (!r) {
+ user_ptr = (void __user *)(uintptr_t)cap->args[0];
+ if (copy_to_user(user_ptr, &vmcs_version,
+ sizeof(vmcs_version)))
+ r = -EFAULT;
+ }
+ return r;
+
default:
return -EINVAL;
}
@@ -4047,11 +4183,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
if (kvm_state.flags &
- ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE))
+ ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
+ | KVM_STATE_NESTED_EVMCS))
break;
/* nested_run_pending implies guest_mode. */
- if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING)
+ if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
+ && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
break;
r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
@@ -4363,6 +4501,10 @@ split_irqchip_unlock:
kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
r = 0;
break;
+ case KVM_CAP_EXCEPTION_PAYLOAD:
+ kvm->arch.exception_payload_enabled = cap->args[0];
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -4803,7 +4945,7 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
/* NPT walks are always user-walks */
access |= PFERR_USER_MASK;
- t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception);
+ t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
return t_gpa;
}
@@ -5889,7 +6031,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
if (WARN_ON_ONCE(is_guest_mode(vcpu)))
return false;
- if (!vcpu->arch.mmu.direct_map) {
+ if (!vcpu->arch.mmu->direct_map) {
/*
* Write permission should be allowed since only
* write access need to be emulated.
@@ -5922,7 +6064,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
kvm_release_pfn_clean(pfn);
/* The instructions are well-emulated on direct mmu. */
- if (vcpu->arch.mmu.direct_map) {
+ if (vcpu->arch.mmu->direct_map) {
unsigned int indirect_shadow_pages;
spin_lock(&vcpu->kvm->mmu_lock);
@@ -5989,7 +6131,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
vcpu->arch.last_retry_eip = ctxt->eip;
vcpu->arch.last_retry_addr = cr2;
- if (!vcpu->arch.mmu.direct_map)
+ if (!vcpu->arch.mmu->direct_map)
gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
@@ -6049,14 +6191,7 @@ static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
kvm_run->exit_reason = KVM_EXIT_DEBUG;
*r = EMULATE_USER_EXIT;
} else {
- /*
- * "Certain debug exceptions may clear bit 0-3. The
- * remaining contents of the DR6 register are never
- * cleared by the processor".
- */
- vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
- kvm_queue_exception(vcpu, DB_VECTOR);
+ kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
}
}
@@ -6995,10 +7130,22 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
X86_EFLAGS_RF);
- if (vcpu->arch.exception.nr == DB_VECTOR &&
- (vcpu->arch.dr7 & DR7_GD)) {
- vcpu->arch.dr7 &= ~DR7_GD;
- kvm_update_dr7(vcpu);
+ if (vcpu->arch.exception.nr == DB_VECTOR) {
+ /*
+ * This code assumes that nSVM doesn't use
+ * check_nested_events(). If it does, the
+ * DR6/DR7 changes should happen before L1
+ * gets a #VMEXIT for an intercepted #DB in
+ * L2. (Under VMX, on the other hand, the
+ * DR6/DR7 changes should not happen in the
+ * event of a VM-exit to L1 for an intercepted
+ * #DB in L2.)
+ */
+ kvm_deliver_exception_payload(vcpu);
+ if (vcpu->arch.dr7 & DR7_GD) {
+ vcpu->arch.dr7 &= ~DR7_GD;
+ kvm_update_dr7(vcpu);
+ }
}
kvm_x86_ops->queue_exception(vcpu);
@@ -8478,7 +8625,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu);
kvm_vcpu_reset(vcpu, false);
- kvm_mmu_setup(vcpu);
+ kvm_init_mmu(vcpu, false);
vcpu_put(vcpu);
return 0;
}
@@ -9327,7 +9474,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
{
int r;
- if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
+ if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
work->wakeup_all)
return;
@@ -9335,11 +9482,11 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
if (unlikely(r))
return;
- if (!vcpu->arch.mmu.direct_map &&
- work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
+ if (!vcpu->arch.mmu->direct_map &&
+ work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
return;
- vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
+ vcpu->arch.mmu->page_fault(vcpu, work->gva, 0, true);
}
static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
@@ -9463,6 +9610,8 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
vcpu->arch.exception.nr = 0;
vcpu->arch.exception.has_error_code = false;
vcpu->arch.exception.error_code = 0;
+ vcpu->arch.exception.has_payload = false;
+ vcpu->arch.exception.payload = 0;
} else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
fault.vector = PF_VECTOR;
fault.error_code_valid = true;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 67b9568613f3..224cd0a47568 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -266,6 +266,8 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu,
int handle_ud(struct kvm_vcpu *vcpu);
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu);
+
void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 7a8fc26c1115..faca978ebf9d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -815,10 +815,14 @@ void free_kernel_image_pages(void *begin, void *end)
set_memory_np_noalias(begin_ul, len_pages);
}
+void __weak mem_encrypt_free_decrypted_mem(void) { }
+
void __ref free_initmem(void)
{
e820__reallocate_tables();
+ mem_encrypt_free_decrypted_mem();
+
free_kernel_image_pages(&__init_begin, &__init_end);
}
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index b2de398d1fd3..006f373f54ab 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -348,6 +348,30 @@ bool sev_active(void)
EXPORT_SYMBOL(sev_active);
/* Architecture __weak replacement functions */
+void __init mem_encrypt_free_decrypted_mem(void)
+{
+ unsigned long vaddr, vaddr_end, npages;
+ int r;
+
+ vaddr = (unsigned long)__start_bss_decrypted_unused;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
+
+ /*
+ * The unused memory range was mapped decrypted, change the encryption
+ * attribute from decrypted to encrypted before freeing it.
+ */
+ if (mem_encrypt_active()) {
+ r = set_memory_encrypted(vaddr, npages);
+ if (r) {
+ pr_warn("failed to free unused decrypted pages\n");
+ return;
+ }
+ }
+
+ free_init_pages("unused decrypted", vaddr, vaddr_end);
+}
+
void __init mem_encrypt_init(void)
{
if (!sme_me_mask)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ae394552fb94..089e78c4effd 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -637,6 +637,15 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
unsigned long address = __fix_to_virt(idx);
+#ifdef CONFIG_X86_64
+ /*
+ * Ensure that the static initial page tables are covering the
+ * fixmap completely.
+ */
+ BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
+ (FIXMAP_PMD_NUM * PTRS_PER_PTE));
+#endif
+
if (idx >= __end_of_fixed_addresses) {
BUG();
return;
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 2fe5c9b1816b..dd461c0167ef 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1907,7 +1907,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* L3_k[511] -> level2_fixmap_pgt */
convert_pfn_mfn(level3_kernel_pgt);
- /* L3_k[511][506] -> level1_fixmap_pgt */
+ /* L3_k[511][508-FIXMAP_PMD_NUM ... 507] -> level1_fixmap_pgt */
convert_pfn_mfn(level2_fixmap_pgt);
/* We get [511][511] and have Xen's version of level2_kernel_pgt */
@@ -1952,7 +1952,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
- set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
+
+ for (i = 0; i < FIXMAP_PMD_NUM; i++) {
+ set_page_prot(level1_fixmap_pgt + i * PTRS_PER_PTE,
+ PAGE_KERNEL_RO);
+ }
/* Pin down new L4 */
pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
index 7d00d4ad44d4..95997e6c0696 100644
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -478,7 +478,7 @@ static void xen_convert_regs(const struct xen_pmu_regs *xen_regs,
irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id)
{
int err, ret = IRQ_NONE;
- struct pt_regs regs;
+ struct pt_regs regs = {0};
const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
uint8_t xenpmu_flags = get_xenpmu_flags();
OpenPOWER on IntegriCloud