summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig9
-rw-r--r--arch/x86/ia32/ia32_aout.c4
-rw-r--r--arch/x86/include/asm/cacheflush.h2
-rw-r--r--arch/x86/include/asm/cpufeatures.h1
-rw-r--r--arch/x86/include/asm/desc.h3
-rw-r--r--arch/x86/include/asm/kvm_emulate.h4
-rw-r--r--arch/x86/include/asm/kvm_host.h40
-rw-r--r--arch/x86/include/asm/mem_encrypt.h4
-rw-r--r--arch/x86/include/asm/mpspec.h1
-rw-r--r--arch/x86/include/asm/paravirt.h37
-rw-r--r--arch/x86/include/asm/paravirt_types.h9
-rw-r--r--arch/x86/include/asm/pgtable.h44
-rw-r--r--arch/x86/include/asm/pgtable_64.h14
-rw-r--r--arch/x86/include/asm/pgtable_types.h10
-rw-r--r--arch/x86/include/asm/special_insns.h10
-rw-r--r--arch/x86/include/asm/string_32.h24
-rw-r--r--arch/x86/include/asm/string_64.h36
-rw-r--r--arch/x86/include/asm/svm.h6
-rw-r--r--arch/x86/include/asm/vmx.h22
-rw-r--r--arch/x86/include/asm/xen/page.h5
-rw-r--r--arch/x86/include/uapi/asm/hyperv.h6
-rw-r--r--arch/x86/kernel/acpi/boot.c4
-rw-r--r--arch/x86/kernel/apic/apic.c4
-rw-r--r--arch/x86/kernel/apm_32.c2
-rw-r--r--arch/x86/kernel/cpu/common.c49
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c4
-rw-r--r--arch/x86/kernel/io_delay.c2
-rw-r--r--arch/x86/kernel/paravirt.c5
-rw-r--r--arch/x86/kernel/reboot.c2
-rw-r--r--arch/x86/kernel/setup.c5
-rw-r--r--arch/x86/kernel/setup_percpu.c2
-rw-r--r--arch/x86/kernel/signal_compat.c21
-rw-r--r--arch/x86/kernel/smpboot.c10
-rw-r--r--arch/x86/kvm/cpuid.c34
-rw-r--r--arch/x86/kvm/cpuid.h186
-rw-r--r--arch/x86/kvm/emulate.c42
-rw-r--r--arch/x86/kvm/hyperv.c8
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h2
-rw-r--r--arch/x86/kvm/lapic.c2
-rw-r--r--arch/x86/kvm/lapic.h3
-rw-r--r--arch/x86/kvm/mmu.c267
-rw-r--r--arch/x86/kvm/mmu.h23
-rw-r--r--arch/x86/kvm/mmu_audit.c4
-rw-r--r--arch/x86/kvm/mtrr.c2
-rw-r--r--arch/x86/kvm/paging_tmpl.h6
-rw-r--r--arch/x86/kvm/svm.c139
-rw-r--r--arch/x86/kvm/trace.h11
-rw-r--r--arch/x86/kvm/vmx.c622
-rw-r--r--arch/x86/kvm/x86.c213
-rw-r--r--arch/x86/kvm/x86.h54
-rw-r--r--arch/x86/mm/init.c34
-rw-r--r--arch/x86/mm/init_64.c22
-rw-r--r--arch/x86/mm/mem_encrypt.c2
-rw-r--r--arch/x86/mm/pgtable.c7
-rw-r--r--arch/x86/mm/tlb.c24
-rw-r--r--arch/x86/pci/fixup.c17
-rw-r--r--arch/x86/pci/irq.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c2
-rw-r--r--arch/x86/power/cpu.c2
-rw-r--r--arch/x86/power/hibernate_64.c21
-rw-r--r--arch/x86/xen/enlighten_pv.c2
-rw-r--r--arch/x86/xen/mmu.c2
-rw-r--r--arch/x86/xen/mmu_pv.c22
-rw-r--r--arch/x86/xen/p2m.c25
-rw-r--r--arch/x86/xen/setup.c5
65 files changed, 1336 insertions, 872 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4b278a33ccbb..971feac13506 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -53,7 +53,6 @@ config X86
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_KCOV if X86_64
- select ARCH_HAS_MMIO_FLUSH
select ARCH_HAS_PMEM_API if X86_64
# Causing hangs/crashes, see the commit that added this change for details.
select ARCH_HAS_REFCOUNT if BROKEN
@@ -2323,6 +2322,10 @@ source "kernel/livepatch/Kconfig"
endmenu
+config ARCH_HAS_ADD_PAGES
+ def_bool y
+ depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG
+
config ARCH_ENABLE_MEMORY_HOTPLUG
def_bool y
depends on X86_64 || (X86_32 && HIGHMEM)
@@ -2343,6 +2346,10 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
def_bool y
depends on X86_64 && HUGETLB_PAGE && MIGRATION
+config ARCH_ENABLE_THP_MIGRATION
+ def_bool y
+ depends on X86_64 && TRANSPARENT_HUGEPAGE
+
menu "Power management and ACPI options"
config ARCH_HIBERNATION_HEADER
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 8d0879f1d42c..8e02b30cf08e 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -407,10 +407,10 @@ static int load_aout_library(struct file *file)
unsigned long bss, start_addr, len, error;
int retval;
struct exec ex;
-
+ loff_t pos = 0;
retval = -ENOEXEC;
- error = kernel_read(file, 0, (char *) &ex, sizeof(ex));
+ error = kernel_read(file, &ex, sizeof(ex), &pos);
if (error != sizeof(ex))
goto out;
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 8b4140f6724f..cb9a1af109b4 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -7,6 +7,4 @@
void clflush_cache_range(void *addr, unsigned int size);
-#define mmio_flush_range(addr, size) clflush_cache_range(addr, size)
-
#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 42bbbf0f173d..2519c6c801c9 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -288,6 +288,7 @@
#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
+#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 1a2ba368da39..9d0e13738ed3 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -121,7 +121,6 @@ static inline int desc_empty(const void *ptr)
#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
#define store_gdt(dtr) native_store_gdt(dtr)
-#define store_idt(dtr) native_store_idt(dtr)
#define store_tr(tr) (tr = native_store_tr())
#define load_TLS(t, cpu) native_load_tls(t, cpu)
@@ -228,7 +227,7 @@ static inline void native_store_gdt(struct desc_ptr *dtr)
asm volatile("sgdt %0":"=m" (*dtr));
}
-static inline void native_store_idt(struct desc_ptr *dtr)
+static inline void store_idt(struct desc_ptr *dtr)
{
asm volatile("sidt %0":"=m" (*dtr));
}
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index fde36f189836..fa2558e12024 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -219,8 +219,8 @@ struct x86_emulate_ops {
struct x86_instruction_info *info,
enum x86_intercept_stage stage);
- void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+ bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool check_limit);
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 369e41c23f07..8844eee290b2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -79,15 +79,14 @@
| X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
| X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
-#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
#define CR3_PCID_INVD BIT_64(63)
#define CR4_RESERVED_BITS \
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
| X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
| X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
- | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \
- | X86_CR4_PKE))
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
+ | X86_CR4_SMAP | X86_CR4_PKE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -204,7 +203,6 @@ enum {
#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
- PFERR_USER_MASK | \
PFERR_WRITE_MASK | \
PFERR_PRESENT_MASK)
@@ -317,15 +315,17 @@ struct kvm_pio_request {
int size;
};
+#define PT64_ROOT_MAX_LEVEL 5
+
struct rsvd_bits_validate {
- u64 rsvd_bits_mask[2][4];
+ u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL];
u64 bad_mt_xwr;
};
/*
- * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
- * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
- * mode.
+ * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
+ * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the
+ * current mmu mode.
*/
struct kvm_mmu {
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
@@ -548,8 +548,8 @@ struct kvm_vcpu_arch {
struct kvm_queued_exception {
bool pending;
+ bool injected;
bool has_error_code;
- bool reinject;
u8 nr;
u32 error_code;
u8 nested_apf;
@@ -687,8 +687,12 @@ struct kvm_vcpu_arch {
int pending_ioapic_eoi;
int pending_external_vector;
- /* GPA available (AMD only) */
+ /* GPA available */
bool gpa_available;
+ gpa_t gpa_val;
+
+ /* be preempted when it's in kernel-mode(cpl=0) */
+ bool preempted_in_kernel;
};
struct kvm_lpage_info {
@@ -979,7 +983,7 @@ struct kvm_x86_ops {
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
- int (*get_tdp_level)(void);
+ int (*get_tdp_level)(struct kvm_vcpu *vcpu);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
int (*get_lpage_level)(void);
bool (*rdtscp_supported)(void);
@@ -1297,20 +1301,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
}
-static inline u64 get_canonical(u64 la)
-{
- return ((int64_t)la << 16) >> 16;
-}
-
-static inline bool is_noncanonical_address(u64 la)
-{
-#ifdef CONFIG_X86_64
- return get_canonical(la) != la;
-#else
- return false;
-#endif
-}
-
#define TSS_IOPB_BASE_OFFSET 0x66
#define TSS_BASE_SIZE 0x68
#define TSS_IOPB_SIZE (65536 / 8)
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 8e618fcf1f7c..6a77c63540f7 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -21,7 +21,7 @@
#ifdef CONFIG_AMD_MEM_ENCRYPT
-extern unsigned long sme_me_mask;
+extern u64 sme_me_mask;
void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
unsigned long decrypted_kernel_vaddr,
@@ -49,7 +49,7 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size);
#else /* !CONFIG_AMD_MEM_ENCRYPT */
-#define sme_me_mask 0UL
+#define sme_me_mask 0ULL
static inline void __init sme_early_encrypt(resource_size_t paddr,
unsigned long size) { }
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 831eb7895535..c471ca1f9412 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -86,7 +86,6 @@ static inline void e820__memblock_alloc_reserved_mpc_new(void) { }
#endif
int generic_processor_info(int apicid, int version);
-int __generic_processor_info(int apicid, int version, bool enabled);
#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index c25dd22f7c70..12deec722cf0 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -71,11 +71,6 @@ static inline void write_cr3(unsigned long x)
PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
}
-static inline unsigned long __read_cr4(void)
-{
- return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
-}
-
static inline void __write_cr4(unsigned long x)
{
PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
@@ -228,10 +223,6 @@ static inline void set_ldt(const void *addr, unsigned entries)
{
PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
}
-static inline void store_idt(struct desc_ptr *dtr)
-{
- PVOP_VCALL1(pv_cpu_ops.store_idt, dtr);
-}
static inline unsigned long paravirt_store_tr(void)
{
return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr);
@@ -365,12 +356,6 @@ static inline void paravirt_release_p4d(unsigned long pfn)
PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn);
}
-static inline void pte_update(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
- PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
-}
-
static inline pte_t __pte(pteval_t val)
{
pteval_t ret;
@@ -472,28 +457,6 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
}
-static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, pmd_t pmd)
-{
- if (sizeof(pmdval_t) > sizeof(long))
- /* 5 arg words */
- pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd);
- else
- PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp,
- native_pmd_val(pmd));
-}
-
-static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
- pud_t *pudp, pud_t pud)
-{
- if (sizeof(pudval_t) > sizeof(long))
- /* 5 arg words */
- pv_mmu_ops.set_pud_at(mm, addr, pudp, pud);
- else
- PVOP_VCALL4(pv_mmu_ops.set_pud_at, mm, addr, pudp,
- native_pud_val(pud));
-}
-
static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
{
pmdval_t val = native_pmd_val(pmd);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 6b64fc6367f2..42873edd9f9d 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -107,7 +107,6 @@ struct pv_cpu_ops {
unsigned long (*read_cr0)(void);
void (*write_cr0)(unsigned long);
- unsigned long (*read_cr4)(void);
void (*write_cr4)(unsigned long);
#ifdef CONFIG_X86_64
@@ -119,8 +118,6 @@ struct pv_cpu_ops {
void (*load_tr_desc)(void);
void (*load_gdt)(const struct desc_ptr *);
void (*load_idt)(const struct desc_ptr *);
- /* store_gdt has been removed. */
- void (*store_idt)(struct desc_ptr *);
void (*set_ldt)(const void *desc, unsigned entries);
unsigned long (*store_tr)(void);
void (*load_tls)(struct thread_struct *t, unsigned int cpu);
@@ -245,12 +242,6 @@ struct pv_mmu_ops {
void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
- void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, pmd_t pmdval);
- void (*set_pud_at)(struct mm_struct *mm, unsigned long addr,
- pud_t *pudp, pud_t pudval);
- void (*pte_update)(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep);
pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index bbeae4a2bd01..b714934512b3 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -55,8 +55,6 @@ extern pmdval_t early_pmd_flags;
#else /* !CONFIG_PARAVIRT */
#define set_pte(ptep, pte) native_set_pte(ptep, pte)
#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
-#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd)
-#define set_pud_at(mm, addr, pudp, pud) native_set_pud_at(mm, addr, pudp, pud)
#define set_pte_atomic(ptep, pte) \
native_set_pte_atomic(ptep, pte)
@@ -87,8 +85,6 @@ extern pmdval_t early_pmd_flags;
#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep)
#define pmd_clear(pmd) native_pmd_clear(pmd)
-#define pte_update(mm, addr, ptep) do { } while (0)
-
#define pgd_val(x) native_pgd_val(x)
#define __pgd(x) native_make_pgd(x)
@@ -979,31 +975,18 @@ static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
native_set_pte(ptep, pte);
}
-static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp , pmd_t pmd)
+static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd)
{
native_set_pmd(pmdp, pmd);
}
-static inline void native_set_pud_at(struct mm_struct *mm, unsigned long addr,
- pud_t *pudp, pud_t pud)
+static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t pud)
{
native_set_pud(pudp, pud);
}
-#ifndef CONFIG_PARAVIRT
-/*
- * Rules for using pte_update - it must be called after any PTE update which
- * has not been done using the set_pte / clear_pte interfaces. It is used by
- * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
- * updates should either be sets, clears, or set_pte_atomic for P->P
- * transitions, which means this hook should only be called for user PTEs.
- * This hook implies a P->P protection or access change has taken place, which
- * requires a subsequent TLB flush.
- */
-#define pte_update(mm, addr, ptep) do { } while (0)
-#endif
-
/*
* We only update the dirty/accessed state if we set
* the dirty bit by hand in the kernel, since the hardware
@@ -1031,7 +1014,6 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
pte_t pte = native_ptep_get_and_clear(ptep);
- pte_update(mm, addr, ptep);
return pte;
}
@@ -1058,7 +1040,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
- pte_update(mm, addr, ptep);
}
#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
@@ -1172,6 +1153,23 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
}
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY);
+}
+
+static inline int pmd_swp_soft_dirty(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_SWP_SOFT_DIRTY;
+}
+
+static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_SWP_SOFT_DIRTY);
+}
+#endif
#endif
#define PKRU_AD_BIT 0x1
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 2160c1fee920..972a4698c530 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -180,15 +180,21 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
/*
* Encode and de-code a swap entry
*
- * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number
- * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names
- * | OFFSET (14->63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| <- swp entry
+ * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
+ * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
+ * | OFFSET (14->63) | TYPE (9-13) |0|0|X|X| X| X|X|SD|0| <- swp entry
*
* G (8) is aliased and used as a PROT_NONE indicator for
* !present ptes. We need to start storing swap entries above
* there. We also need to avoid using A and D because of an
* erratum where they can be incorrectly set by hardware on
* non-present PTEs.
+ *
+ * SD (1) in swp entry is used to store soft dirty bit, which helps us
+ * remember soft dirty over page migration
+ *
+ * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
+ * but also L and G.
*/
#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
#define SWP_TYPE_BITS 5
@@ -204,7 +210,9 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
((type) << (SWP_TYPE_FIRST_BIT)) \
| ((offset) << SWP_OFFSET_FIRST_BIT) })
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
+#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) })
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+#define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val })
extern int kern_addr_valid(unsigned long addr);
extern void cleanup_highmap(void);
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 399261ce904c..f1492473f10e 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -99,15 +99,15 @@
/*
* Tracking soft dirty bit when a page goes to a swap is tricky.
* We need a bit which can be stored in pte _and_ not conflict
- * with swap entry format. On x86 bits 6 and 7 are *not* involved
- * into swap entry computation, but bit 6 is used for nonlinear
- * file mapping, so we borrow bit 7 for soft dirty tracking.
+ * with swap entry format. On x86 bits 1-4 are *not* involved
+ * into swap entry computation, but bit 7 is used for thp migration,
+ * so we borrow bit 1 for soft dirty tracking.
*
* Please note that this bit must be treated as swap dirty page
- * mark if and only if the PTE has present bit clear!
+ * mark if and only if the PTE/PMD has present bit clear!
*/
#ifdef CONFIG_MEM_SOFT_DIRTY
-#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE
+#define _PAGE_SWP_SOFT_DIRTY _PAGE_RW
#else
#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0))
#endif
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 9efaabf5b54b..a24dfcf79f4a 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -135,6 +135,11 @@ static inline void native_wbinvd(void)
extern asmlinkage void native_load_gs_index(unsigned);
+static inline unsigned long __read_cr4(void)
+{
+ return native_read_cr4();
+}
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
@@ -173,11 +178,6 @@ static inline void write_cr3(unsigned long x)
native_write_cr3(x);
}
-static inline unsigned long __read_cr4(void)
-{
- return native_read_cr4();
-}
-
static inline void __write_cr4(unsigned long x)
{
native_write_cr4(x);
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index e9ee84873de5..e371e7229042 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -340,6 +340,30 @@ extern void *memset(void *, int, size_t);
#endif
#endif /* !CONFIG_FORTIFY_SOURCE */
+#define __HAVE_ARCH_MEMSET16
+static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
+{
+ int d0, d1;
+ asm volatile("rep\n\t"
+ "stosw"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
+#define __HAVE_ARCH_MEMSET32
+static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
+{
+ int d0, d1;
+ asm volatile("rep\n\t"
+ "stosl"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
/*
* find the first occurrence of byte 'c', or 1 past the area if none
*/
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 2a8c822de1fc..f372a70a523f 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -58,6 +58,42 @@ extern void *__memcpy(void *to, const void *from, size_t len);
void *memset(void *s, int c, size_t n);
void *__memset(void *s, int c, size_t n);
+#define __HAVE_ARCH_MEMSET16
+static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
+{
+ long d0, d1;
+ asm volatile("rep\n\t"
+ "stosw"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
+#define __HAVE_ARCH_MEMSET32
+static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
+{
+ long d0, d1;
+ asm volatile("rep\n\t"
+ "stosl"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
+#define __HAVE_ARCH_MEMSET64
+static inline void *memset64(uint64_t *s, uint64_t v, size_t n)
+{
+ long d0, d1;
+ asm volatile("rep\n\t"
+ "stosq"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
#define __HAVE_ARCH_MEMMOVE
void *memmove(void *dest, const void *src, size_t count);
void *__memmove(void *dest, const void *src, size_t count);
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 58fffe79e417..14835dd205a5 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -107,6 +107,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define V_IRQ_SHIFT 8
#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
+#define V_GIF_SHIFT 9
+#define V_GIF_MASK (1 << V_GIF_SHIFT)
+
#define V_INTR_PRIO_SHIFT 16
#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
@@ -116,6 +119,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define V_INTR_MASKING_SHIFT 24
#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
+#define V_GIF_ENABLE_SHIFT 25
+#define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT)
+
#define AVIC_ENABLE_SHIFT 31
#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 35cd06f636ab..caec8417539f 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -72,6 +72,7 @@
#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
#define SECONDARY_EXEC_RDRAND 0x00000800
#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
+#define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000
#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
#define SECONDARY_EXEC_RDSEED 0x00010000
#define SECONDARY_EXEC_ENABLE_PML 0x00020000
@@ -114,6 +115,10 @@
#define VMX_MISC_SAVE_EFER_LMA 0x00000020
#define VMX_MISC_ACTIVITY_HLT 0x00000040
+/* VMFUNC functions */
+#define VMX_VMFUNC_EPTP_SWITCHING 0x00000001
+#define VMFUNC_EPTP_ENTRIES 512
+
static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
{
return vmx_basic & GENMASK_ULL(30, 0);
@@ -187,6 +192,8 @@ enum vmcs_field {
APIC_ACCESS_ADDR_HIGH = 0x00002015,
POSTED_INTR_DESC_ADDR = 0x00002016,
POSTED_INTR_DESC_ADDR_HIGH = 0x00002017,
+ VM_FUNCTION_CONTROL = 0x00002018,
+ VM_FUNCTION_CONTROL_HIGH = 0x00002019,
EPT_POINTER = 0x0000201a,
EPT_POINTER_HIGH = 0x0000201b,
EOI_EXIT_BITMAP0 = 0x0000201c,
@@ -197,6 +204,8 @@ enum vmcs_field {
EOI_EXIT_BITMAP2_HIGH = 0x00002021,
EOI_EXIT_BITMAP3 = 0x00002022,
EOI_EXIT_BITMAP3_HIGH = 0x00002023,
+ EPTP_LIST_ADDRESS = 0x00002024,
+ EPTP_LIST_ADDRESS_HIGH = 0x00002025,
VMREAD_BITMAP = 0x00002026,
VMWRITE_BITMAP = 0x00002028,
XSS_EXIT_BITMAP = 0x0000202C,
@@ -444,6 +453,7 @@ enum vmcs_field {
#define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
+#define VMX_EPT_PAGE_WALK_5_BIT (1ull << 7)
#define VMX_EPTP_UC_BIT (1ull << 8)
#define VMX_EPTP_WB_BIT (1ull << 14)
#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
@@ -459,12 +469,14 @@ enum vmcs_field {
#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT (1ull << 11) /* (43 - 32) */
-#define VMX_EPT_DEFAULT_GAW 3
-#define VMX_EPT_MAX_GAW 0x4
#define VMX_EPT_MT_EPTE_SHIFT 3
-#define VMX_EPT_GAW_EPTP_SHIFT 3
-#define VMX_EPT_AD_ENABLE_BIT (1ull << 6)
-#define VMX_EPT_DEFAULT_MT 0x6ull
+#define VMX_EPTP_PWL_MASK 0x38ull
+#define VMX_EPTP_PWL_4 0x18ull
+#define VMX_EPTP_PWL_5 0x20ull
+#define VMX_EPTP_AD_ENABLE_BIT (1ull << 6)
+#define VMX_EPTP_MT_MASK 0x7ull
+#define VMX_EPTP_MT_WB 0x6ull
+#define VMX_EPTP_MT_UC 0x0ull
#define VMX_EPT_READABLE_MASK 0x1ull
#define VMX_EPT_WRITABLE_MASK 0x2ull
#define VMX_EPT_EXECUTABLE_MASK 0x4ull
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 8417ef7c3885..07b6531813c4 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -158,9 +158,6 @@ static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn)
unsigned long pfn;
int ret;
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return mfn;
-
if (unlikely(mfn >= machine_to_phys_nr))
return ~0;
@@ -317,8 +314,6 @@ static inline pte_t __pte_ma(pteval_t x)
#define p4d_val_ma(x) ((x).p4d)
#endif
-void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid);
-
xmaddr_t arbitrary_virt_to_machine(void *address);
unsigned long arbitrary_virt_to_mfn(void *vaddr);
void make_lowmem_page_readonly(void *vaddr);
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 7032f4d8dff3..f65d12504e80 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -153,12 +153,6 @@
#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11)
/*
- * HV_VP_SET available
- */
-#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11)
-
-
-/*
* Crash notification flag.
*/
#define HV_CRASH_CTL_CRASH_NOTIFY (1ULL << 63)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index f8ae286c1502..079535e53e2a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1373,7 +1373,7 @@ static void __init acpi_reduced_hw_init(void)
* If your system is blacklisted here, but you find that acpi=force
* works for you, please contact linux-acpi@vger.kernel.org
*/
-static struct dmi_system_id __initdata acpi_dmi_table[] = {
+static const struct dmi_system_id acpi_dmi_table[] __initconst = {
/*
* Boxes that need ACPI disabled
*/
@@ -1448,7 +1448,7 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
};
/* second table for DMI checks that should run after early-quirks */
-static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
+static const struct dmi_system_id acpi_dmi_table_late[] __initconst = {
/*
* HP laptops which use a DSDT reporting as HP/SB400/10000,
* which includes some code which overrides all temperature
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 7834f73efbf1..d705c769f77d 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2097,7 +2097,7 @@ static int allocate_logical_cpuid(int apicid)
/* Allocate a new cpuid. */
if (nr_logical_cpuids >= nr_cpu_ids) {
- WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %i reached. "
+ WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %u reached. "
"Processor %d/0x%x and the rest are ignored.\n",
nr_cpu_ids, nr_logical_cpuids, apicid);
return -EINVAL;
@@ -2130,7 +2130,7 @@ int generic_processor_info(int apicid, int version)
* Since fixing handling of boot_cpu_physical_apicid requires
* another discussion and tests on each platform, we leave it
* for now and here we use read_apic_id() directly in this
- * function, __generic_processor_info().
+ * function, generic_processor_info().
*/
if (disabled_cpu_apicid != BAD_APICID &&
disabled_cpu_apicid != read_apic_id() &&
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 446b0d3d4932..e4b0d92b3ae0 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -2043,7 +2043,7 @@ static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d)
return 0;
}
-static struct dmi_system_id __initdata apm_dmi_table[] = {
+static const struct dmi_system_id apm_dmi_table[] __initconst = {
{
print_if_true,
KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.",
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index fb1d3358a4af..775f10100d7f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -169,21 +169,21 @@ static int __init x86_mpx_setup(char *s)
__setup("nompx", x86_mpx_setup);
#ifdef CONFIG_X86_64
-static int __init x86_pcid_setup(char *s)
+static int __init x86_nopcid_setup(char *s)
{
- /* require an exact match without trailing characters */
- if (strlen(s))
- return 0;
+ /* nopcid doesn't accept parameters */
+ if (s)
+ return -EINVAL;
/* do not emit a message if the feature is not present */
if (!boot_cpu_has(X86_FEATURE_PCID))
- return 1;
+ return 0;
setup_clear_cpu_cap(X86_FEATURE_PCID);
pr_info("nopcid: PCID feature disabled\n");
- return 1;
+ return 0;
}
-__setup("nopcid", x86_pcid_setup);
+early_param("nopcid", x86_nopcid_setup);
#endif
static int __init x86_noinvpcid_setup(char *s)
@@ -329,38 +329,6 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
}
}
-static void setup_pcid(struct cpuinfo_x86 *c)
-{
- if (cpu_has(c, X86_FEATURE_PCID)) {
- if (cpu_has(c, X86_FEATURE_PGE)) {
- /*
- * We'd like to use cr4_set_bits_and_update_boot(),
- * but we can't. CR4.PCIDE is special and can only
- * be set in long mode, and the early CPU init code
- * doesn't know this and would try to restore CR4.PCIDE
- * prior to entering long mode.
- *
- * Instead, we rely on the fact that hotplug, resume,
- * etc all fully restore CR4 before they write anything
- * that could have nonzero PCID bits to CR3. CR4.PCIDE
- * has no effect on the page tables themselves, so we
- * don't need it to be restored early.
- */
- cr4_set_bits(X86_CR4_PCIDE);
- } else {
- /*
- * flush_tlb_all(), as currently implemented, won't
- * work if PCID is on but PGE is not. Since that
- * combination doesn't exist on real hardware, there's
- * no reason to try to fully support it, but it's
- * polite to avoid corrupting data if we're on
- * an improperly configured VM.
- */
- clear_cpu_cap(c, X86_FEATURE_PCID);
- }
- }
-}
-
/*
* Protection Keys are not available in 32-bit mode.
*/
@@ -1175,9 +1143,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
setup_smep(c);
setup_smap(c);
- /* Set up PCID */
- setup_pcid(c);
-
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 3b3f713e15e5..236324e83a3a 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -59,8 +59,6 @@ void hyperv_vector_handler(struct pt_regs *regs)
void hv_setup_vmbus_irq(void (*handler)(void))
{
vmbus_handler = handler;
- /* Setup the IDT for hypervisor callback */
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
}
void hv_remove_vmbus_irq(void)
@@ -251,6 +249,8 @@ static void __init ms_hyperv_init_platform(void)
*/
x86_platform.apic_post_init = hyperv_init;
hyperv_setup_mmu_ops();
+ /* Setup the IDT for hypervisor callback */
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
#endif
}
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 50c89e8a95f2..7ebcc4a74438 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -58,7 +58,7 @@ static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
* Quirk table for systems that misbehave (lock up, etc.) if port
* 0x80 is used:
*/
-static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
+static const struct dmi_system_id io_delay_0xed_port_dmi_table[] __initconst = {
{
.callback = dmi_io_delay_0xed_port,
.ident = "Compaq Presario V6000",
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index a14df9eecfed..19a3e8f961c7 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -327,7 +327,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.set_debugreg = native_set_debugreg,
.read_cr0 = native_read_cr0,
.write_cr0 = native_write_cr0,
- .read_cr4 = native_read_cr4,
.write_cr4 = native_write_cr4,
#ifdef CONFIG_X86_64
.read_cr8 = native_read_cr8,
@@ -343,7 +342,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.set_ldt = native_set_ldt,
.load_gdt = native_load_gdt,
.load_idt = native_load_idt,
- .store_idt = native_store_idt,
.store_tr = native_store_tr,
.load_tls = native_load_tls,
#ifdef CONFIG_X86_64
@@ -411,8 +409,6 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
.set_pmd = native_set_pmd,
- .set_pmd_at = native_set_pmd_at,
- .pte_update = paravirt_nop,
.ptep_modify_prot_start = __ptep_modify_prot_start,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
@@ -424,7 +420,6 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.pmd_clear = native_pmd_clear,
#endif
.set_pud = native_set_pud,
- .set_pud_at = native_set_pud_at,
.pmd_val = PTE_IDENT,
.make_pmd = PTE_IDENT,
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 54984b142641..54180fa6f66f 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -150,7 +150,7 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)
/*
* This is a single dmi_table handling all reboot quirks.
*/
-static struct dmi_system_id __initdata reboot_dmi_table[] = {
+static const struct dmi_system_id reboot_dmi_table[] __initconst = {
/* Acer */
{ /* Handle reboot issue on Acer Aspire one */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d84afb0a322d..0957dd73d127 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1178,8 +1178,11 @@ void __init setup_arch(char **cmdline_p)
* with the current CR4 value. This may not be necessary, but
* auditing all the early-boot CR4 manipulation would be needed to
* rule it out.
+ *
+ * Mask off features that don't work outside long mode (just
+ * PCIDE for now).
*/
- mmu_cr4_features = __read_cr4();
+ mmu_cr4_features = __read_cr4() & ~X86_CR4_PCIDE;
memblock_set_current_limit(get_max_mapped());
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 6e8fcb6f7e1e..28dafed6c682 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -168,7 +168,7 @@ void __init setup_per_cpu_areas(void)
unsigned long delta;
int rc;
- pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
+ pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%d\n",
NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
/*
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index 71beb28600d4..ab9feb5887b1 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -129,7 +129,7 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
3 ints plus the relevant union member. */
put_user_ex(from->si_signo, &to->si_signo);
put_user_ex(from->si_errno, &to->si_errno);
- put_user_ex((short)from->si_code, &to->si_code);
+ put_user_ex(from->si_code, &to->si_code);
if (from->si_code < 0) {
put_user_ex(from->si_pid, &to->si_pid);
@@ -142,8 +142,8 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
*/
put_user_ex(from->_sifields._pad[0],
&to->_sifields._pad[0]);
- switch (from->si_code >> 16) {
- case __SI_FAULT >> 16:
+ switch (siginfo_layout(from->si_signo, from->si_code)) {
+ case SIL_FAULT:
if (from->si_signo == SIGBUS &&
(from->si_code == BUS_MCEERR_AR ||
from->si_code == BUS_MCEERR_AO))
@@ -160,11 +160,11 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
put_user_ex(from->si_pkey, &to->si_pkey);
}
break;
- case __SI_SYS >> 16:
+ case SIL_SYS:
put_user_ex(from->si_syscall, &to->si_syscall);
put_user_ex(from->si_arch, &to->si_arch);
break;
- case __SI_CHLD >> 16:
+ case SIL_CHLD:
if (!x32_ABI) {
put_user_ex(from->si_utime, &to->si_utime);
put_user_ex(from->si_stime, &to->si_stime);
@@ -174,21 +174,18 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
}
put_user_ex(from->si_status, &to->si_status);
/* FALL THROUGH */
- default:
- case __SI_KILL >> 16:
+ case SIL_KILL:
put_user_ex(from->si_uid, &to->si_uid);
break;
- case __SI_POLL >> 16:
+ case SIL_POLL:
put_user_ex(from->si_fd, &to->si_fd);
break;
- case __SI_TIMER >> 16:
+ case SIL_TIMER:
put_user_ex(from->si_overrun, &to->si_overrun);
put_user_ex(ptr_to_compat(from->si_ptr),
&to->si_ptr);
break;
- /* This is not generated by the kernel as of now. */
- case __SI_RT >> 16:
- case __SI_MESGQ >> 16:
+ case SIL_RT:
put_user_ex(from->si_uid, &to->si_uid);
put_user_ex(from->si_int, &to->si_int);
break;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 54b9e89d4d6b..0854ff169274 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -226,10 +226,12 @@ static int enable_start_cpu0;
static void notrace start_secondary(void *unused)
{
/*
- * Don't put *anything* before cpu_init(), SMP booting is too
- * fragile that we want to limit the things done here to the
- * most necessary things.
+ * Don't put *anything* except direct CPU state initialization
+ * before cpu_init(), SMP booting is too fragile that we want to
+ * limit the things done here to the most necessary things.
*/
+ if (boot_cpu_has(X86_FEATURE_PCID))
+ __write_cr4(__read_cr4() | X86_CR4_PCIDE);
cpu_init();
x86_cpuinit.early_percpu_clock_init();
preempt_disable();
@@ -1461,7 +1463,7 @@ __init void prefill_possible_map(void)
/* nr_cpu_ids could be reduced via nr_cpus= */
if (possible > nr_cpu_ids) {
- pr_warn("%d Processors exceeds NR_CPUS limit of %d\n",
+ pr_warn("%d Processors exceeds NR_CPUS limit of %u\n",
possible, nr_cpu_ids);
possible = nr_cpu_ids;
}
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 19adbb418443..0099e10eb045 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -126,16 +126,20 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
/*
- * The existing code assumes virtual address is 48-bit in the canonical
- * address checks; exit if it is ever changed.
+ * The existing code assumes virtual address is 48-bit or 57-bit in the
+ * canonical address checks; exit if it is ever changed.
*/
best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
- if (best && ((best->eax & 0xff00) >> 8) != 48 &&
- ((best->eax & 0xff00) >> 8) != 0)
- return -EINVAL;
+ if (best) {
+ int vaddr_bits = (best->eax & 0xff00) >> 8;
+
+ if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
+ return -EINVAL;
+ }
/* Update physical-address width */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+ kvm_mmu_reset_context(vcpu);
kvm_pmu_refresh(vcpu);
return 0;
@@ -383,7 +387,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 7.0.ecx*/
const u32 kvm_cpuid_7_0_ecx_x86_features =
- F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);
+ F(AVX512VBMI) | F(LA57) | F(PKU) |
+ 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
@@ -853,16 +858,24 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
}
-void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool check_limit)
{
u32 function = *eax, index = *ecx;
struct kvm_cpuid_entry2 *best;
+ bool entry_found = true;
best = kvm_find_cpuid_entry(vcpu, function, index);
- if (!best)
+ if (!best) {
+ entry_found = false;
+ if (!check_limit)
+ goto out;
+
best = check_cpuid_limit(vcpu, function, index);
+ }
+out:
if (best) {
*eax = best->eax;
*ebx = best->ebx;
@@ -870,7 +883,8 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
*edx = best->edx;
} else
*eax = *ebx = *ecx = *edx = 0;
- trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx);
+ trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found);
+ return entry_found;
}
EXPORT_SYMBOL_GPL(kvm_cpuid);
@@ -883,7 +897,7 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
- kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx);
+ kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, true);
kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index da6728383052..1ea3c0e1e3a9 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -3,6 +3,7 @@
#include "x86.h"
#include <asm/cpu.h>
+#include <asm/processor.h>
int kvm_update_cpuid(struct kvm_vcpu *vcpu);
bool kvm_mpx_supported(void);
@@ -20,7 +21,8 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
-void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool check_limit);
int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
@@ -29,95 +31,87 @@ static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
return vcpu->arch.maxphyaddr;
}
-static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- if (!static_cpu_has(X86_FEATURE_XSAVE))
- return false;
-
- best = kvm_find_cpuid_entry(vcpu, 1, 0);
- return best && (best->ecx & bit(X86_FEATURE_XSAVE));
-}
-
-static inline bool guest_cpuid_has_mtrr(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 1, 0);
- return best && (best->edx & bit(X86_FEATURE_MTRR));
-}
-
-static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 7, 0);
- return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST));
-}
+struct cpuid_reg {
+ u32 function;
+ u32 index;
+ int reg;
+};
-static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 7, 0);
- return best && (best->ebx & bit(X86_FEATURE_SMEP));
-}
+static const struct cpuid_reg reverse_cpuid[] = {
+ [CPUID_1_EDX] = { 1, 0, CPUID_EDX},
+ [CPUID_8000_0001_EDX] = {0x80000001, 0, CPUID_EDX},
+ [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX},
+ [CPUID_1_ECX] = { 1, 0, CPUID_ECX},
+ [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX},
+ [CPUID_8000_0001_ECX] = {0xc0000001, 0, CPUID_ECX},
+ [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX},
+ [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX},
+ [CPUID_F_0_EDX] = { 0xf, 0, CPUID_EDX},
+ [CPUID_F_1_EDX] = { 0xf, 1, CPUID_EDX},
+ [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX},
+ [CPUID_6_EAX] = { 6, 0, CPUID_EAX},
+ [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
+ [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
+ [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
+};
-static inline bool guest_cpuid_has_smap(struct kvm_vcpu *vcpu)
+static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 7, 0);
- return best && (best->ebx & bit(X86_FEATURE_SMAP));
-}
+ unsigned x86_leaf = x86_feature / 32;
-static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
+ BUILD_BUG_ON(!__builtin_constant_p(x86_leaf));
+ BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
+ BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
- best = kvm_find_cpuid_entry(vcpu, 7, 0);
- return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
+ return reverse_cpuid[x86_leaf];
}
-static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu)
+static __always_inline int *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsigned x86_feature)
{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 7, 0);
- return best && (best->ecx & bit(X86_FEATURE_PKU));
-}
+ struct kvm_cpuid_entry2 *entry;
+ const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
-static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
+ entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index);
+ if (!entry)
+ return NULL;
- best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- return best && (best->edx & bit(X86_FEATURE_LM));
+ switch (cpuid.reg) {
+ case CPUID_EAX:
+ return &entry->eax;
+ case CPUID_EBX:
+ return &entry->ebx;
+ case CPUID_ECX:
+ return &entry->ecx;
+ case CPUID_EDX:
+ return &entry->edx;
+ default:
+ BUILD_BUG();
+ return NULL;
+ }
}
-static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
+static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_feature)
{
- struct kvm_cpuid_entry2 *best;
+ int *reg;
- best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- return best && (best->ecx & bit(X86_FEATURE_OSVW));
-}
+ if (x86_feature == X86_FEATURE_XSAVE &&
+ !static_cpu_has(X86_FEATURE_XSAVE))
+ return false;
-static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
+ reg = guest_cpuid_get_register(vcpu, x86_feature);
+ if (!reg)
+ return false;
- best = kvm_find_cpuid_entry(vcpu, 1, 0);
- return best && (best->ecx & bit(X86_FEATURE_PCID));
+ return *reg & bit(x86_feature);
}
-static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu)
+static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature)
{
- struct kvm_cpuid_entry2 *best;
+ int *reg;
- best = kvm_find_cpuid_entry(vcpu, 1, 0);
- return best && (best->ecx & bit(X86_FEATURE_X2APIC));
+ reg = guest_cpuid_get_register(vcpu, x86_feature);
+ if (reg)
+ *reg &= ~bit(x86_feature);
}
static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu)
@@ -128,58 +122,6 @@ static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu)
return best && best->ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx;
}
-static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- return best && (best->edx & bit(X86_FEATURE_GBPAGES));
-}
-
-static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 7, 0);
- return best && (best->ebx & bit(X86_FEATURE_RTM));
-}
-
-static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 7, 0);
- return best && (best->ebx & bit(X86_FEATURE_MPX));
-}
-
-static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- return best && (best->edx & bit(X86_FEATURE_RDTSCP));
-}
-
-/*
- * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3
- */
-#define BIT_NRIPS 3
-
-static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0);
-
- /*
- * NRIPS is a scattered cpuid feature, so we can't use
- * X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit
- * position 8, not 3).
- */
- return best && (best->edx & bit(BIT_NRIPS));
-}
-#undef BIT_NRIPS
-
static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index fb0055953fbc..16bf6655aa85 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -28,6 +28,7 @@
#include "x86.h"
#include "tss.h"
+#include "mmu.h"
/*
* Operand types
@@ -688,16 +689,18 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
ulong la;
u32 lim;
u16 sel;
+ u8 va_bits;
la = seg_base(ctxt, addr.seg) + addr.ea;
*max_size = 0;
switch (mode) {
case X86EMUL_MODE_PROT64:
*linear = la;
- if (is_noncanonical_address(la))
+ va_bits = ctxt_virt_addr_bits(ctxt);
+ if (get_canonical(la, va_bits) != la)
goto bad;
- *max_size = min_t(u64, ~0u, (1ull << 48) - la);
+ *max_size = min_t(u64, ~0u, (1ull << va_bits) - la);
if (size > *max_size)
goto bad;
break;
@@ -1748,8 +1751,8 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
sizeof(base3), &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
return ret;
- if (is_noncanonical_address(get_desc_base(&seg_desc) |
- ((u64)base3 << 32)))
+ if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
+ ((u64)base3 << 32), ctxt))
return emulate_gp(ctxt, 0);
}
load:
@@ -2333,7 +2336,7 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
eax = 0x80000001;
ecx = 0;
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
return edx & bit(X86_FEATURE_LM);
}
@@ -2636,7 +2639,7 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
u32 eax, ebx, ecx, edx;
eax = ecx = 0;
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
&& ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
&& edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
@@ -2656,7 +2659,7 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
eax = 0x00000000;
ecx = 0x00000000;
- ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
/*
* Intel ("GenuineIntel")
* remark: Intel CPUs only support "syscall" in 64bit
@@ -2840,8 +2843,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
ss_sel = cs_sel + 8;
cs.d = 0;
cs.l = 1;
- if (is_noncanonical_address(rcx) ||
- is_noncanonical_address(rdx))
+ if (emul_is_noncanonical_address(rcx, ctxt) ||
+ emul_is_noncanonical_address(rdx, ctxt))
return emulate_gp(ctxt, 0);
break;
}
@@ -3551,7 +3554,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt)
/*
* Check MOVBE is set in the guest-visible CPUID leaf.
*/
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
if (!(ecx & FFL(MOVBE)))
return emulate_ud(ctxt);
@@ -3756,7 +3759,7 @@ static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt)
if (rc != X86EMUL_CONTINUE)
return rc;
if (ctxt->mode == X86EMUL_MODE_PROT64 &&
- is_noncanonical_address(desc_ptr.address))
+ emul_is_noncanonical_address(desc_ptr.address, ctxt))
return emulate_gp(ctxt, 0);
if (lgdt)
ctxt->ops->set_gdt(ctxt, &desc_ptr);
@@ -3865,7 +3868,7 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt)
eax = reg_read(ctxt, VCPU_REGS_RAX);
ecx = reg_read(ctxt, VCPU_REGS_RCX);
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
*reg_write(ctxt, VCPU_REGS_RAX) = eax;
*reg_write(ctxt, VCPU_REGS_RBX) = ebx;
*reg_write(ctxt, VCPU_REGS_RCX) = ecx;
@@ -3924,7 +3927,7 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt)
{
u32 eax = 1, ebx, ecx = 0, edx;
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
if (!(edx & FFL(FXSR)))
return emulate_ud(ctxt);
@@ -4097,8 +4100,17 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
u64 rsvd = 0;
ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
- if (efer & EFER_LMA)
- rsvd = CR3_L_MODE_RESERVED_BITS & ~CR3_PCID_INVD;
+ if (efer & EFER_LMA) {
+ u64 maxphyaddr;
+ u32 eax = 0x80000008;
+
+ if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL,
+ NULL, false))
+ maxphyaddr = eax & 0xff;
+ else
+ maxphyaddr = 36;
+ rsvd = rsvd_bits(maxphyaddr, 62);
+ }
if (new_val & rsvd)
return emulate_gp(ctxt, 0);
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 337b6d2730fa..dc97f2544b6f 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1160,6 +1160,12 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return stimer_get_count(vcpu_to_stimer(vcpu, timer_index),
pdata);
}
+ case HV_X64_MSR_TSC_FREQUENCY:
+ data = (u64)vcpu->arch.virtual_tsc_khz * 1000;
+ break;
+ case HV_X64_MSR_APIC_FREQUENCY:
+ data = APIC_BUS_FREQUENCY;
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
return 1;
@@ -1268,7 +1274,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
switch (code) {
case HVCALL_NOTIFY_LONG_SPIN_WAIT:
- kvm_vcpu_on_spin(vcpu);
+ kvm_vcpu_on_spin(vcpu, true);
break;
case HVCALL_POST_MESSAGE:
case HVCALL_SIGNAL_EVENT:
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index e1e89ee4af75..9add410f195f 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -4,7 +4,7 @@
#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
#define KVM_POSSIBLE_CR4_GUEST_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT | X86_CR4_PGE)
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE)
static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 589dcc117086..aaf10b6f5380 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -54,8 +54,6 @@
#define PRIu64 "u"
#define PRIo64 "o"
-#define APIC_BUS_CYCLE_NS 1
-
/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
#define apic_debug(fmt, arg...)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 29caa2c3dff9..215721e1426a 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -12,6 +12,9 @@
#define KVM_APIC_SHORT_MASK 0xc0000
#define KVM_APIC_DEST_MASK 0x800
+#define APIC_BUS_CYCLE_NS 1
+#define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS)
+
struct kvm_timer {
struct hrtimer timer;
s64 period; /* unit: ns */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 04d750813c9d..eca30c1eb1d9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2169,8 +2169,8 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
}
struct mmu_page_path {
- struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
- unsigned int idx[PT64_ROOT_LEVEL];
+ struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
+ unsigned int idx[PT64_ROOT_MAX_LEVEL];
};
#define for_each_sp(pvec, sp, parents, i) \
@@ -2385,8 +2385,8 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
iterator->level = vcpu->arch.mmu.shadow_root_level;
- if (iterator->level == PT64_ROOT_LEVEL &&
- vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
+ if (iterator->level == PT64_ROOT_4LEVEL &&
+ vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL &&
!vcpu->arch.mmu.direct_map)
--iterator->level;
@@ -2610,9 +2610,7 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
sp = list_last_entry(&kvm->arch.active_mmu_pages,
struct kvm_mmu_page, link);
- kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
-
- return true;
+ return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
}
/*
@@ -3262,7 +3260,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
-static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
+static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
gfn_t gfn, bool prefault)
@@ -3302,7 +3300,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
- make_mmu_pages_available(vcpu);
+ if (make_mmu_pages_available(vcpu) < 0)
+ goto out_unlock;
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
@@ -3326,8 +3325,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
- (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
+ if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL &&
+ (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL ||
vcpu->arch.mmu.direct_map)) {
hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -3379,10 +3378,14 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
struct kvm_mmu_page *sp;
unsigned i;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) {
spin_lock(&vcpu->kvm->mmu_lock);
- make_mmu_pages_available(vcpu);
- sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL);
+ if(make_mmu_pages_available(vcpu) < 0) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return 1;
+ }
+ sp = kvm_mmu_get_page(vcpu, 0, 0,
+ vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = __pa(sp->spt);
@@ -3392,7 +3395,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
MMU_WARN_ON(VALID_PAGE(root));
spin_lock(&vcpu->kvm->mmu_lock);
- make_mmu_pages_available(vcpu);
+ if (make_mmu_pages_available(vcpu) < 0) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return 1;
+ }
sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
root = __pa(sp->spt);
@@ -3423,15 +3429,18 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* Do we shadow a long mode page table? If so we need to
* write-protect the guests page table root.
*/
- if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
MMU_WARN_ON(VALID_PAGE(root));
spin_lock(&vcpu->kvm->mmu_lock);
- make_mmu_pages_available(vcpu);
- sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
- 0, ACC_ALL);
+ if (make_mmu_pages_available(vcpu) < 0) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return 1;
+ }
+ sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
+ vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3445,7 +3454,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* the shadow page table may be a PAE or a long mode page table.
*/
pm_mask = PT_PRESENT_MASK;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
for (i = 0; i < 4; ++i) {
@@ -3463,7 +3472,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
return 1;
}
spin_lock(&vcpu->kvm->mmu_lock);
- make_mmu_pages_available(vcpu);
+ if (make_mmu_pages_available(vcpu) < 0) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return 1;
+ }
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
0, ACC_ALL);
root = __pa(sp->spt);
@@ -3478,7 +3490,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* If we shadow a 32 bit page table with a long mode page
* table we enter this path.
*/
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
if (vcpu->arch.mmu.lm_root == NULL) {
/*
* The additional page necessary for this is only
@@ -3523,7 +3535,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
- if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
@@ -3588,6 +3600,13 @@ static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
+ /*
+ * A nested guest cannot use the MMIO cache if it is using nested
+ * page tables, because cr2 is a nGPA while the cache stores GPAs.
+ */
+ if (mmu_is_nested(vcpu))
+ return false;
+
if (direct)
return vcpu_match_mmio_gpa(vcpu, addr);
@@ -3599,7 +3618,7 @@ static bool
walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
{
struct kvm_shadow_walk_iterator iterator;
- u64 sptes[PT64_ROOT_LEVEL], spte = 0ull;
+ u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
int root, leaf;
bool reserved = false;
@@ -3640,7 +3659,23 @@ exit:
return reserved;
}
-int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+/*
+ * Return values of handle_mmio_page_fault:
+ * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
+ * directly.
+ * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
+ * fault path update the mmio spte.
+ * RET_MMIO_PF_RETRY: let CPU fault again on the address.
+ * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
+ */
+enum {
+ RET_MMIO_PF_EMULATE = 1,
+ RET_MMIO_PF_INVALID = 2,
+ RET_MMIO_PF_RETRY = 0,
+ RET_MMIO_PF_BUG = -1
+};
+
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
u64 spte;
bool reserved;
@@ -3872,7 +3907,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
- make_mmu_pages_available(vcpu);
+ if (make_mmu_pages_available(vcpu) < 0)
+ goto out_unlock;
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
@@ -4025,7 +4061,13 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
rsvd_check->rsvd_bits_mask[1][0] =
rsvd_check->rsvd_bits_mask[0][0];
break;
- case PT64_ROOT_LEVEL:
+ case PT64_ROOT_5LEVEL:
+ rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
+ nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
+ rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[1][4] =
+ rsvd_check->rsvd_bits_mask[0][4];
+ case PT64_ROOT_4LEVEL:
rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
rsvd_bits(maxphyaddr, 51);
@@ -4055,7 +4097,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
{
__reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
cpuid_maxphyaddr(vcpu), context->root_level,
- context->nx, guest_cpuid_has_gbpages(vcpu),
+ context->nx,
+ guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
is_pse(vcpu), guest_cpuid_is_amd(vcpu));
}
@@ -4065,6 +4108,8 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
{
u64 bad_mt_xwr;
+ rsvd_check->rsvd_bits_mask[0][4] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
rsvd_check->rsvd_bits_mask[0][3] =
rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
rsvd_check->rsvd_bits_mask[0][2] =
@@ -4074,6 +4119,7 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
/* large page */
+ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
rsvd_check->rsvd_bits_mask[1][2] =
rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
@@ -4120,8 +4166,8 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
boot_cpu_data.x86_phys_bits,
context->shadow_root_level, uses_nx,
- guest_cpuid_has_gbpages(vcpu), is_pse(vcpu),
- true);
+ guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
+ is_pse(vcpu), true);
if (!shadow_me_mask)
return;
@@ -4185,66 +4231,85 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
boot_cpu_data.x86_phys_bits, execonly);
}
+#define BYTE_MASK(access) \
+ ((1 & (access) ? 2 : 0) | \
+ (2 & (access) ? 4 : 0) | \
+ (3 & (access) ? 8 : 0) | \
+ (4 & (access) ? 16 : 0) | \
+ (5 & (access) ? 32 : 0) | \
+ (6 & (access) ? 64 : 0) | \
+ (7 & (access) ? 128 : 0))
+
+
static void update_permission_bitmask(struct kvm_vcpu *vcpu,
struct kvm_mmu *mmu, bool ept)
{
- unsigned bit, byte, pfec;
- u8 map;
- bool fault, x, w, u, wf, uf, ff, smapf, cr4_smap, cr4_smep, smap = 0;
+ unsigned byte;
+
+ const u8 x = BYTE_MASK(ACC_EXEC_MASK);
+ const u8 w = BYTE_MASK(ACC_WRITE_MASK);
+ const u8 u = BYTE_MASK(ACC_USER_MASK);
+
+ bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
+ bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
+ bool cr0_wp = is_write_protection(vcpu);
- cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
- cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
- pfec = byte << 1;
- map = 0;
- wf = pfec & PFERR_WRITE_MASK;
- uf = pfec & PFERR_USER_MASK;
- ff = pfec & PFERR_FETCH_MASK;
+ unsigned pfec = byte << 1;
+
/*
- * PFERR_RSVD_MASK bit is set in PFEC if the access is not
- * subject to SMAP restrictions, and cleared otherwise. The
- * bit is only meaningful if the SMAP bit is set in CR4.
+ * Each "*f" variable has a 1 bit for each UWX value
+ * that causes a fault with the given PFEC.
*/
- smapf = !(pfec & PFERR_RSVD_MASK);
- for (bit = 0; bit < 8; ++bit) {
- x = bit & ACC_EXEC_MASK;
- w = bit & ACC_WRITE_MASK;
- u = bit & ACC_USER_MASK;
-
- if (!ept) {
- /* Not really needed: !nx will cause pte.nx to fault */
- x |= !mmu->nx;
- /* Allow supervisor writes if !cr0.wp */
- w |= !is_write_protection(vcpu) && !uf;
- /* Disallow supervisor fetches of user code if cr4.smep */
- x &= !(cr4_smep && u && !uf);
-
- /*
- * SMAP:kernel-mode data accesses from user-mode
- * mappings should fault. A fault is considered
- * as a SMAP violation if all of the following
- * conditions are ture:
- * - X86_CR4_SMAP is set in CR4
- * - A user page is accessed
- * - Page fault in kernel mode
- * - if CPL = 3 or X86_EFLAGS_AC is clear
- *
- * Here, we cover the first three conditions.
- * The fourth is computed dynamically in
- * permission_fault() and is in smapf.
- *
- * Also, SMAP does not affect instruction
- * fetches, add the !ff check here to make it
- * clearer.
- */
- smap = cr4_smap && u && !uf && !ff;
- }
- fault = (ff && !x) || (uf && !u) || (wf && !w) ||
- (smapf && smap);
- map |= fault << bit;
+ /* Faults from writes to non-writable pages */
+ u8 wf = (pfec & PFERR_WRITE_MASK) ? ~w : 0;
+ /* Faults from user mode accesses to supervisor pages */
+ u8 uf = (pfec & PFERR_USER_MASK) ? ~u : 0;
+ /* Faults from fetches of non-executable pages*/
+ u8 ff = (pfec & PFERR_FETCH_MASK) ? ~x : 0;
+ /* Faults from kernel mode fetches of user pages */
+ u8 smepf = 0;
+ /* Faults from kernel mode accesses of user pages */
+ u8 smapf = 0;
+
+ if (!ept) {
+ /* Faults from kernel mode accesses to user pages */
+ u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
+
+ /* Not really needed: !nx will cause pte.nx to fault */
+ if (!mmu->nx)
+ ff = 0;
+
+ /* Allow supervisor writes if !cr0.wp */
+ if (!cr0_wp)
+ wf = (pfec & PFERR_USER_MASK) ? wf : 0;
+
+ /* Disallow supervisor fetches of user code if cr4.smep */
+ if (cr4_smep)
+ smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
+
+ /*
+ * SMAP:kernel-mode data accesses from user-mode
+ * mappings should fault. A fault is considered
+ * as a SMAP violation if all of the following
+ * conditions are ture:
+ * - X86_CR4_SMAP is set in CR4
+ * - A user page is accessed
+ * - The access is not a fetch
+ * - Page fault in kernel mode
+ * - if CPL = 3 or X86_EFLAGS_AC is clear
+ *
+ * Here, we cover the first three conditions.
+ * The fourth is computed dynamically in permission_fault();
+ * PFERR_RSVD_MASK bit will be set in PFEC if the access is
+ * *not* subject to SMAP restrictions.
+ */
+ if (cr4_smap)
+ smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
}
- mmu->permissions[byte] = map;
+
+ mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
}
}
@@ -4358,7 +4423,10 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
static void paging64_init_context(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
{
- paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
+ int root_level = is_la57_mode(vcpu) ?
+ PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
+
+ paging64_init_context_common(vcpu, context, root_level);
}
static void paging32_init_context(struct kvm_vcpu *vcpu,
@@ -4399,7 +4467,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
context->update_pte = nonpaging_update_pte;
- context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+ context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
context->root_hpa = INVALID_PAGE;
context->direct_map = true;
context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
@@ -4413,7 +4481,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->root_level = 0;
} else if (is_long_mode(vcpu)) {
context->nx = is_nx(vcpu);
- context->root_level = PT64_ROOT_LEVEL;
+ context->root_level = is_la57_mode(vcpu) ?
+ PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
reset_rsvds_bits_mask(vcpu, context);
context->gva_to_gpa = paging64_gva_to_gpa;
} else if (is_pae(vcpu)) {
@@ -4470,7 +4539,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
MMU_WARN_ON(VALID_PAGE(context->root_hpa));
- context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+ context->shadow_root_level = PT64_ROOT_4LEVEL;
context->nx = true;
context->ept_ad = accessed_dirty;
@@ -4479,7 +4548,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->sync_page = ept_sync_page;
context->invlpg = ept_invlpg;
context->update_pte = ept_update_pte;
- context->root_level = context->shadow_root_level;
+ context->root_level = PT64_ROOT_4LEVEL;
context->root_hpa = INVALID_PAGE;
context->direct_map = false;
context->base_role.ad_disabled = !accessed_dirty;
@@ -4524,7 +4593,8 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
} else if (is_long_mode(vcpu)) {
g_context->nx = is_nx(vcpu);
- g_context->root_level = PT64_ROOT_LEVEL;
+ g_context->root_level = is_la57_mode(vcpu) ?
+ PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
reset_rsvds_bits_mask(vcpu, g_context);
g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
} else if (is_pae(vcpu)) {
@@ -4814,12 +4884,12 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
-static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
+static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
{
LIST_HEAD(invalid_list);
if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
- return;
+ return 0;
while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
@@ -4828,6 +4898,10 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
++vcpu->kvm->stat.mmu_recycled;
}
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
+ if (!kvm_mmu_available_pages(vcpu->kvm))
+ return -ENOSPC;
+ return 0;
}
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
@@ -4835,7 +4909,13 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
{
int r, emulation_type = EMULTYPE_RETRY;
enum emulation_result er;
- bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu);
+ bool direct = vcpu->arch.mmu.direct_map;
+
+ /* With shadow page tables, fault_address contains a GVA or nGPA. */
+ if (vcpu->arch.mmu.direct_map) {
+ vcpu->arch.gpa_available = true;
+ vcpu->arch.gpa_val = cr2;
+ }
if (unlikely(error_code & PFERR_RSVD_MASK)) {
r = handle_mmio_page_fault(vcpu, cr2, direct);
@@ -4847,6 +4927,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
return 1;
if (r < 0)
return r;
+ /* Must be RET_MMIO_PF_INVALID. */
}
r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
@@ -4862,11 +4943,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
* This can occur when using nested virtualization with nested
* paging in both guests. If true, we simply unprotect the page
* and resume the guest.
- *
- * Note: AMD only (since it supports the PFERR_GUEST_PAGE_MASK used
- * in PFERR_NEXT_GUEST_PAGE)
*/
- if (error_code == PFERR_NESTED_GUEST_PAGE) {
+ if (vcpu->arch.mmu.direct_map &&
+ (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
return 1;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 4b9a3ae6b725..64a2dbd2b1af 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -37,7 +37,8 @@
#define PT32_DIR_PSE36_MASK \
(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
-#define PT64_ROOT_LEVEL 4
+#define PT64_ROOT_5LEVEL 5
+#define PT64_ROOT_4LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
@@ -48,6 +49,9 @@
static inline u64 rsvd_bits(int s, int e)
{
+ if (e < s)
+ return 0;
+
return ((1ULL << (e - s + 1)) - 1) << s;
}
@@ -56,23 +60,6 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value);
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
-/*
- * Return values of handle_mmio_page_fault:
- * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
- * directly.
- * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
- * fault path update the mmio spte.
- * RET_MMIO_PF_RETRY: let CPU fault again on the address.
- * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
- */
-enum {
- RET_MMIO_PF_EMULATE = 1,
- RET_MMIO_PF_INVALID = 2,
- RET_MMIO_PF_RETRY = 0,
- RET_MMIO_PF_BUG = -1
-};
-
-int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
bool accessed_dirty);
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index dcce533d420c..d22ddbdf5e6e 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -62,11 +62,11 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
- if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
- __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
+ __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level);
return;
}
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 0149ac59c273..e9ea2d45ae66 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -130,7 +130,7 @@ static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu)
* enable MTRRs and it is obviously undesirable to run the
* guest entirely with UC memory and we use WB.
*/
- if (guest_cpuid_has_mtrr(vcpu))
+ if (guest_cpuid_has(vcpu, X86_FEATURE_MTRR))
return MTRR_TYPE_UNCACHABLE;
else
return MTRR_TYPE_WRBACK;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index b0454c7e4cff..86b68dc5a649 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -790,8 +790,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
&map_writable))
return 0;
- if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
- walker.gfn, pfn, walker.pte_access, &r))
+ if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
return r;
/*
@@ -819,7 +818,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
goto out_unlock;
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
- make_mmu_pages_available(vcpu);
+ if (make_mmu_pages_available(vcpu) < 0)
+ goto out_unlock;
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8dbd8dbc83eb..2c1cfe68a9af 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -280,9 +280,9 @@ module_param(avic, int, S_IRUGO);
static int vls = true;
module_param(vls, int, 0444);
-/* AVIC VM ID bit masks and lock */
-static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR);
-static DEFINE_SPINLOCK(avic_vm_id_lock);
+/* enable/disable Virtual GIF */
+static int vgif = true;
+module_param(vgif, int, 0444);
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
static void svm_flush_tlb(struct kvm_vcpu *vcpu);
@@ -479,19 +479,33 @@ static inline void clr_intercept(struct vcpu_svm *svm, int bit)
recalc_intercepts(svm);
}
+static inline bool vgif_enabled(struct vcpu_svm *svm)
+{
+ return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
+}
+
static inline void enable_gif(struct vcpu_svm *svm)
{
- svm->vcpu.arch.hflags |= HF_GIF_MASK;
+ if (vgif_enabled(svm))
+ svm->vmcb->control.int_ctl |= V_GIF_MASK;
+ else
+ svm->vcpu.arch.hflags |= HF_GIF_MASK;
}
static inline void disable_gif(struct vcpu_svm *svm)
{
- svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+ if (vgif_enabled(svm))
+ svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
+ else
+ svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
}
static inline bool gif_set(struct vcpu_svm *svm)
{
- return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
+ if (vgif_enabled(svm))
+ return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
+ else
+ return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
}
static unsigned long iopm_base;
@@ -567,10 +581,10 @@ static inline void invlpga(unsigned long addr, u32 asid)
asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
}
-static int get_npt_level(void)
+static int get_npt_level(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
- return PT64_ROOT_LEVEL;
+ return PT64_ROOT_4LEVEL;
#else
return PT32E_ROOT_LEVEL;
#endif
@@ -641,7 +655,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
unsigned nr = vcpu->arch.exception.nr;
bool has_error_code = vcpu->arch.exception.has_error_code;
- bool reinject = vcpu->arch.exception.reinject;
+ bool reinject = vcpu->arch.exception.injected;
u32 error_code = vcpu->arch.exception.error_code;
/*
@@ -973,6 +987,7 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
static void disable_nmi_singlestep(struct vcpu_svm *svm)
{
svm->nmi_singlestep = false;
+
if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
/* Clear our flags if they were not set by the guest */
if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
@@ -989,6 +1004,8 @@ static void disable_nmi_singlestep(struct vcpu_svm *svm)
*/
#define SVM_VM_DATA_HASH_BITS 8
static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static u32 next_vm_id = 0;
+static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
/* Note:
@@ -1108,6 +1125,13 @@ static __init int svm_hardware_setup(void)
}
}
+ if (vgif) {
+ if (!boot_cpu_has(X86_FEATURE_VGIF))
+ vgif = false;
+ else
+ pr_info("Virtual GIF supported\n");
+ }
+
return 0;
err:
@@ -1305,6 +1329,12 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
}
+ if (vgif) {
+ clr_intercept(svm, INTERCEPT_STGI);
+ clr_intercept(svm, INTERCEPT_CLGI);
+ svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
+ }
+
mark_all_dirty(svm->vmcb);
enable_gif(svm);
@@ -1387,34 +1417,6 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return 0;
}
-static inline int avic_get_next_vm_id(void)
-{
- int id;
-
- spin_lock(&avic_vm_id_lock);
-
- /* AVIC VM ID is one-based. */
- id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1);
- if (id <= AVIC_VM_ID_MASK)
- __set_bit(id, avic_vm_id_bitmap);
- else
- id = -EAGAIN;
-
- spin_unlock(&avic_vm_id_lock);
- return id;
-}
-
-static inline int avic_free_vm_id(int id)
-{
- if (id <= 0 || id > AVIC_VM_ID_MASK)
- return -EINVAL;
-
- spin_lock(&avic_vm_id_lock);
- __clear_bit(id, avic_vm_id_bitmap);
- spin_unlock(&avic_vm_id_lock);
- return 0;
-}
-
static void avic_vm_destroy(struct kvm *kvm)
{
unsigned long flags;
@@ -1423,8 +1425,6 @@ static void avic_vm_destroy(struct kvm *kvm)
if (!avic)
return;
- avic_free_vm_id(vm_data->avic_vm_id);
-
if (vm_data->avic_logical_id_table_page)
__free_page(vm_data->avic_logical_id_table_page);
if (vm_data->avic_physical_id_table_page)
@@ -1438,19 +1438,16 @@ static void avic_vm_destroy(struct kvm *kvm)
static int avic_vm_init(struct kvm *kvm)
{
unsigned long flags;
- int vm_id, err = -ENOMEM;
+ int err = -ENOMEM;
struct kvm_arch *vm_data = &kvm->arch;
struct page *p_page;
struct page *l_page;
+ struct kvm_arch *ka;
+ u32 vm_id;
if (!avic)
return 0;
- vm_id = avic_get_next_vm_id();
- if (vm_id < 0)
- return vm_id;
- vm_data->avic_vm_id = (u32)vm_id;
-
/* Allocating physical APIC ID table (4KB) */
p_page = alloc_page(GFP_KERNEL);
if (!p_page)
@@ -1468,6 +1465,22 @@ static int avic_vm_init(struct kvm *kvm)
clear_page(page_address(l_page));
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ again:
+ vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
+ if (vm_id == 0) { /* id is 1-based, zero is not okay */
+ next_vm_id_wrapped = 1;
+ goto again;
+ }
+ /* Is it still in use? Only possible if wrapped at least once */
+ if (next_vm_id_wrapped) {
+ hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
+ struct kvm *k2 = container_of(ka, struct kvm, arch);
+ struct kvm_arch *vd2 = &k2->arch;
+ if (vd2->avic_vm_id == vm_id)
+ goto again;
+ }
+ }
+ vm_data->avic_vm_id = vm_id;
hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id);
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
@@ -1580,7 +1593,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
}
init_vmcb(svm);
- kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
+ kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true);
kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
if (kvm_vcpu_apicv_active(vcpu) && !init_event)
@@ -2384,7 +2397,7 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
- vcpu->arch.mmu.shadow_root_level = get_npt_level();
+ vcpu->arch.mmu.shadow_root_level = get_npt_level(vcpu);
reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu);
vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}
@@ -3147,6 +3160,13 @@ static int stgi_interception(struct vcpu_svm *svm)
if (nested_svm_check_permissions(svm))
return 1;
+ /*
+ * If VGIF is enabled, the STGI intercept is only added to
+ * detect the opening of the NMI window; remove it now.
+ */
+ if (vgif_enabled(svm))
+ clr_intercept(svm, INTERCEPT_STGI);
+
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
ret = kvm_skip_emulated_instruction(&svm->vcpu);
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
@@ -3744,7 +3764,10 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
static int pause_interception(struct vcpu_svm *svm)
{
- kvm_vcpu_on_spin(&(svm->vcpu));
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ bool in_kernel = (svm_get_cpl(vcpu) == 0);
+
+ kvm_vcpu_on_spin(vcpu, in_kernel);
return 1;
}
@@ -4228,8 +4251,6 @@ static int handle_exit(struct kvm_vcpu *vcpu)
trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
- vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF);
-
if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
vcpu->arch.cr0 = svm->vmcb->save.cr0;
if (npt_enabled)
@@ -4682,9 +4703,11 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
* In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
* 1, because that's a separate STGI/VMRUN intercept. The next time we
* get that intercept, this function will be called again though and
- * we'll get the vintr intercept.
+ * we'll get the vintr intercept. However, if the vGIF feature is
+ * enabled, the STGI interception will not occur. Enable the irq
+ * window under the assumption that the hardware will set the GIF.
*/
- if (gif_set(svm) && nested_svm_intr(svm)) {
+ if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) {
svm_set_vintr(svm);
svm_inject_irq(svm, 0x0);
}
@@ -4698,8 +4721,11 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
== HF_NMI_MASK)
return; /* IRET will cause a vm exit */
- if ((svm->vcpu.arch.hflags & HF_GIF_MASK) == 0)
+ if (!gif_set(svm)) {
+ if (vgif_enabled(svm))
+ set_intercept(svm, INTERCEPT_STGI);
return; /* STGI will cause a vm exit */
+ }
if (svm->nested.exit_required)
return; /* we're not going to run the guest yet */
@@ -5071,17 +5097,14 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
static void svm_cpuid_update(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct kvm_cpuid_entry2 *entry;
/* Update nrips enabled cache */
- svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu);
+ svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
if (!kvm_vcpu_apicv_active(vcpu))
return;
- entry = kvm_find_cpuid_entry(vcpu, 1, 0);
- if (entry)
- entry->ecx &= ~bit(X86_FEATURE_X2APIC);
+ guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC);
}
static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0a6cc6754ec5..8a202c49e2a0 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -151,8 +151,8 @@ TRACE_EVENT(kvm_fast_mmio,
*/
TRACE_EVENT(kvm_cpuid,
TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx,
- unsigned long rcx, unsigned long rdx),
- TP_ARGS(function, rax, rbx, rcx, rdx),
+ unsigned long rcx, unsigned long rdx, bool found),
+ TP_ARGS(function, rax, rbx, rcx, rdx, found),
TP_STRUCT__entry(
__field( unsigned int, function )
@@ -160,6 +160,7 @@ TRACE_EVENT(kvm_cpuid,
__field( unsigned long, rbx )
__field( unsigned long, rcx )
__field( unsigned long, rdx )
+ __field( bool, found )
),
TP_fast_assign(
@@ -168,11 +169,13 @@ TRACE_EVENT(kvm_cpuid,
__entry->rbx = rbx;
__entry->rcx = rcx;
__entry->rdx = rdx;
+ __entry->found = found;
),
- TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx",
+ TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx, cpuid entry %s",
__entry->function, __entry->rax,
- __entry->rbx, __entry->rcx, __entry->rdx)
+ __entry->rbx, __entry->rcx, __entry->rdx,
+ __entry->found ? "found" : "not found")
);
#define AREG(x) { APIC_##x, "APIC_" #x }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 70b90c0810d0..699704d4bc9e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -122,7 +122,7 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT | X86_CR4_TSD)
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
@@ -243,11 +243,13 @@ struct __packed vmcs12 {
u64 virtual_apic_page_addr;
u64 apic_access_addr;
u64 posted_intr_desc_addr;
+ u64 vm_function_control;
u64 ept_pointer;
u64 eoi_exit_bitmap0;
u64 eoi_exit_bitmap1;
u64 eoi_exit_bitmap2;
u64 eoi_exit_bitmap3;
+ u64 eptp_list_address;
u64 xss_exit_bitmap;
u64 guest_physical_address;
u64 vmcs_link_pointer;
@@ -481,6 +483,7 @@ struct nested_vmx {
u64 nested_vmx_cr4_fixed0;
u64 nested_vmx_cr4_fixed1;
u64 nested_vmx_vmcs_enum;
+ u64 nested_vmx_vmfunc_controls;
};
#define POSTED_INTR_ON 0
@@ -573,6 +576,8 @@ struct vcpu_vmx {
#endif
u32 vm_entry_controls_shadow;
u32 vm_exit_controls_shadow;
+ u32 secondary_exec_control;
+
/*
* loaded_vmcs points to the VMCS currently used in this vcpu. For a
* non-nested (L1) guest, it always points to vmcs01. For a nested
@@ -761,11 +766,13 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
+ FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
FIELD64(EPT_POINTER, ept_pointer),
FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
+ FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -889,25 +896,6 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
return to_vmx(vcpu)->nested.cached_vmcs12;
}
-static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
-{
- struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT);
- if (is_error_page(page))
- return NULL;
-
- return page;
-}
-
-static void nested_release_page(struct page *page)
-{
- kvm_release_page_dirty(page);
-}
-
-static void nested_release_page_clean(struct page *page)
-{
- kvm_release_page_clean(page);
-}
-
static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
@@ -1212,6 +1200,16 @@ static inline bool cpu_has_vmx_ept_4levels(void)
return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
}
+static inline bool cpu_has_vmx_ept_mt_wb(void)
+{
+ return vmx_capability.ept & VMX_EPTP_WB_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_5levels(void)
+{
+ return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
+}
+
static inline bool cpu_has_vmx_ept_ad_bits(void)
{
return vmx_capability.ept & VMX_EPT_AD_BIT;
@@ -1317,6 +1315,12 @@ static inline bool cpu_has_vmx_tsc_scaling(void)
SECONDARY_EXEC_TSC_SCALING;
}
+static inline bool cpu_has_vmx_vmfunc(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_VMFUNC;
+}
+
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -1357,8 +1361,7 @@ static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
{
- return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) &&
- vmx_xsaves_supported();
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
}
static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
@@ -1391,6 +1394,18 @@ static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
}
+static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
+}
+
+static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has_vmfunc(vmcs12) &&
+ (vmcs12->vm_function_control &
+ VMX_VMFUNC_EPTP_SWITCHING);
+}
+
static inline bool is_nmi(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2450,15 +2465,14 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
* KVM wants to inject page-faults which it got to the guest. This function
* checks whether in a nested guest, we need to inject them to L1 or L2.
*/
-static int nested_vmx_check_exception(struct kvm_vcpu *vcpu)
+static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
unsigned int nr = vcpu->arch.exception.nr;
if (nr == PF_VECTOR) {
if (vcpu->arch.exception.nested_apf) {
- nested_vmx_inject_exception_vmexit(vcpu,
- vcpu->arch.apf.nested_apf_token);
+ *exit_qual = vcpu->arch.apf.nested_apf_token;
return 1;
}
/*
@@ -2472,16 +2486,15 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu)
*/
if (nested_vmx_is_page_fault_vmexit(vmcs12,
vcpu->arch.exception.error_code)) {
- nested_vmx_inject_exception_vmexit(vcpu, vcpu->arch.cr2);
+ *exit_qual = vcpu->arch.cr2;
return 1;
}
} else {
- unsigned long exit_qual = 0;
- if (nr == DB_VECTOR)
- exit_qual = vcpu->arch.dr6;
-
if (vmcs12->exception_bitmap & (1u << nr)) {
- nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+ if (nr == DB_VECTOR)
+ *exit_qual = vcpu->arch.dr6;
+ else
+ *exit_qual = 0;
return 1;
}
}
@@ -2494,14 +2507,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned nr = vcpu->arch.exception.nr;
bool has_error_code = vcpu->arch.exception.has_error_code;
- bool reinject = vcpu->arch.exception.reinject;
u32 error_code = vcpu->arch.exception.error_code;
u32 intr_info = nr | INTR_INFO_VALID_MASK;
- if (!reinject && is_guest_mode(vcpu) &&
- nested_vmx_check_exception(vcpu))
- return;
-
if (has_error_code) {
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -2600,7 +2608,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
index = __find_msr_index(vmx, MSR_TSC_AUX);
- if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu))
+ if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
move_msr_up(vmx, index, save_nmsrs++);
/*
* MSR_STAR is only needed on long mode guests, and only
@@ -2660,12 +2668,6 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
}
}
-static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
- return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
-}
-
/*
* nested_vmx_allowed() checks whether a guest should be allowed to use VMX
* instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
@@ -2674,7 +2676,7 @@ static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
*/
static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
{
- return nested && guest_cpuid_has_vmx(vcpu);
+ return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
}
/*
@@ -2797,21 +2799,21 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_procbased_ctls_low &=
~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
- /* secondary cpu-based controls */
+ /*
+ * secondary cpu-based controls. Do not include those that
+ * depend on CPUID bits, they are added later by vmx_cpuid_update.
+ */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
vmx->nested.nested_vmx_secondary_ctls_low,
vmx->nested.nested_vmx_secondary_ctls_high);
vmx->nested.nested_vmx_secondary_ctls_low = 0;
vmx->nested.nested_vmx_secondary_ctls_high &=
- SECONDARY_EXEC_RDRAND | SECONDARY_EXEC_RDSEED |
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
- SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_DESC |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
- SECONDARY_EXEC_WBINVD_EXITING |
- SECONDARY_EXEC_XSAVES;
+ SECONDARY_EXEC_WBINVD_EXITING;
if (enable_ept) {
/* nested EPT: emulate EPT also to L1 */
@@ -2834,6 +2836,17 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
} else
vmx->nested.nested_vmx_ept_caps = 0;
+ if (cpu_has_vmx_vmfunc()) {
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_VMFUNC;
+ /*
+ * Advertise EPTP switching unconditionally
+ * since we emulate it
+ */
+ vmx->nested.nested_vmx_vmfunc_controls =
+ VMX_VMFUNC_EPTP_SWITCHING;
+ }
+
/*
* Old versions of KVM use the single-context version without
* checking for support, so declare that it is supported even
@@ -3203,6 +3216,9 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
*pdata = vmx->nested.nested_vmx_ept_caps |
((u64)vmx->nested.nested_vmx_vpid_caps << 32);
break;
+ case MSR_IA32_VMX_VMFUNC:
+ *pdata = vmx->nested.nested_vmx_vmfunc_controls;
+ break;
default:
return 1;
}
@@ -3256,7 +3272,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
- (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
+ (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
@@ -3280,7 +3297,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.ia32_xss;
break;
case MSR_TSC_AUX:
- if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated)
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
/* Otherwise falls through */
default:
@@ -3339,9 +3357,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
- (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
+ (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
return 1;
- if (is_noncanonical_address(data & PAGE_MASK) ||
+ if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
(data & MSR_IA32_BNDCFGS_RSVD))
return 1;
vmcs_write64(GUEST_BNDCFGS, data);
@@ -3402,7 +3421,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
break;
case MSR_TSC_AUX:
- if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated)
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
@@ -3639,8 +3659,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_RDSEED |
+ SECONDARY_EXEC_RDRAND |
SECONDARY_EXEC_ENABLE_PML |
- SECONDARY_EXEC_TSC_SCALING;
+ SECONDARY_EXEC_TSC_SCALING |
+ SECONDARY_EXEC_ENABLE_VMFUNC;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -4272,16 +4295,22 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmx->emulation_required = emulation_required(vcpu);
}
+static int get_ept_level(struct kvm_vcpu *vcpu)
+{
+ if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
+ return 5;
+ return 4;
+}
+
static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
{
- u64 eptp;
+ u64 eptp = VMX_EPTP_MT_WB;
+
+ eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
- /* TODO write the value reading from MSR */
- eptp = VMX_EPT_DEFAULT_MT |
- VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
if (enable_ept_ad_bits &&
(!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
- eptp |= VMX_EPT_AD_ENABLE_BIT;
+ eptp |= VMX_EPTP_AD_ENABLE_BIT;
eptp |= (root_hpa & PAGE_MASK);
return eptp;
@@ -5163,7 +5192,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
- native_store_idt(&dt);
+ store_idt(&dt);
vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
vmx->host_idt_base = dt.address;
@@ -5243,10 +5272,24 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
return exec_control;
}
-static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
+static bool vmx_rdrand_supported(void)
{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_RDRAND;
+}
+
+static bool vmx_rdseed_supported(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_RDSEED;
+}
+
+static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
+{
+ struct kvm_vcpu *vcpu = &vmx->vcpu;
+
u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
- if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu))
+ if (!cpu_need_virtualize_apic_accesses(vcpu))
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
if (vmx->vpid == 0)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
@@ -5260,7 +5303,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
if (!ple_gap)
exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
- if (!kvm_vcpu_apicv_active(&vmx->vcpu))
+ if (!kvm_vcpu_apicv_active(vcpu))
exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
@@ -5274,7 +5317,92 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
if (!enable_pml)
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
- return exec_control;
+ if (vmx_xsaves_supported()) {
+ /* Exposing XSAVES only when XSAVE is exposed */
+ bool xsaves_enabled =
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
+
+ if (!xsaves_enabled)
+ exec_control &= ~SECONDARY_EXEC_XSAVES;
+
+ if (nested) {
+ if (xsaves_enabled)
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_XSAVES;
+ else
+ vmx->nested.nested_vmx_secondary_ctls_high &=
+ ~SECONDARY_EXEC_XSAVES;
+ }
+ }
+
+ if (vmx_rdtscp_supported()) {
+ bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
+ if (!rdtscp_enabled)
+ exec_control &= ~SECONDARY_EXEC_RDTSCP;
+
+ if (nested) {
+ if (rdtscp_enabled)
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_RDTSCP;
+ else
+ vmx->nested.nested_vmx_secondary_ctls_high &=
+ ~SECONDARY_EXEC_RDTSCP;
+ }
+ }
+
+ if (vmx_invpcid_supported()) {
+ /* Exposing INVPCID only when PCID is exposed */
+ bool invpcid_enabled =
+ guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_PCID);
+
+ if (!invpcid_enabled) {
+ exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+ guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
+ }
+
+ if (nested) {
+ if (invpcid_enabled)
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_INVPCID;
+ else
+ vmx->nested.nested_vmx_secondary_ctls_high &=
+ ~SECONDARY_EXEC_ENABLE_INVPCID;
+ }
+ }
+
+ if (vmx_rdrand_supported()) {
+ bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
+ if (rdrand_enabled)
+ exec_control &= ~SECONDARY_EXEC_RDRAND;
+
+ if (nested) {
+ if (rdrand_enabled)
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_RDRAND;
+ else
+ vmx->nested.nested_vmx_secondary_ctls_high &=
+ ~SECONDARY_EXEC_RDRAND;
+ }
+ }
+
+ if (vmx_rdseed_supported()) {
+ bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
+ if (rdseed_enabled)
+ exec_control &= ~SECONDARY_EXEC_RDSEED;
+
+ if (nested) {
+ if (rdseed_enabled)
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_RDSEED;
+ else
+ vmx->nested.nested_vmx_secondary_ctls_high &=
+ ~SECONDARY_EXEC_RDSEED;
+ }
+ }
+
+ vmx->secondary_exec_control = exec_control;
}
static void ept_set_mmio_spte_mask(void)
@@ -5318,8 +5446,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
if (cpu_has_secondary_exec_ctrls()) {
+ vmx_compute_secondary_exec_control(vmx);
vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
- vmx_secondary_exec_control(vmx));
+ vmx->secondary_exec_control);
}
if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
@@ -5357,6 +5486,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
#endif
+ if (cpu_has_vmx_vmfunc())
+ vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
@@ -5835,6 +5967,7 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu)
static int handle_triple_fault(struct kvm_vcpu *vcpu)
{
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->mmio_needed = 0;
return 0;
}
@@ -6330,7 +6463,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
gpa_t gpa;
- u32 error_code;
+ u64 error_code;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -6362,9 +6495,10 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
EPT_VIOLATION_EXECUTABLE))
? PFERR_PRESENT_MASK : 0;
- vcpu->arch.gpa_available = true;
- vcpu->arch.exit_qualification = exit_qualification;
+ error_code |= (exit_qualification & 0x100) != 0 ?
+ PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+ vcpu->arch.exit_qualification = exit_qualification;
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
}
@@ -6373,23 +6507,20 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
int ret;
gpa_t gpa;
+ /*
+ * A nested guest cannot optimize MMIO vmexits, because we have an
+ * nGPA here instead of the required GPA.
+ */
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
- if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+ if (!is_guest_mode(vcpu) &&
+ !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
trace_kvm_fast_mmio(gpa);
return kvm_skip_emulated_instruction(vcpu);
}
- ret = handle_mmio_page_fault(vcpu, gpa, true);
- vcpu->arch.gpa_available = true;
- if (likely(ret == RET_MMIO_PF_EMULATE))
- return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
- EMULATE_DONE;
-
- if (unlikely(ret == RET_MMIO_PF_INVALID))
- return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0);
-
- if (unlikely(ret == RET_MMIO_PF_RETRY))
- return 1;
+ ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
+ if (ret >= 0)
+ return ret;
/* It is the real ept misconfig */
WARN_ON(1);
@@ -6611,7 +6742,8 @@ static __init int hardware_setup(void)
init_vmcs_shadow_fields();
if (!cpu_has_vmx_ept() ||
- !cpu_has_vmx_ept_4levels()) {
+ !cpu_has_vmx_ept_4levels() ||
+ !cpu_has_vmx_ept_mt_wb()) {
enable_ept = 0;
enable_unrestricted_guest = 0;
enable_ept_ad_bits = 0;
@@ -6754,7 +6886,13 @@ static int handle_pause(struct kvm_vcpu *vcpu)
if (ple_gap)
grow_ple_window(vcpu);
- kvm_vcpu_on_spin(vcpu);
+ /*
+ * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
+ * VM-execution control is ignored if CPL > 0. OTOH, KVM
+ * never set PAUSE_EXITING and just set PLE if supported,
+ * so the vcpu must be CPL=0 if it gets a PAUSE exit.
+ */
+ kvm_vcpu_on_spin(vcpu, true);
return kvm_skip_emulated_instruction(vcpu);
}
@@ -6769,6 +6907,12 @@ static int handle_mwait(struct kvm_vcpu *vcpu)
return handle_nop(vcpu);
}
+static int handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
static int handle_monitor_trap(struct kvm_vcpu *vcpu)
{
return 1;
@@ -6985,7 +7129,7 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
* non-canonical form. This is the only check on the memory
* destination for long mode!
*/
- exn = is_noncanonical_address(*ret);
+ exn = is_noncanonical_address(*ret, vcpu);
} else if (is_protmode(vcpu)) {
/* Protected mode: apply checks for segment validity in the
* following order:
@@ -7149,19 +7293,19 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
- page = nested_get_page(vcpu, vmptr);
- if (page == NULL) {
+ page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
+ if (is_error_page(page)) {
nested_vmx_failInvalid(vcpu);
return kvm_skip_emulated_instruction(vcpu);
}
if (*(u32 *)kmap(page) != VMCS12_REVISION) {
kunmap(page);
- nested_release_page_clean(page);
+ kvm_release_page_clean(page);
nested_vmx_failInvalid(vcpu);
return kvm_skip_emulated_instruction(vcpu);
}
kunmap(page);
- nested_release_page_clean(page);
+ kvm_release_page_clean(page);
vmx->nested.vmxon_ptr = vmptr;
ret = enter_vmx_operation(vcpu);
@@ -7242,16 +7386,16 @@ static void free_nested(struct vcpu_vmx *vmx)
kfree(vmx->nested.cached_vmcs12);
/* Unpin physical memory we referred to in current vmcs02 */
if (vmx->nested.apic_access_page) {
- nested_release_page(vmx->nested.apic_access_page);
+ kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
if (vmx->nested.virtual_apic_page) {
- nested_release_page(vmx->nested.virtual_apic_page);
+ kvm_release_page_dirty(vmx->nested.virtual_apic_page);
vmx->nested.virtual_apic_page = NULL;
}
if (vmx->nested.pi_desc_page) {
kunmap(vmx->nested.pi_desc_page);
- nested_release_page(vmx->nested.pi_desc_page);
+ kvm_release_page_dirty(vmx->nested.pi_desc_page);
vmx->nested.pi_desc_page = NULL;
vmx->nested.pi_desc = NULL;
}
@@ -7618,15 +7762,15 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
if (vmx->nested.current_vmptr != vmptr) {
struct vmcs12 *new_vmcs12;
struct page *page;
- page = nested_get_page(vcpu, vmptr);
- if (page == NULL) {
+ page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
+ if (is_error_page(page)) {
nested_vmx_failInvalid(vcpu);
return kvm_skip_emulated_instruction(vcpu);
}
new_vmcs12 = kmap(page);
if (new_vmcs12->revision_id != VMCS12_REVISION) {
kunmap(page);
- nested_release_page_clean(page);
+ kvm_release_page_clean(page);
nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
return kvm_skip_emulated_instruction(vcpu);
@@ -7639,7 +7783,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
*/
memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
kunmap(page);
- nested_release_page_clean(page);
+ kvm_release_page_clean(page);
set_current_vmptr(vmx, vmptr);
}
@@ -7790,7 +7934,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
switch (type) {
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
- if (is_noncanonical_address(operand.gla)) {
+ if (is_noncanonical_address(operand.gla, vcpu)) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
return kvm_skip_emulated_instruction(vcpu);
@@ -7847,6 +7991,124 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
return 1;
}
+static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+ /* Check for memory type validity */
+ switch (address & VMX_EPTP_MT_MASK) {
+ case VMX_EPTP_MT_UC:
+ if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT))
+ return false;
+ break;
+ case VMX_EPTP_MT_WB:
+ if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_WB_BIT))
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ /* only 4 levels page-walk length are valid */
+ if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)
+ return false;
+
+ /* Reserved bits should not be set */
+ if (address >> maxphyaddr || ((address >> 7) & 0x1f))
+ return false;
+
+ /* AD, if set, should be supported */
+ if (address & VMX_EPTP_AD_ENABLE_BIT) {
+ if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPT_AD_BIT))
+ return false;
+ }
+
+ return true;
+}
+
+static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
+ u64 address;
+ bool accessed_dirty;
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ if (!nested_cpu_has_eptp_switching(vmcs12) ||
+ !nested_cpu_has_ept(vmcs12))
+ return 1;
+
+ if (index >= VMFUNC_EPTP_ENTRIES)
+ return 1;
+
+
+ if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
+ &address, index * 8, 8))
+ return 1;
+
+ accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
+
+ /*
+ * If the (L2) guest does a vmfunc to the currently
+ * active ept pointer, we don't have to do anything else
+ */
+ if (vmcs12->ept_pointer != address) {
+ if (!valid_ept_address(vcpu, address))
+ return 1;
+
+ kvm_mmu_unload(vcpu);
+ mmu->ept_ad = accessed_dirty;
+ mmu->base_role.ad_disabled = !accessed_dirty;
+ vmcs12->ept_pointer = address;
+ /*
+ * TODO: Check what's the correct approach in case
+ * mmu reload fails. Currently, we just let the next
+ * reload potentially fail
+ */
+ kvm_mmu_reload(vcpu);
+ }
+
+ return 0;
+}
+
+static int handle_vmfunc(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12;
+ u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
+
+ /*
+ * VMFUNC is only supported for nested guests, but we always enable the
+ * secondary control for simplicity; for non-nested mode, fake that we
+ * didn't by injecting #UD.
+ */
+ if (!is_guest_mode(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ vmcs12 = get_vmcs12(vcpu);
+ if ((vmcs12->vm_function_control & (1 << function)) == 0)
+ goto fail;
+
+ switch (function) {
+ case 0:
+ if (nested_vmx_eptp_switching(vcpu, vmcs12))
+ goto fail;
+ break;
+ default:
+ goto fail;
+ }
+ return kvm_skip_emulated_instruction(vcpu);
+
+fail:
+ nested_vmx_vmexit(vcpu, vmx->exit_reason,
+ vmcs_read32(VM_EXIT_INTR_INFO),
+ vmcs_readl(EXIT_QUALIFICATION));
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7894,9 +8156,12 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
[EXIT_REASON_INVEPT] = handle_invept,
[EXIT_REASON_INVVPID] = handle_invvpid,
+ [EXIT_REASON_RDRAND] = handle_invalid_op,
+ [EXIT_REASON_RDSEED] = handle_invalid_op,
[EXIT_REASON_XSAVES] = handle_xsaves,
[EXIT_REASON_XRSTORS] = handle_xrstors,
[EXIT_REASON_PML_FULL] = handle_pml_full,
+ [EXIT_REASON_VMFUNC] = handle_vmfunc,
[EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
};
@@ -8212,6 +8477,10 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
* table is L0's fault.
*/
return false;
+ case EXIT_REASON_INVPCID:
+ return
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
+ nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
case EXIT_REASON_WBINVD:
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
case EXIT_REASON_XSETBV:
@@ -8229,6 +8498,9 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
case EXIT_REASON_PML_FULL:
/* We emulate PML support to L1. */
return false;
+ case EXIT_REASON_VMFUNC:
+ /* VM functions are emulated through L2->L0 vmexits. */
+ return false;
default:
return true;
}
@@ -8487,7 +8759,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 vectoring_info = vmx->idt_vectoring_info;
trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
- vcpu->arch.gpa_available = false;
/*
* Flush logged GPAs PML buffer, this will make dirty_bitmap more
@@ -9341,11 +9612,6 @@ static void __init vmx_check_processor_compat(void *rtn)
}
}
-static int get_ept_level(void)
-{
- return VMX_EPT_DEFAULT_GAW + 1;
-}
-
static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
u8 cache;
@@ -9462,39 +9728,13 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
- struct kvm_cpuid_entry2 *best;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx);
- if (vmx_rdtscp_supported()) {
- bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu);
- if (!rdtscp_enabled)
- secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP;
-
- if (nested) {
- if (rdtscp_enabled)
- vmx->nested.nested_vmx_secondary_ctls_high |=
- SECONDARY_EXEC_RDTSCP;
- else
- vmx->nested.nested_vmx_secondary_ctls_high &=
- ~SECONDARY_EXEC_RDTSCP;
- }
- }
-
- /* Exposing INVPCID only when PCID is exposed */
- best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
- if (vmx_invpcid_supported() &&
- (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) ||
- !guest_cpuid_has_pcid(vcpu))) {
- secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID;
-
- if (best)
- best->ebx &= ~bit(X86_FEATURE_INVPCID);
+ if (cpu_has_secondary_exec_ctrls()) {
+ vmx_compute_secondary_exec_control(vmx);
+ vmcs_set_secondary_exec_control(vmx->secondary_exec_control);
}
- if (cpu_has_secondary_exec_ctrls())
- vmcs_set_secondary_exec_control(secondary_exec_ctl);
-
if (nested_vmx_allowed(vcpu))
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
@@ -9535,7 +9775,7 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
{
- return nested_ept_get_cr3(vcpu) & VMX_EPT_AD_ENABLE_BIT;
+ return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
}
/* Callbacks for nested_ept_init_mmu_context: */
@@ -9548,18 +9788,15 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
- bool wants_ad;
-
WARN_ON(mmu_is_nested(vcpu));
- wants_ad = nested_ept_ad_enabled(vcpu);
- if (wants_ad && !enable_ept_ad_bits)
+ if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
return 1;
kvm_mmu_unload(vcpu);
kvm_init_shadow_ept_mmu(vcpu,
to_vmx(vcpu)->nested.nested_vmx_ept_caps &
VMX_EPT_EXECUTE_ONLY_BIT,
- wants_ad);
+ nested_ept_ad_enabled(vcpu));
vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@ -9610,6 +9847,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct page *page;
u64 hpa;
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
@@ -9619,17 +9857,19 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
* physical address remains valid. We keep a reference
* to it so we can release it later.
*/
- if (vmx->nested.apic_access_page) /* shouldn't happen */
- nested_release_page(vmx->nested.apic_access_page);
- vmx->nested.apic_access_page =
- nested_get_page(vcpu, vmcs12->apic_access_addr);
+ if (vmx->nested.apic_access_page) { /* shouldn't happen */
+ kvm_release_page_dirty(vmx->nested.apic_access_page);
+ vmx->nested.apic_access_page = NULL;
+ }
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
/*
* If translation failed, no matter: This feature asks
* to exit when accessing the given address, and if it
* can never be accessed, this feature won't do
* anything anyway.
*/
- if (vmx->nested.apic_access_page) {
+ if (!is_error_page(page)) {
+ vmx->nested.apic_access_page = page;
hpa = page_to_phys(vmx->nested.apic_access_page);
vmcs_write64(APIC_ACCESS_ADDR, hpa);
} else {
@@ -9644,10 +9884,11 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
}
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
- if (vmx->nested.virtual_apic_page) /* shouldn't happen */
- nested_release_page(vmx->nested.virtual_apic_page);
- vmx->nested.virtual_apic_page =
- nested_get_page(vcpu, vmcs12->virtual_apic_page_addr);
+ if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
+ kvm_release_page_dirty(vmx->nested.virtual_apic_page);
+ vmx->nested.virtual_apic_page = NULL;
+ }
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);
/*
* If translation failed, VM entry will fail because
@@ -9662,7 +9903,8 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
* control. But such a configuration is useless, so
* let's keep the code simple.
*/
- if (vmx->nested.virtual_apic_page) {
+ if (!is_error_page(page)) {
+ vmx->nested.virtual_apic_page = page;
hpa = page_to_phys(vmx->nested.virtual_apic_page);
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
}
@@ -9671,16 +9913,14 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
if (nested_cpu_has_posted_intr(vmcs12)) {
if (vmx->nested.pi_desc_page) { /* shouldn't happen */
kunmap(vmx->nested.pi_desc_page);
- nested_release_page(vmx->nested.pi_desc_page);
+ kvm_release_page_dirty(vmx->nested.pi_desc_page);
+ vmx->nested.pi_desc_page = NULL;
}
- vmx->nested.pi_desc_page =
- nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
- vmx->nested.pi_desc =
- (struct pi_desc *)kmap(vmx->nested.pi_desc_page);
- if (!vmx->nested.pi_desc) {
- nested_release_page_clean(vmx->nested.pi_desc_page);
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
+ if (is_error_page(page))
return;
- }
+ vmx->nested.pi_desc_page = page;
+ vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
vmx->nested.pi_desc =
(struct pi_desc *)((void *)vmx->nested.pi_desc +
(unsigned long)(vmcs12->posted_intr_desc_addr &
@@ -9746,6 +9986,18 @@ static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
return 0;
}
+static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ return 0;
+
+ if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
+ return -EINVAL;
+
+ return 0;
+}
+
/*
* Merge L0's and L1's MSR bitmap, return false to indicate that
* we do not use the hardware.
@@ -9762,8 +10014,8 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
return false;
- page = nested_get_page(vcpu, vmcs12->msr_bitmap);
- if (!page)
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
+ if (is_error_page(page))
return false;
msr_bitmap_l1 = (unsigned long *)kmap(page);
@@ -9793,7 +10045,7 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
}
}
kunmap(page);
- nested_release_page_clean(page);
+ kvm_release_page_clean(page);
return true;
}
@@ -10187,13 +10439,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
enable_ept ? vmcs12->page_fault_error_code_match : 0);
if (cpu_has_secondary_exec_ctrls()) {
- exec_control = vmx_secondary_exec_control(vmx);
+ exec_control = vmx->secondary_exec_control;
/* Take the following fields only from vmcs12 */
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
- SECONDARY_EXEC_APIC_REGISTER_VIRT);
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_ENABLE_VMFUNC);
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
@@ -10201,6 +10456,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
exec_control |= vmcs12_exec_ctrl;
}
+ /* All VMFUNCs are currently emulated through L0 vmexits. */
+ if (exec_control & SECONDARY_EXEC_ENABLE_VMFUNC)
+ vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
vmcs_write64(EOI_EXIT_BITMAP0,
vmcs12->eoi_exit_bitmap0);
@@ -10426,6 +10685,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
@@ -10453,6 +10715,18 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmx->nested.nested_vmx_entry_ctls_high))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ if (nested_cpu_has_vmfunc(vmcs12)) {
+ if (vmcs12->vm_function_control &
+ ~vmx->nested.nested_vmx_vmfunc_controls)
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_cpu_has_eptp_switching(vmcs12)) {
+ if (!nested_cpu_has_ept(vmcs12) ||
+ !page_address_valid(vcpu, vmcs12->eptp_list_address))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ }
+ }
+
if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
@@ -10699,7 +10973,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
u32 idt_vectoring;
unsigned int nr;
- if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) {
+ if (vcpu->arch.exception.injected) {
nr = vcpu->arch.exception.nr;
idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
@@ -10738,12 +11012,20 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long exit_qual;
- if (vcpu->arch.exception.pending ||
- vcpu->arch.nmi_injected ||
- vcpu->arch.interrupt.pending)
+ if (kvm_event_needs_reinjection(vcpu))
return -EBUSY;
+ if (vcpu->arch.exception.pending &&
+ nested_vmx_check_exception(vcpu, &exit_qual)) {
+ if (vmx->nested.nested_run_pending)
+ return -EBUSY;
+ nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+ vcpu->arch.exception.pending = false;
+ return 0;
+ }
+
if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
vmx->nested.preemption_timer_expired) {
if (vmx->nested.nested_run_pending)
@@ -11184,16 +11466,16 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
/* Unpin physical memory we referred to in vmcs02 */
if (vmx->nested.apic_access_page) {
- nested_release_page(vmx->nested.apic_access_page);
+ kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
if (vmx->nested.virtual_apic_page) {
- nested_release_page(vmx->nested.virtual_apic_page);
+ kvm_release_page_dirty(vmx->nested.virtual_apic_page);
vmx->nested.virtual_apic_page = NULL;
}
if (vmx->nested.pi_desc_page) {
kunmap(vmx->nested.pi_desc_page);
- nested_release_page(vmx->nested.pi_desc_page);
+ kvm_release_page_dirty(vmx->nested.pi_desc_page);
vmx->nested.pi_desc_page = NULL;
vmx->nested.pi_desc = NULL;
}
@@ -11369,14 +11651,14 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
- page = nested_get_page(vcpu, vmcs12->pml_address);
- if (!page)
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
+ if (is_error_page(page))
return 0;
pml_address = kmap(page);
pml_address[vmcs12->guest_pml_index--] = gpa;
kunmap(page);
- nested_release_page_clean(page);
+ kvm_release_page_clean(page);
}
return 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ef5102f80497..6069af86da3b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -311,13 +311,13 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
(MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
u64 new_state = msr_info->data &
(MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
- u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) |
- 0x2ff | (guest_cpuid_has_x2apic(vcpu) ? 0 : X2APIC_ENABLE);
+ u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
+ (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
+ if ((msr_info->data & reserved_bits) || new_state == X2APIC_ENABLE)
+ return 1;
if (!msr_info->host_initiated &&
- ((msr_info->data & reserved_bits) != 0 ||
- new_state == X2APIC_ENABLE ||
- (new_state == MSR_IA32_APICBASE_ENABLE &&
+ ((new_state == MSR_IA32_APICBASE_ENABLE &&
old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) ||
(new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) &&
old_state == 0)))
@@ -390,15 +390,28 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
kvm_make_request(KVM_REQ_EVENT, vcpu);
- if (!vcpu->arch.exception.pending) {
+ if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
queue:
if (has_error && !is_protmode(vcpu))
has_error = false;
- vcpu->arch.exception.pending = true;
+ if (reinject) {
+ /*
+ * On vmentry, vcpu->arch.exception.pending is only
+ * true if an event injection was blocked by
+ * nested_run_pending. In that case, however,
+ * vcpu_enter_guest requests an immediate exit,
+ * and the guest shouldn't proceed far enough to
+ * need reinjection.
+ */
+ WARN_ON_ONCE(vcpu->arch.exception.pending);
+ vcpu->arch.exception.injected = true;
+ } else {
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.injected = false;
+ }
vcpu->arch.exception.has_error_code = has_error;
vcpu->arch.exception.nr = nr;
vcpu->arch.exception.error_code = error_code;
- vcpu->arch.exception.reinject = reinject;
return;
}
@@ -413,8 +426,13 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
class2 = exception_class(nr);
if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
- /* generate double fault per SDM Table 5-5 */
+ /*
+ * Generate double fault per SDM Table 5-5. Set
+ * exception.pending = true so that the double fault
+ * can trigger a nested vmexit.
+ */
vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.injected = false;
vcpu->arch.exception.has_error_code = true;
vcpu->arch.exception.nr = DF_VECTOR;
vcpu->arch.exception.error_code = 0;
@@ -755,19 +773,22 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (cr4 & CR4_RESERVED_BITS)
return 1;
- if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
+ return 1;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
return 1;
- if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
return 1;
- if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP))
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
return 1;
- if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
return 1;
- if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE))
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
return 1;
if (is_long_mode(vcpu)) {
@@ -780,7 +801,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
- if (!guest_cpuid_has_pcid(vcpu))
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
return 1;
/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
@@ -814,10 +835,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
return 0;
}
- if (is_long_mode(vcpu)) {
- if (cr3 & CR3_L_MODE_RESERVED_BITS)
- return 1;
- } else if (is_pae(vcpu) && is_paging(vcpu) &&
+ if (is_long_mode(vcpu) &&
+ (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 62)))
+ return 1;
+ else if (is_pae(vcpu) && is_paging(vcpu) &&
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
return 1;
@@ -884,7 +905,7 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
{
u64 fixed = DR6_FIXED_1;
- if (!guest_cpuid_has_rtm(vcpu))
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
fixed |= DR6_RTM;
return fixed;
}
@@ -994,6 +1015,7 @@ static u32 emulated_msrs[] = {
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
+ HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
HV_X64_MSR_RESET,
@@ -1022,21 +1044,11 @@ bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
if (efer & efer_reserved_bits)
return false;
- if (efer & EFER_FFXSR) {
- struct kvm_cpuid_entry2 *feat;
-
- feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
+ if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
return false;
- }
- if (efer & EFER_SVME) {
- struct kvm_cpuid_entry2 *feat;
-
- feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
+ if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
return false;
- }
return true;
}
@@ -1084,7 +1096,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_KERNEL_GS_BASE:
case MSR_CSTAR:
case MSR_LSTAR:
- if (is_noncanonical_address(msr->data))
+ if (is_noncanonical_address(msr->data, vcpu))
return 1;
break;
case MSR_IA32_SYSENTER_EIP:
@@ -1101,7 +1113,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
* value, and that something deterministic happens if the guest
* invokes 64-bit SYSENTER.
*/
- msr->data = get_canonical(msr->data);
+ msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
}
return kvm_x86_ops->set_msr(vcpu, msr);
}
@@ -1534,8 +1546,9 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
- if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
+ if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
update_ia32_tsc_adjust_msr(vcpu, offset);
+
kvm_vcpu_write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
@@ -2185,7 +2198,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
kvm_set_lapic_tscdeadline_msr(vcpu, data);
break;
case MSR_IA32_TSC_ADJUST:
- if (guest_cpuid_has_tsc_adjust(vcpu)) {
+ if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
if (!msr_info->host_initiated) {
s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
adjust_tsc_offset_guest(vcpu, adj);
@@ -2307,12 +2320,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data);
break;
case MSR_AMD64_OSVW_ID_LENGTH:
- if (!guest_cpuid_has_osvw(vcpu))
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
return 1;
vcpu->arch.osvw.length = data;
break;
case MSR_AMD64_OSVW_STATUS:
- if (!guest_cpuid_has_osvw(vcpu))
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
return 1;
vcpu->arch.osvw.status = data;
break;
@@ -2537,12 +2550,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = 0xbe702111;
break;
case MSR_AMD64_OSVW_ID_LENGTH:
- if (!guest_cpuid_has_osvw(vcpu))
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
return 1;
msr_info->data = vcpu->arch.osvw.length;
break;
case MSR_AMD64_OSVW_STATUS:
- if (!guest_cpuid_has_osvw(vcpu))
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
return 1;
msr_info->data = vcpu->arch.osvw.status;
break;
@@ -2882,6 +2895,10 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
int idx;
+
+ if (vcpu->preempted)
+ vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
+
/*
* Disable page faults because we're in atomic context here.
* kvm_write_guest_offset_cached() would call might_fault()
@@ -3074,8 +3091,14 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
process_nmi(vcpu);
+ /*
+ * FIXME: pass injected and pending separately. This is only
+ * needed for nested virtualization, whose state cannot be
+ * migrated yet. For now we can combine them.
+ */
events->exception.injected =
- vcpu->arch.exception.pending &&
+ (vcpu->arch.exception.pending ||
+ vcpu->arch.exception.injected) &&
!kvm_exception_is_soft(vcpu->arch.exception.nr);
events->exception.nr = vcpu->arch.exception.nr;
events->exception.has_error_code = vcpu->arch.exception.has_error_code;
@@ -3130,6 +3153,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
return -EINVAL;
process_nmi(vcpu);
+ vcpu->arch.exception.injected = false;
vcpu->arch.exception.pending = events->exception.injected;
vcpu->arch.exception.nr = events->exception.nr;
vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -4671,25 +4695,18 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
*/
if (vcpu->arch.gpa_available &&
emulator_can_use_gpa(ctxt) &&
- vcpu_is_mmio_gpa(vcpu, addr, exception->address, write) &&
- (addr & ~PAGE_MASK) == (exception->address & ~PAGE_MASK)) {
- gpa = exception->address;
- goto mmio;
+ (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
+ gpa = vcpu->arch.gpa_val;
+ ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
+ } else {
+ ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
+ if (ret < 0)
+ return X86EMUL_PROPAGATE_FAULT;
}
- ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
-
- if (ret < 0)
- return X86EMUL_PROPAGATE_FAULT;
-
- /* For APIC access vmexit */
- if (ret)
- goto mmio;
-
- if (ops->read_write_emulate(vcpu, gpa, val, bytes))
+ if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
return X86EMUL_CONTINUE;
-mmio:
/*
* Is this MMIO handled locally?
*/
@@ -5227,10 +5244,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
}
-static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
+static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
+ u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
{
- kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
+ return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
}
static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
@@ -6362,11 +6379,42 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
int r;
/* try to reinject previous events if any */
+ if (vcpu->arch.exception.injected) {
+ kvm_x86_ops->queue_exception(vcpu);
+ return 0;
+ }
+
+ /*
+ * Exceptions must be injected immediately, or the exception
+ * frame will have the address of the NMI or interrupt handler.
+ */
+ if (!vcpu->arch.exception.pending) {
+ if (vcpu->arch.nmi_injected) {
+ kvm_x86_ops->set_nmi(vcpu);
+ return 0;
+ }
+
+ if (vcpu->arch.interrupt.pending) {
+ kvm_x86_ops->set_irq(vcpu);
+ return 0;
+ }
+ }
+
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+ r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+ if (r != 0)
+ return r;
+ }
+
+ /* try to inject new event if pending */
if (vcpu->arch.exception.pending) {
trace_kvm_inj_exception(vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code);
+ vcpu->arch.exception.pending = false;
+ vcpu->arch.exception.injected = true;
+
if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
X86_EFLAGS_RF);
@@ -6378,27 +6426,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
}
kvm_x86_ops->queue_exception(vcpu);
- return 0;
- }
-
- if (vcpu->arch.nmi_injected) {
- kvm_x86_ops->set_nmi(vcpu);
- return 0;
- }
-
- if (vcpu->arch.interrupt.pending) {
- kvm_x86_ops->set_irq(vcpu);
- return 0;
- }
-
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
- r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
- if (r != 0)
- return r;
- }
-
- /* try to inject new event if pending */
- if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+ } else if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
vcpu->arch.smi_pending = false;
enter_smm(vcpu);
} else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
@@ -6615,7 +6643,7 @@ static void enter_smm(struct kvm_vcpu *vcpu)
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
vcpu->arch.hflags |= HF_SMM_MASK;
memset(buf, 0, 512);
- if (guest_cpuid_has_longmode(vcpu))
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
enter_smm_save_state_64(vcpu, buf);
else
enter_smm_save_state_32(vcpu, buf);
@@ -6667,7 +6695,7 @@ static void enter_smm(struct kvm_vcpu *vcpu)
kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
- if (guest_cpuid_has_longmode(vcpu))
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
kvm_x86_ops->set_efer(vcpu, 0);
kvm_update_cpuid(vcpu);
@@ -6774,6 +6802,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->mmio_needed = 0;
r = 0;
goto out;
}
@@ -6862,6 +6891,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_x86_ops->enable_nmi_window(vcpu);
if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
kvm_x86_ops->enable_irq_window(vcpu);
+ WARN_ON(vcpu->arch.exception.pending);
}
if (kvm_lapic_enabled(vcpu)) {
@@ -7004,6 +7034,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (vcpu->arch.apic_attention)
kvm_lapic_sync_from_vapic(vcpu);
+ vcpu->arch.gpa_available = false;
r = kvm_x86_ops->handle_exit(vcpu);
return r;
@@ -7422,7 +7453,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
int pending_vec, max_bits, idx;
struct desc_ptr dt;
- if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE))
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+ (sregs->cr4 & X86_CR4_OSXSAVE))
+ return -EINVAL;
+
+ apic_base_msr.data = sregs->apic_base;
+ apic_base_msr.host_initiated = true;
+ if (kvm_set_apic_base(vcpu, &apic_base_msr))
return -EINVAL;
dt.size = sregs->idt.limit;
@@ -7441,9 +7478,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
kvm_x86_ops->set_efer(vcpu, sregs->efer);
- apic_base_msr.data = sregs->apic_base;
- apic_base_msr.host_initiated = true;
- kvm_set_apic_base(vcpu, &apic_base_msr);
mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
@@ -7734,6 +7768,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.nmi_injected = false;
kvm_clear_interrupt_queue(vcpu);
kvm_clear_exception_queue(vcpu);
+ vcpu->arch.exception.pending = false;
memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
kvm_update_dr0123(vcpu);
@@ -7993,6 +8028,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
kvm_pmu_init(vcpu);
vcpu->arch.pending_external_vector = -1;
+ vcpu->arch.preempted_in_kernel = false;
kvm_hv_vcpu_init(vcpu);
@@ -8440,6 +8476,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
}
+bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.preempted_in_kernel;
+}
+
int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
{
return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 612067074905..51e349cf5f45 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -11,7 +11,7 @@
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
- vcpu->arch.exception.pending = false;
+ vcpu->arch.exception.injected = false;
}
static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
@@ -29,7 +29,7 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending ||
+ return vcpu->arch.exception.injected || vcpu->arch.interrupt.pending ||
vcpu->arch.nmi_injected;
}
@@ -62,6 +62,16 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
return cs_l;
}
+static inline bool is_la57_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return (vcpu->arch.efer & EFER_LMA) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
+#else
+ return 0;
+#endif
+}
+
static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
{
return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
@@ -87,10 +97,48 @@ static inline u32 bit(int bitno)
return 1 << (bitno & 31);
}
+static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
+}
+
+static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt)
+{
+ return (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_LA57) ? 57 : 48;
+}
+
+static inline u64 get_canonical(u64 la, u8 vaddr_bits)
+{
+ return ((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits);
+}
+
+static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la;
+#else
+ return false;
+#endif
+}
+
+static inline bool emul_is_noncanonical_address(u64 la,
+ struct x86_emulate_ctxt *ctxt)
+{
+#ifdef CONFIG_X86_64
+ return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la;
+#else
+ return false;
+#endif
+}
+
static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
gva_t gva, gfn_t gfn, unsigned access)
{
- vcpu->arch.mmio_gva = gva & PAGE_MASK;
+ /*
+ * If this is a shadow nested page table, the "GVA" is
+ * actually a nGPA.
+ */
+ vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK;
vcpu->arch.access = access;
vcpu->arch.mmio_gfn = gfn;
vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 7777ccc0e9f9..af5c1ed21d43 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -19,6 +19,7 @@
#include <asm/microcode.h>
#include <asm/kaslr.h>
#include <asm/hypervisor.h>
+#include <asm/cpufeature.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -193,6 +194,38 @@ static void __init probe_page_size_mask(void)
}
}
+static void setup_pcid(void)
+{
+#ifdef CONFIG_X86_64
+ if (boot_cpu_has(X86_FEATURE_PCID)) {
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ /*
+ * This can't be cr4_set_bits_and_update_boot() --
+ * the trampoline code can't handle CR4.PCIDE and
+ * it wouldn't do any good anyway. Despite the name,
+ * cr4_set_bits_and_update_boot() doesn't actually
+ * cause the bits in question to remain set all the
+ * way through the secondary boot asm.
+ *
+ * Instead, we brute-force it and set CR4.PCIDE
+ * manually in start_secondary().
+ */
+ cr4_set_bits(X86_CR4_PCIDE);
+ } else {
+ /*
+ * flush_tlb_all(), as currently implemented, won't
+ * work if PCID is on but PGE is not. Since that
+ * combination doesn't exist on real hardware, there's
+ * no reason to try to fully support it, but it's
+ * polite to avoid corrupting data if we're on
+ * an improperly configured VM.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ }
+ }
+#endif
+}
+
#ifdef CONFIG_X86_32
#define NR_RANGE_MR 3
#else /* CONFIG_X86_64 */
@@ -592,6 +625,7 @@ void __init init_mem_mapping(void)
unsigned long end;
probe_page_size_mask();
+ setup_pcid();
#ifdef CONFIG_X86_64
end = max_pfn << PAGE_SHIFT;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 136422d7d539..048fbe8fc274 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -761,7 +761,7 @@ void __init paging_init(void)
* After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
* updating.
*/
-static void update_end_of_memory_vars(u64 start, u64 size)
+static void update_end_of_memory_vars(u64 start, u64 size)
{
unsigned long end_pfn = PFN_UP(start + size);
@@ -772,22 +772,30 @@ static void update_end_of_memory_vars(u64 start, u64 size)
}
}
-int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
+int add_pages(int nid, unsigned long start_pfn,
+ unsigned long nr_pages, bool want_memblock)
{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
- init_memory_mapping(start, start + size);
-
ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
WARN_ON_ONCE(ret);
/* update max_pfn, max_low_pfn and high_memory */
- update_end_of_memory_vars(start, size);
+ update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
+ nr_pages << PAGE_SHIFT);
return ret;
}
+
+int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ init_memory_mapping(start, start + size);
+
+ return add_pages(nid, start_pfn, nr_pages, want_memblock);
+}
EXPORT_SYMBOL_GPL(arch_add_memory);
#define PAGE_INUSE 0xFD
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 0fbd09269757..3fcc8e01683b 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -37,7 +37,7 @@ static char sme_cmdline_off[] __initdata = "off";
* reside in the .data section so as not to be zeroed out when the .bss
* section is later cleared.
*/
-unsigned long sme_me_mask __section(.data) = 0;
+u64 sme_me_mask __section(.data) = 0;
EXPORT_SYMBOL_GPL(sme_me_mask);
/* Buffer used for early in-place encryption by BSP, no locking needed */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 218834a3e9ad..b372f3442bbf 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -426,10 +426,8 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
{
int changed = !pte_same(*ptep, entry);
- if (changed && dirty) {
+ if (changed && dirty)
*ptep = entry;
- pte_update(vma->vm_mm, address, ptep);
- }
return changed;
}
@@ -486,9 +484,6 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
(unsigned long *) &ptep->pte);
- if (ret)
- pte_update(vma->vm_mm, addr, ptep);
-
return ret;
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index dbbcfd59726a..1ab3821f9e26 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -121,8 +121,28 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* hypothetical buggy code that directly switches to swapper_pg_dir
* without going through leave_mm() / switch_mm_irqs_off() or that
* does something like write_cr3(read_cr3_pa()).
+ *
+ * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
+ * isn't free.
*/
- VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid));
+#ifdef CONFIG_DEBUG_VM
+ if (WARN_ON_ONCE(__read_cr3() !=
+ (__sme_pa(real_prev->pgd) | prev_asid))) {
+ /*
+ * If we were to BUG here, we'd be very likely to kill
+ * the system so hard that we don't see the call trace.
+ * Try to recover instead by ignoring the error and doing
+ * a global flush to minimize the chance of corruption.
+ *
+ * (This is far from being a fully correct recovery.
+ * Architecturally, the CPU could prefetch something
+ * back into an incorrect ASID slot and leave it there
+ * to cause trouble down the road. It's better than
+ * nothing, though.)
+ */
+ __flush_tlb_all();
+ }
+#endif
if (real_prev == next) {
VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
@@ -241,7 +261,7 @@ void initialize_tlbstate_and_flush(void)
* doesn't work like other CR4 bits because it can only be set from
* long mode.)
*/
- WARN_ON(boot_cpu_has(X86_CR4_PCIDE) &&
+ WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
!(cr4_read_shadow() & X86_CR4_PCIDE));
/* Force ASID 0 and force a TLB flush. */
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 11e407489db0..f2228b150faa 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -618,3 +618,20 @@ static void quirk_apple_mbp_poweroff(struct pci_dev *pdev)
dev_info(dev, "can't work around MacBook Pro poweroff issue\n");
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, quirk_apple_mbp_poweroff);
+
+/*
+ * VMD-enabled root ports will change the source ID for all messages
+ * to the VMD device. Rather than doing device matching with the source
+ * ID, the AER driver should traverse the child device tree, reading
+ * AER registers to find the faulting device.
+ */
+static void quirk_no_aersid(struct pci_dev *pdev)
+{
+ /* VMD Domain */
+ if (is_vmd(pdev->bus))
+ pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid);
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 9bd115484745..0f5f60b14f48 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -1092,7 +1092,7 @@ static int __init fix_acer_tm360_irqrouting(const struct dmi_system_id *d)
return 0;
}
-static struct dmi_system_id __initdata pciirq_dmi_table[] = {
+static const struct dmi_system_id pciirq_dmi_table[] __initconst = {
{
.callback = fix_broken_hp_bios_irq9,
.ident = "HP Pavilion N5400 Series Laptop",
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
index b1526b95fd43..2905376559f1 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
@@ -11,7 +11,7 @@
*/
#include <linux/gpio.h>
-#include <linux/i2c/tc35876x.h>
+#include <linux/platform_data/tc35876x.h>
#include <asm/intel-mid.h>
/*tc35876x DSI_LVDS bridge chip and panel platform data*/
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 4d68d59f457d..84fcfde53f8f 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -428,7 +428,7 @@ static int msr_initialize_bdw(const struct dmi_system_id *d)
return msr_init_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id));
}
-static struct dmi_system_id msr_save_dmi_table[] = {
+static const struct dmi_system_id msr_save_dmi_table[] = {
{
.callback = msr_initialize_bdw,
.ident = "BROADWELL BDX_EP",
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index f2598d81cd55..f910c514438f 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -295,7 +295,26 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size)
return -EOVERFLOW;
rdr->jump_address = (unsigned long)restore_registers;
rdr->jump_address_phys = __pa_symbol(restore_registers);
- rdr->cr3 = restore_cr3;
+
+ /*
+ * The restore code fixes up CR3 and CR4 in the following sequence:
+ *
+ * [in hibernation asm]
+ * 1. CR3 <= temporary page tables
+ * 2. CR4 <= mmu_cr4_features (from the kernel that restores us)
+ * 3. CR3 <= rdr->cr3
+ * 4. CR4 <= mmu_cr4_features (from us, i.e. the image kernel)
+ * [in restore_processor_state()]
+ * 5. CR4 <= saved CR4
+ * 6. CR3 <= saved CR3
+ *
+ * Our mmu_cr4_features has CR4.PCIDE=0, and toggling
+ * CR4.PCIDE while CR3's PCID bits are nonzero is illegal, so
+ * rdr->cr3 needs to point to valid page tables but must not
+ * have any of the PCID bits set.
+ */
+ rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK;
+
rdr->magic = RESTORE_MAGIC;
hibernation_e820_save(rdr->e820_digest);
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index ae2a2e2d6362..69b9deff7e5c 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1038,7 +1038,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_cr0 = xen_read_cr0,
.write_cr0 = xen_write_cr0,
- .read_cr4 = native_read_cr4,
.write_cr4 = xen_write_cr4,
#ifdef CONFIG_X86_64
@@ -1073,7 +1072,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.alloc_ldt = xen_alloc_ldt,
.free_ldt = xen_free_ldt,
- .store_idt = native_store_idt,
.store_tr = xen_store_tr,
.write_ldt_entry = xen_write_ldt_entry,
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3be06f3caf3c..3e15345abfe7 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -84,7 +84,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
else
rmd->mfn++;
- rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
+ rmd->mmu_update->ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
rmd->mmu_update->val = pte_val_ma(pte);
rmd->mmu_update++;
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index e437714750f8..509f560bd0c6 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -162,26 +162,6 @@ static bool xen_page_pinned(void *ptr)
return PagePinned(page);
}
-void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
-{
- struct multicall_space mcs;
- struct mmu_update *u;
-
- trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
-
- mcs = xen_mc_entry(sizeof(*u));
- u = mcs.args;
-
- /* ptep might be kmapped when using 32-bit HIGHPTE */
- u->ptr = virt_to_machine(ptep).maddr;
- u->val = pte_val_ma(pteval);
-
- MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
-
- xen_mc_issue(PARAVIRT_LAZY_MMU);
-}
-EXPORT_SYMBOL_GPL(xen_set_domain_pte);
-
static void xen_extend_mmu_update(const struct mmu_update *update)
{
struct multicall_space mcs;
@@ -2429,8 +2409,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.flush_tlb_single = xen_flush_tlb_single,
.flush_tlb_others = xen_flush_tlb_others,
- .pte_update = paravirt_nop,
-
.pgd_alloc = xen_pgd_alloc,
.pgd_free = xen_pgd_free,
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 276da636dd39..6083ba462f35 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -212,8 +212,7 @@ void __ref xen_build_mfn_list_list(void)
unsigned int level, topidx, mididx;
unsigned long *mid_mfn_p;
- if (xen_feature(XENFEAT_auto_translated_physmap) ||
- xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
+ if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
return;
/* Pre-initialize p2m_top_mfn to be completely missing */
@@ -269,9 +268,6 @@ void __ref xen_build_mfn_list_list(void)
void xen_setup_mfn_list_list(void)
{
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return;
-
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
@@ -291,9 +287,6 @@ void __init xen_build_dynamic_phys_to_machine(void)
{
unsigned long pfn;
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return;
-
xen_p2m_addr = (unsigned long *)xen_start_info->mfn_list;
xen_p2m_size = ALIGN(xen_start_info->nr_pages, P2M_PER_PAGE);
@@ -540,9 +533,6 @@ int xen_alloc_p2m_entry(unsigned long pfn)
unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
unsigned long p2m_pfn;
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return 0;
-
ptep = lookup_address(addr, &level);
BUG_ON(!ptep || level != PG_LEVEL_4K);
pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
@@ -640,9 +630,6 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
if (unlikely(pfn_s >= xen_p2m_size))
return 0;
- if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
- return pfn_e - pfn_s;
-
if (pfn_s > pfn_e)
return 0;
@@ -660,10 +647,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
pte_t *ptep;
unsigned int level;
- /* don't track P2M changes in autotranslate guests */
- if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
- return true;
-
if (unlikely(pfn >= xen_p2m_size)) {
BUG_ON(mfn != INVALID_P2M_ENTRY);
return true;
@@ -711,9 +694,6 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
int i, ret = 0;
pte_t *pte;
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return 0;
-
if (kmap_ops) {
ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
kmap_ops, count);
@@ -756,9 +736,6 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
{
int i, ret = 0;
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return 0;
-
for (i = 0; i < count; i++) {
unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i]));
unsigned long pfn = page_to_pfn(pages[i]);
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index c81046323ebc..ac55c02f98e9 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -340,8 +340,6 @@ static void __init xen_do_set_identity_and_remap_chunk(
WARN_ON(size == 0);
- BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
-
mfn_save = virt_to_mfn(buf);
for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
@@ -1024,8 +1022,7 @@ void __init xen_pvmmu_arch_setup(void)
void __init xen_arch_setup(void)
{
xen_panic_handler_init();
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- xen_pvmmu_arch_setup();
+ xen_pvmmu_arch_setup();
#ifdef CONFIG_ACPI
if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
OpenPOWER on IntegriCloud