summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/Makefile4
-rw-r--r--arch/x86/events/intel/uncore.h33
-rw-r--r--arch/x86/events/intel/uncore_snb.c121
-rw-r--r--arch/x86/include/asm/kvm_host.h3
-rw-r--r--arch/x86/include/asm/mce.h2
-rw-r--r--arch/x86/include/asm/mshyperv.h2
-rw-r--r--arch/x86/include/asm/page_64_types.h12
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h4
-rw-r--r--arch/x86/include/asm/qspinlock.h13
-rw-r--r--arch/x86/include/asm/xen/page.h35
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c6
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c11
-rw-r--r--arch/x86/kernel/cpu/vmware.c2
-rw-r--r--arch/x86/kernel/ldt.c59
-rw-r--r--arch/x86/kernel/vsmp_64.c84
-rw-r--r--arch/x86/kvm/lapic.c7
-rw-r--r--arch/x86/kvm/mmu.c27
-rw-r--r--arch/x86/kvm/svm.c44
-rw-r--r--arch/x86/kvm/vmx.c98
-rw-r--r--arch/x86/kvm/x86.c10
-rw-r--r--arch/x86/xen/mmu_pv.c6
-rw-r--r--arch/x86/xen/p2m.c3
-rw-r--r--arch/x86/xen/spinlock.c14
24 files changed, 385 insertions, 216 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ba7e3464ee92..9d734f3c8234 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -525,7 +525,6 @@ config X86_VSMP
bool "ScaleMP vSMP"
select HYPERVISOR_GUEST
select PARAVIRT
- select PARAVIRT_XXL
depends on X86_64 && PCI
depends on X86_EXTENDED_PLATFORM
depends on SMP
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5b562e464009..88398fdf8129 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -213,8 +213,6 @@ ifdef CONFIG_X86_64
KBUILD_LDFLAGS += $(call ld-option, -z max-page-size=0x200000)
endif
-# Speed up the build
-KBUILD_CFLAGS += -pipe
# Workaround for a gcc prelease that unfortunately was shipped in a suse release
KBUILD_CFLAGS += -Wno-sign-compare
#
@@ -239,7 +237,7 @@ archheaders:
archmacros:
$(Q)$(MAKE) $(build)=arch/x86/kernel arch/x86/kernel/macros.s
-ASM_MACRO_FLAGS = -Wa,arch/x86/kernel/macros.s -Wa,-
+ASM_MACRO_FLAGS = -Wa,arch/x86/kernel/macros.s
export ASM_MACRO_FLAGS
KBUILD_CFLAGS += $(ASM_MACRO_FLAGS)
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index e17ab885b1e9..cb46d602a6b8 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -129,8 +129,15 @@ struct intel_uncore_box {
struct intel_uncore_extra_reg shared_regs[0];
};
-#define UNCORE_BOX_FLAG_INITIATED 0
-#define UNCORE_BOX_FLAG_CTL_OFFS8 1 /* event config registers are 8-byte apart */
+/* CFL uncore 8th cbox MSRs */
+#define CFL_UNC_CBO_7_PERFEVTSEL0 0xf70
+#define CFL_UNC_CBO_7_PER_CTR0 0xf76
+
+#define UNCORE_BOX_FLAG_INITIATED 0
+/* event config registers are 8-byte apart */
+#define UNCORE_BOX_FLAG_CTL_OFFS8 1
+/* CFL 8th CBOX has different MSR space */
+#define UNCORE_BOX_FLAG_CFL8_CBOX_MSR_OFFS 2
struct uncore_event_desc {
struct kobj_attribute attr;
@@ -297,17 +304,27 @@ unsigned int uncore_freerunning_counter(struct intel_uncore_box *box,
static inline
unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
{
- return box->pmu->type->event_ctl +
- (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
- uncore_msr_box_offset(box);
+ if (test_bit(UNCORE_BOX_FLAG_CFL8_CBOX_MSR_OFFS, &box->flags)) {
+ return CFL_UNC_CBO_7_PERFEVTSEL0 +
+ (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx);
+ } else {
+ return box->pmu->type->event_ctl +
+ (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
+ uncore_msr_box_offset(box);
+ }
}
static inline
unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
{
- return box->pmu->type->perf_ctr +
- (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
- uncore_msr_box_offset(box);
+ if (test_bit(UNCORE_BOX_FLAG_CFL8_CBOX_MSR_OFFS, &box->flags)) {
+ return CFL_UNC_CBO_7_PER_CTR0 +
+ (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx);
+ } else {
+ return box->pmu->type->perf_ctr +
+ (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
+ uncore_msr_box_offset(box);
+ }
}
static inline
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 8527c3e1038b..2593b0d7aeee 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -15,6 +15,25 @@
#define PCI_DEVICE_ID_INTEL_SKL_HQ_IMC 0x1910
#define PCI_DEVICE_ID_INTEL_SKL_SD_IMC 0x190f
#define PCI_DEVICE_ID_INTEL_SKL_SQ_IMC 0x191f
+#define PCI_DEVICE_ID_INTEL_KBL_Y_IMC 0x590c
+#define PCI_DEVICE_ID_INTEL_KBL_U_IMC 0x5904
+#define PCI_DEVICE_ID_INTEL_KBL_UQ_IMC 0x5914
+#define PCI_DEVICE_ID_INTEL_KBL_SD_IMC 0x590f
+#define PCI_DEVICE_ID_INTEL_KBL_SQ_IMC 0x591f
+#define PCI_DEVICE_ID_INTEL_CFL_2U_IMC 0x3ecc
+#define PCI_DEVICE_ID_INTEL_CFL_4U_IMC 0x3ed0
+#define PCI_DEVICE_ID_INTEL_CFL_4H_IMC 0x3e10
+#define PCI_DEVICE_ID_INTEL_CFL_6H_IMC 0x3ec4
+#define PCI_DEVICE_ID_INTEL_CFL_2S_D_IMC 0x3e0f
+#define PCI_DEVICE_ID_INTEL_CFL_4S_D_IMC 0x3e1f
+#define PCI_DEVICE_ID_INTEL_CFL_6S_D_IMC 0x3ec2
+#define PCI_DEVICE_ID_INTEL_CFL_8S_D_IMC 0x3e30
+#define PCI_DEVICE_ID_INTEL_CFL_4S_W_IMC 0x3e18
+#define PCI_DEVICE_ID_INTEL_CFL_6S_W_IMC 0x3ec6
+#define PCI_DEVICE_ID_INTEL_CFL_8S_W_IMC 0x3e31
+#define PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC 0x3e33
+#define PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC 0x3eca
+#define PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC 0x3e32
/* SNB event control */
#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
@@ -202,6 +221,10 @@ static void skl_uncore_msr_init_box(struct intel_uncore_box *box)
wrmsrl(SKL_UNC_PERF_GLOBAL_CTL,
SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL);
}
+
+ /* The 8th CBOX has different MSR space */
+ if (box->pmu->pmu_idx == 7)
+ __set_bit(UNCORE_BOX_FLAG_CFL8_CBOX_MSR_OFFS, &box->flags);
}
static void skl_uncore_msr_enable_box(struct intel_uncore_box *box)
@@ -228,7 +251,7 @@ static struct intel_uncore_ops skl_uncore_msr_ops = {
static struct intel_uncore_type skl_uncore_cbox = {
.name = "cbox",
.num_counters = 4,
- .num_boxes = 5,
+ .num_boxes = 8,
.perf_ctr_bits = 44,
.fixed_ctr_bits = 48,
.perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
@@ -569,7 +592,82 @@ static const struct pci_device_id skl_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SQ_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
-
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_Y_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_U_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_UQ_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SD_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SQ_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2U_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4U_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4H_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6H_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2S_D_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_D_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_D_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_D_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_W_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_W_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_W_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
{ /* end: all zeroes */ },
};
@@ -618,6 +716,25 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
IMC_DEV(SKL_HQ_IMC, &skl_uncore_pci_driver), /* 6th Gen Core H Quad Core */
IMC_DEV(SKL_SD_IMC, &skl_uncore_pci_driver), /* 6th Gen Core S Dual Core */
IMC_DEV(SKL_SQ_IMC, &skl_uncore_pci_driver), /* 6th Gen Core S Quad Core */
+ IMC_DEV(KBL_Y_IMC, &skl_uncore_pci_driver), /* 7th Gen Core Y */
+ IMC_DEV(KBL_U_IMC, &skl_uncore_pci_driver), /* 7th Gen Core U */
+ IMC_DEV(KBL_UQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core U Quad Core */
+ IMC_DEV(KBL_SD_IMC, &skl_uncore_pci_driver), /* 7th Gen Core S Dual Core */
+ IMC_DEV(KBL_SQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core S Quad Core */
+ IMC_DEV(CFL_2U_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U 2 Cores */
+ IMC_DEV(CFL_4U_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U 4 Cores */
+ IMC_DEV(CFL_4H_IMC, &skl_uncore_pci_driver), /* 8th Gen Core H 4 Cores */
+ IMC_DEV(CFL_6H_IMC, &skl_uncore_pci_driver), /* 8th Gen Core H 6 Cores */
+ IMC_DEV(CFL_2S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 2 Cores Desktop */
+ IMC_DEV(CFL_4S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Desktop */
+ IMC_DEV(CFL_6S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Desktop */
+ IMC_DEV(CFL_8S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Desktop */
+ IMC_DEV(CFL_4S_W_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Work Station */
+ IMC_DEV(CFL_6S_W_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Work Station */
+ IMC_DEV(CFL_8S_W_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Work Station */
+ IMC_DEV(CFL_4S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Server */
+ IMC_DEV(CFL_6S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Server */
+ IMC_DEV(CFL_8S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Server */
{ /* end marker */ }
};
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 55e51ff7e421..fbda5a917c5b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1094,7 +1094,8 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu);
- void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+ /* Returns actual tsc_offset set in active VMCS */
+ u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 4da9b1c58d28..c1a812bd5a27 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -221,6 +221,8 @@ static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_am
int mce_available(struct cpuinfo_x86 *c);
bool mce_is_memory_error(struct mce *m);
+bool mce_is_correctable(struct mce *m);
+int mce_usable_address(struct mce *m);
DECLARE_PER_CPU(unsigned, mce_exception_count);
DECLARE_PER_CPU(unsigned, mce_poll_count);
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 0d6271cce198..1d0a7778e163 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -232,7 +232,7 @@ static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
: "cc");
}
#endif
- return hv_status;
+ return hv_status;
}
/*
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index cd0cf1c568b4..8f657286d599 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -33,12 +33,14 @@
/*
* Set __PAGE_OFFSET to the most negative possible address +
- * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a
- * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
- * what Xen requires.
+ * PGDIR_SIZE*17 (pgd slot 273).
+ *
+ * The gap is to allow a space for LDT remap for PTI (1 pgd slot) and space for
+ * a hypervisor (16 slots). Choosing 16 slots for a hypervisor is arbitrary,
+ * but it's what Xen requires.
*/
-#define __PAGE_OFFSET_BASE_L5 _AC(0xff10000000000000, UL)
-#define __PAGE_OFFSET_BASE_L4 _AC(0xffff880000000000, UL)
+#define __PAGE_OFFSET_BASE_L5 _AC(0xff11000000000000, UL)
+#define __PAGE_OFFSET_BASE_L4 _AC(0xffff888000000000, UL)
#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
#define __PAGE_OFFSET page_offset_base
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 04edd2d58211..84bd9bdc1987 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -111,9 +111,7 @@ extern unsigned int ptrs_per_p4d;
*/
#define MAXMEM (1UL << MAX_PHYSMEM_BITS)
-#define LDT_PGD_ENTRY_L4 -3UL
-#define LDT_PGD_ENTRY_L5 -112UL
-#define LDT_PGD_ENTRY (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
+#define LDT_PGD_ENTRY -240UL
#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#define LDT_END_ADDR (LDT_BASE_ADDR + PGDIR_SIZE)
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index 87623c6b13db..bd5ac6cc37db 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -13,12 +13,15 @@
#define queued_fetch_set_pending_acquire queued_fetch_set_pending_acquire
static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
{
- u32 val = 0;
-
- if (GEN_BINARY_RMWcc(LOCK_PREFIX "btsl", lock->val.counter, c,
- "I", _Q_PENDING_OFFSET))
- val |= _Q_PENDING_VAL;
+ u32 val;
+ /*
+ * We can't use GEN_BINARY_RMWcc() inside an if() stmt because asm goto
+ * and CONFIG_PROFILE_ALL_BRANCHES=y results in a label inside a
+ * statement expression, which GCC doesn't like.
+ */
+ val = GEN_BINARY_RMWcc(LOCK_PREFIX "btsl", lock->val.counter, c,
+ "I", _Q_PENDING_OFFSET) * _Q_PENDING_VAL;
val |= atomic_read(&lock->val) & ~_Q_PENDING_MASK;
return val;
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 123e669bf363..790ce08e41f2 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -9,7 +9,7 @@
#include <linux/mm.h>
#include <linux/device.h>
-#include <linux/uaccess.h>
+#include <asm/extable.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -93,12 +93,39 @@ clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
*/
static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val)
{
- return __put_user(val, (unsigned long __user *)addr);
+ int ret = 0;
+
+ asm volatile("1: mov %[val], %[ptr]\n"
+ "2:\n"
+ ".section .fixup, \"ax\"\n"
+ "3: sub $1, %[ret]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : [ret] "+r" (ret), [ptr] "=m" (*addr)
+ : [val] "r" (val));
+
+ return ret;
}
-static inline int xen_safe_read_ulong(unsigned long *addr, unsigned long *val)
+static inline int xen_safe_read_ulong(const unsigned long *addr,
+ unsigned long *val)
{
- return __get_user(*val, (unsigned long __user *)addr);
+ int ret = 0;
+ unsigned long rval = ~0ul;
+
+ asm volatile("1: mov %[ptr], %[rval]\n"
+ "2:\n"
+ ".section .fixup, \"ax\"\n"
+ "3: sub $1, %[ret]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : [ret] "+r" (ret), [rval] "+r" (rval)
+ : [ptr] "m" (*addr));
+ *val = rval;
+
+ return ret;
}
#ifdef CONFIG_XEN_PV
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8c66d2fc8f81..36d2696c9563 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -485,7 +485,7 @@ static void mce_report_event(struct pt_regs *regs)
* be somewhat complicated (e.g. segment offset would require an instruction
* parser). So only support physical addresses up to page granuality for now.
*/
-static int mce_usable_address(struct mce *m)
+int mce_usable_address(struct mce *m)
{
if (!(m->status & MCI_STATUS_ADDRV))
return 0;
@@ -505,6 +505,7 @@ static int mce_usable_address(struct mce *m)
return 1;
}
+EXPORT_SYMBOL_GPL(mce_usable_address);
bool mce_is_memory_error(struct mce *m)
{
@@ -534,7 +535,7 @@ bool mce_is_memory_error(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);
-static bool mce_is_correctable(struct mce *m)
+bool mce_is_correctable(struct mce *m)
{
if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
return false;
@@ -547,6 +548,7 @@ static bool mce_is_correctable(struct mce *m)
return true;
}
+EXPORT_SYMBOL_GPL(mce_is_correctable);
static bool cec_add_mce(struct mce *m)
{
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 1c72f3819eb1..e81a2db42df7 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -20,6 +20,7 @@
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/kexec.h>
+#include <linux/i8253.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
#include <asm/hyperv-tlfs.h>
@@ -295,6 +296,16 @@ static void __init ms_hyperv_init_platform(void)
if (efi_enabled(EFI_BOOT))
x86_platform.get_nmi_reason = hv_get_nmi_reason;
+ /*
+ * Hyper-V VMs have a PIT emulation quirk such that zeroing the
+ * counter register during PIT shutdown restarts the PIT. So it
+ * continues to interrupt @18.2 HZ. Setting i8253_clear_counter
+ * to false tells pit_shutdown() not to zero the counter so that
+ * the PIT really is shutdown. Generation 2 VMs don't have a PIT,
+ * and setting this value has no effect.
+ */
+ i8253_clear_counter_on_shutdown = false;
+
#if IS_ENABLED(CONFIG_HYPERV)
/*
* Setup the hook to get control post apic initialization.
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index d9ab49bed8af..0eda91f8eeac 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -77,7 +77,7 @@ static __init int setup_vmw_sched_clock(char *s)
}
early_param("no-vmw-sched-clock", setup_vmw_sched_clock);
-static unsigned long long vmware_sched_clock(void)
+static unsigned long long notrace vmware_sched_clock(void)
{
unsigned long long ns;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ab18e0884dc6..6135ae8ce036 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -199,14 +199,6 @@ static void sanity_check_ldt_mapping(struct mm_struct *mm)
/*
* If PTI is enabled, this maps the LDT into the kernelmode and
* usermode tables for the given mm.
- *
- * There is no corresponding unmap function. Even if the LDT is freed, we
- * leave the PTEs around until the slot is reused or the mm is destroyed.
- * This is harmless: the LDT is always in ordinary memory, and no one will
- * access the freed slot.
- *
- * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
- * it useful, and the flush would slow down modify_ldt().
*/
static int
map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
@@ -214,8 +206,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
unsigned long va;
bool is_vmalloc;
spinlock_t *ptl;
- pgd_t *pgd;
- int i;
+ int i, nr_pages;
if (!static_cpu_has(X86_FEATURE_PTI))
return 0;
@@ -229,16 +220,11 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
/* Check if the current mappings are sane */
sanity_check_ldt_mapping(mm);
- /*
- * Did we already have the top level entry allocated? We can't
- * use pgd_none() for this because it doens't do anything on
- * 4-level page table kernels.
- */
- pgd = pgd_offset(mm, LDT_BASE_ADDR);
-
is_vmalloc = is_vmalloc_addr(ldt->entries);
- for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+ nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);
+
+ for (i = 0; i < nr_pages; i++) {
unsigned long offset = i << PAGE_SHIFT;
const void *src = (char *)ldt->entries + offset;
unsigned long pfn;
@@ -272,13 +258,39 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
/* Propagate LDT mapping to the user page-table */
map_ldt_struct_to_user(mm);
- va = (unsigned long)ldt_slot_va(slot);
- flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false);
-
ldt->slot = slot;
return 0;
}
+static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
+{
+ unsigned long va;
+ int i, nr_pages;
+
+ if (!ldt)
+ return;
+
+ /* LDT map/unmap is only required for PTI */
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);
+
+ for (i = 0; i < nr_pages; i++) {
+ unsigned long offset = i << PAGE_SHIFT;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+ va = (unsigned long)ldt_slot_va(ldt->slot) + offset;
+ ptep = get_locked_pte(mm, va, &ptl);
+ pte_clear(mm, va, ptep);
+ pte_unmap_unlock(ptep, ptl);
+ }
+
+ va = (unsigned long)ldt_slot_va(ldt->slot);
+ flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false);
+}
+
#else /* !CONFIG_PAGE_TABLE_ISOLATION */
static int
@@ -286,6 +298,10 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
{
return 0;
}
+
+static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
+{
+}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
static void free_ldt_pgtables(struct mm_struct *mm)
@@ -524,6 +540,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
}
install_ldt(mm, new_ldt);
+ unmap_ldt_struct(mm, old_ldt);
free_ldt_struct(old_ldt);
error = 0;
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 1eae5af491c2..891a75dbc131 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -26,65 +26,8 @@
#define TOPOLOGY_REGISTER_OFFSET 0x10
-#if defined CONFIG_PCI && defined CONFIG_PARAVIRT_XXL
-/*
- * Interrupt control on vSMPowered systems:
- * ~AC is a shadow of IF. If IF is 'on' AC should be 'off'
- * and vice versa.
- */
-
-asmlinkage __visible unsigned long vsmp_save_fl(void)
-{
- unsigned long flags = native_save_fl();
-
- if (!(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC))
- flags &= ~X86_EFLAGS_IF;
- return flags;
-}
-PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
-
-__visible void vsmp_restore_fl(unsigned long flags)
-{
- if (flags & X86_EFLAGS_IF)
- flags &= ~X86_EFLAGS_AC;
- else
- flags |= X86_EFLAGS_AC;
- native_restore_fl(flags);
-}
-PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
-
-asmlinkage __visible void vsmp_irq_disable(void)
-{
- unsigned long flags = native_save_fl();
-
- native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
-}
-PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
-
-asmlinkage __visible void vsmp_irq_enable(void)
-{
- unsigned long flags = native_save_fl();
-
- native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
-}
-PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
-
-static unsigned __init vsmp_patch(u8 type, void *ibuf,
- unsigned long addr, unsigned len)
-{
- switch (type) {
- case PARAVIRT_PATCH(irq.irq_enable):
- case PARAVIRT_PATCH(irq.irq_disable):
- case PARAVIRT_PATCH(irq.save_fl):
- case PARAVIRT_PATCH(irq.restore_fl):
- return paravirt_patch_default(type, ibuf, addr, len);
- default:
- return native_patch(type, ibuf, addr, len);
- }
-
-}
-
-static void __init set_vsmp_pv_ops(void)
+#ifdef CONFIG_PCI
+static void __init set_vsmp_ctl(void)
{
void __iomem *address;
unsigned int cap, ctl, cfg;
@@ -109,28 +52,12 @@ static void __init set_vsmp_pv_ops(void)
}
#endif
- if (cap & ctl & (1 << 4)) {
- /* Setup irq ops and turn on vSMP IRQ fastpath handling */
- pv_ops.irq.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
- pv_ops.irq.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable);
- pv_ops.irq.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
- pv_ops.irq.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
- pv_ops.init.patch = vsmp_patch;
- ctl &= ~(1 << 4);
- }
writel(ctl, address + 4);
ctl = readl(address + 4);
pr_info("vSMP CTL: control set to:0x%08x\n", ctl);
early_iounmap(address, 8);
}
-#else
-static void __init set_vsmp_pv_ops(void)
-{
-}
-#endif
-
-#ifdef CONFIG_PCI
static int is_vsmp = -1;
static void __init detect_vsmp_box(void)
@@ -164,11 +91,14 @@ static int is_vsmp_box(void)
{
return 0;
}
+static void __init set_vsmp_ctl(void)
+{
+}
#endif
static void __init vsmp_cap_cpus(void)
{
-#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP)
+#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
void __iomem *address;
unsigned int cfg, topology, node_shift, maxcpus;
@@ -221,6 +151,6 @@ void __init vsmp_init(void)
vsmp_cap_cpus();
- set_vsmp_pv_ops();
+ set_vsmp_ctl();
return;
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 89db20f8cb70..c4533d05c214 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -55,7 +55,7 @@
#define PRIo64 "o"
/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
-#define apic_debug(fmt, arg...)
+#define apic_debug(fmt, arg...) do {} while (0)
/* 14 is the version for Xeon and Pentium 8.4.8*/
#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
@@ -576,6 +576,11 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
rcu_read_lock();
map = rcu_dereference(kvm->arch.apic_map);
+ if (unlikely(!map)) {
+ count = -EOPNOTSUPP;
+ goto out;
+ }
+
if (min > map->max_apic_id)
goto out;
/* Bits above cluster_size are masked in the caller. */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cf5f572f2305..7c03c0f35444 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5074,9 +5074,9 @@ static bool need_remote_flush(u64 old, u64 new)
}
static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
- const u8 *new, int *bytes)
+ int *bytes)
{
- u64 gentry;
+ u64 gentry = 0;
int r;
/*
@@ -5088,22 +5088,12 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
*gpa &= ~(gpa_t)7;
*bytes = 8;
- r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8);
- if (r)
- gentry = 0;
- new = (const u8 *)&gentry;
}
- switch (*bytes) {
- case 4:
- gentry = *(const u32 *)new;
- break;
- case 8:
- gentry = *(const u64 *)new;
- break;
- default:
- gentry = 0;
- break;
+ if (*bytes == 4 || *bytes == 8) {
+ r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
+ if (r)
+ gentry = 0;
}
return gentry;
@@ -5207,8 +5197,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
- gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes);
-
/*
* No need to care whether allocation memory is successful
* or not since pte prefetch is skiped if it does not have
@@ -5217,6 +5205,9 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
mmu_topup_memory_caches(vcpu);
spin_lock(&vcpu->kvm->mmu_lock);
+
+ gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
+
++vcpu->kvm->stat.mmu_pte_write;
kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0e21ccc46792..cc6467b35a85 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1446,7 +1446,7 @@ static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
return vcpu->arch.tsc_offset;
}
-static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 g_tsc_offset = 0;
@@ -1464,6 +1464,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ return svm->vmcb->control.tsc_offset;
}
static void avic_init_vmcb(struct vcpu_svm *svm)
@@ -1664,20 +1665,23 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
static int avic_init_access_page(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
- int ret;
+ int ret = 0;
+ mutex_lock(&kvm->slots_lock);
if (kvm->arch.apic_access_page_done)
- return 0;
+ goto out;
- ret = x86_set_memory_region(kvm,
- APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
- APIC_DEFAULT_PHYS_BASE,
- PAGE_SIZE);
+ ret = __x86_set_memory_region(kvm,
+ APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE,
+ PAGE_SIZE);
if (ret)
- return ret;
+ goto out;
kvm->arch.apic_access_page_done = true;
- return 0;
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return ret;
}
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
@@ -2189,21 +2193,31 @@ out:
return ERR_PTR(err);
}
+static void svm_clear_current_vmcb(struct vmcb *vmcb)
+{
+ int i;
+
+ for_each_online_cpu(i)
+ cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
+}
+
static void svm_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ /*
+ * The vmcb page can be recycled, causing a false negative in
+ * svm_vcpu_load(). So, ensure that no logical CPU has this
+ * vmcb page recorded as its current vmcb.
+ */
+ svm_clear_current_vmcb(svm->vmcb);
+
__free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
__free_page(virt_to_page(svm->nested.hsave));
__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, svm);
- /*
- * The vmcb page can be recycled, causing a false negative in
- * svm_vcpu_load(). So do a full IBPB now.
- */
- indirect_branch_prediction_barrier();
}
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -7149,7 +7163,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.has_wbinvd_exit = svm_has_wbinvd_exit,
.read_l1_tsc_offset = svm_read_l1_tsc_offset,
- .write_tsc_offset = svm_write_tsc_offset,
+ .write_l1_tsc_offset = svm_write_l1_tsc_offset,
.set_tdp_cr3 = set_tdp_cr3,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4555077d69ce..02edd9960e9d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -174,6 +174,7 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
* refer SDM volume 3b section 21.6.13 & 22.1.3.
*/
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
+module_param(ple_gap, uint, 0444);
static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);
@@ -984,6 +985,7 @@ struct vcpu_vmx {
struct shared_msr_entry *guest_msrs;
int nmsrs;
int save_nmsrs;
+ bool guest_msrs_dirty;
unsigned long host_idt_base;
#ifdef CONFIG_X86_64
u64 msr_host_kernel_gs_base;
@@ -1306,7 +1308,7 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
u16 error_code);
static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
-static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
@@ -1610,12 +1612,6 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- /* We don't support disabling the feature for simplicity. */
- if (vmx->nested.enlightened_vmcs_enabled)
- return 0;
-
- vmx->nested.enlightened_vmcs_enabled = true;
-
/*
* vmcs_version represents the range of supported Enlightened VMCS
* versions: lower 8 bits is the minimal version, higher 8 bits is the
@@ -1625,6 +1621,12 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
if (vmcs_version)
*vmcs_version = (KVM_EVMCS_VERSION << 8) | 1;
+ /* We don't support disabling the feature for simplicity. */
+ if (vmx->nested.enlightened_vmcs_enabled)
+ return 0;
+
+ vmx->nested.enlightened_vmcs_enabled = true;
+
vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
@@ -2897,6 +2899,20 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
vmx->req_immediate_exit = false;
+ /*
+ * Note that guest MSRs to be saved/restored can also be changed
+ * when guest state is loaded. This happens when guest transitions
+ * to/from long-mode by setting MSR_EFER.LMA.
+ */
+ if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) {
+ vmx->guest_msrs_dirty = false;
+ for (i = 0; i < vmx->save_nmsrs; ++i)
+ kvm_set_shared_msr(vmx->guest_msrs[i].index,
+ vmx->guest_msrs[i].data,
+ vmx->guest_msrs[i].mask);
+
+ }
+
if (vmx->loaded_cpu_state)
return;
@@ -2957,11 +2973,6 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
vmcs_writel(HOST_GS_BASE, gs_base);
host_state->gs_base = gs_base;
}
-
- for (i = 0; i < vmx->save_nmsrs; ++i)
- kvm_set_shared_msr(vmx->guest_msrs[i].index,
- vmx->guest_msrs[i].data,
- vmx->guest_msrs[i].mask);
}
static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
@@ -3436,6 +3447,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
move_msr_up(vmx, index, save_nmsrs++);
vmx->save_nmsrs = save_nmsrs;
+ vmx->guest_msrs_dirty = true;
if (cpu_has_vmx_msr_bitmap())
vmx_update_msr_bitmap(&vmx->vcpu);
@@ -3452,11 +3464,9 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
return vcpu->arch.tsc_offset;
}
-/*
- * writes 'offset' into guest's timestamp counter offset register
- */
-static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
+ u64 active_offset = offset;
if (is_guest_mode(vcpu)) {
/*
* We're here if L1 chose not to trap WRMSR to TSC. According
@@ -3464,17 +3474,16 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
* set for L2 remains unchanged, and still needs to be added
* to the newly set TSC to get L2's TSC.
*/
- struct vmcs12 *vmcs12;
- /* recalculate vmcs02.TSC_OFFSET: */
- vmcs12 = get_vmcs12(vcpu);
- vmcs_write64(TSC_OFFSET, offset +
- (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
- vmcs12->tsc_offset : 0));
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING))
+ active_offset += vmcs12->tsc_offset;
} else {
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
vmcs_read64(TSC_OFFSET), offset);
- vmcs_write64(TSC_OFFSET, offset);
}
+
+ vmcs_write64(TSC_OFFSET, active_offset);
+ return active_offset;
}
/*
@@ -5944,7 +5953,7 @@ static void free_vpid(int vpid)
spin_unlock(&vmx_vpid_lock);
}
-static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type)
{
int f = sizeof(unsigned long);
@@ -5982,7 +5991,7 @@ static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bit
}
}
-static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type)
{
int f = sizeof(unsigned long);
@@ -6020,7 +6029,7 @@ static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitm
}
}
-static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type, bool value)
{
if (value)
@@ -8664,8 +8673,6 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
- vmcs12->hdr.revision_id = evmcs->revision_id;
-
/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
vmcs12->tpr_threshold = evmcs->tpr_threshold;
vmcs12->guest_rip = evmcs->guest_rip;
@@ -9369,7 +9376,30 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
- if (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION) {
+ /*
+ * Currently, KVM only supports eVMCS version 1
+ * (== KVM_EVMCS_VERSION) and thus we expect guest to set this
+ * value to first u32 field of eVMCS which should specify eVMCS
+ * VersionNumber.
+ *
+ * Guest should be aware of supported eVMCS versions by host by
+ * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is
+ * expected to set this CPUID leaf according to the value
+ * returned in vmcs_version from nested_enable_evmcs().
+ *
+ * However, it turns out that Microsoft Hyper-V fails to comply
+ * to their own invented interface: When Hyper-V use eVMCS, it
+ * just sets first u32 field of eVMCS to revision_id specified
+ * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number
+ * which is one of the supported versions specified in
+ * CPUID.0x4000000A.EAX[0:15].
+ *
+ * To overcome Hyper-V bug, we accept here either a supported
+ * eVMCS version or VMCS12 revision_id as valid values for first
+ * u32 field of eVMCS.
+ */
+ if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
+ (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
nested_release_evmcs(vcpu);
return 0;
}
@@ -9390,9 +9420,11 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
* present in struct hv_enlightened_vmcs, ...). Make sure there
* are no leftovers.
*/
- if (from_launch)
- memset(vmx->nested.cached_vmcs12, 0,
- sizeof(*vmx->nested.cached_vmcs12));
+ if (from_launch) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ memset(vmcs12, 0, sizeof(*vmcs12));
+ vmcs12->hdr.revision_id = VMCS12_REVISION;
+ }
}
return 1;
@@ -15062,7 +15094,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
.read_l1_tsc_offset = vmx_read_l1_tsc_offset,
- .write_tsc_offset = vmx_write_tsc_offset,
+ .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
.set_tdp_cr3 = vmx_set_cr3,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5cd5647120f2..d02937760c3b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1665,8 +1665,7 @@ EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
- kvm_x86_ops->write_tsc_offset(vcpu, offset);
- vcpu->arch.tsc_offset = offset;
+ vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset);
}
static inline bool kvm_check_tsc_unstable(void)
@@ -1794,7 +1793,8 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc);
static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
s64 adjustment)
{
- kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment);
+ u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
+ kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
}
static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -6918,6 +6918,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
clock_pairing.nsec = ts.tv_nsec;
clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
clock_pairing.flags = 0;
+ memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
ret = 0;
if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
@@ -7455,7 +7456,8 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
else {
if (vcpu->arch.apicv_active)
kvm_x86_ops->sync_pir_to_irr(vcpu);
- kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
+ if (ioapic_in_kernel(vcpu->kvm))
+ kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
if (is_guest_mode(vcpu))
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 0d7b3ae4960b..a5d7ed125337 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1905,7 +1905,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
init_top_pgt[0] = __pgd(0);
/* Pre-constructed entries are in pfn, so convert to mfn */
- /* L4[272] -> level3_ident_pgt */
+ /* L4[273] -> level3_ident_pgt */
/* L4[511] -> level3_kernel_pgt */
convert_pfn_mfn(init_top_pgt);
@@ -1925,8 +1925,8 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
addr[0] = (unsigned long)pgd;
addr[1] = (unsigned long)l3;
addr[2] = (unsigned long)l2;
- /* Graft it onto L4[272][0]. Note that we creating an aliasing problem:
- * Both L4[272][0] and L4[511][510] have entries that point to the same
+ /* Graft it onto L4[273][0]. Note that we creating an aliasing problem:
+ * Both L4[273][0] and L4[511][510] have entries that point to the same
* L2 (PMD) tables. Meaning that if you modify it in __va space
* it will be also modified in the __ka space! (But if you just
* modify the PMD table to point to other PTE's or none, then you
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index b06731705529..055e37e43541 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -656,8 +656,7 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
/*
* The interface requires atomic updates on p2m elements.
- * xen_safe_write_ulong() is using __put_user which does an atomic
- * store via asm().
+ * xen_safe_write_ulong() is using an atomic store via asm().
*/
if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn)))
return true;
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 441c88262169..1c8a8816a402 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -9,6 +9,7 @@
#include <linux/log2.h>
#include <linux/gfp.h>
#include <linux/slab.h>
+#include <linux/atomic.h>
#include <asm/paravirt.h>
#include <asm/qspinlock.h>
@@ -21,6 +22,7 @@
static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
static DEFINE_PER_CPU(char *, irq_name);
+static DEFINE_PER_CPU(atomic_t, xen_qlock_wait_nest);
static bool xen_pvspin = true;
static void xen_qlock_kick(int cpu)
@@ -39,25 +41,25 @@ static void xen_qlock_kick(int cpu)
*/
static void xen_qlock_wait(u8 *byte, u8 val)
{
- unsigned long flags;
int irq = __this_cpu_read(lock_kicker_irq);
+ atomic_t *nest_cnt = this_cpu_ptr(&xen_qlock_wait_nest);
/* If kicker interrupts not initialized yet, just spin */
if (irq == -1 || in_nmi())
return;
- /* Guard against reentry. */
- local_irq_save(flags);
+ /* Detect reentry. */
+ atomic_inc(nest_cnt);
- /* If irq pending already clear it. */
- if (xen_test_irq_pending(irq)) {
+ /* If irq pending already and no nested call clear it. */
+ if (atomic_read(nest_cnt) == 1 && xen_test_irq_pending(irq)) {
xen_clear_irq_pending(irq);
} else if (READ_ONCE(*byte) == val) {
/* Block until irq becomes pending (or a spurious wakeup) */
xen_poll_irq(irq);
}
- local_irq_restore(flags);
+ atomic_dec(nest_cnt);
}
static irqreturn_t dummy_handler(int irq, void *dev_id)
OpenPOWER on IntegriCloud