summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig3
-rw-r--r--arch/x86/boot/compressed/Makefile1
-rw-r--r--arch/x86/boot/compressed/misc.c3
-rw-r--r--arch/x86/boot/header.S8
-rw-r--r--arch/x86/boot/string.c9
-rw-r--r--arch/x86/crypto/sha1_avx2_x86_64_asm.S67
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c2
-rw-r--r--arch/x86/entry/entry_64.S12
-rw-r--r--arch/x86/events/amd/uncore.c21
-rw-r--r--arch/x86/events/core.c80
-rw-r--r--arch/x86/events/intel/bts.c4
-rw-r--r--arch/x86/events/intel/core.c107
-rw-r--r--arch/x86/events/intel/ds.c58
-rw-r--r--arch/x86/events/intel/lbr.c52
-rw-r--r--arch/x86/events/intel/p4.c2
-rw-r--r--arch/x86/events/intel/pt.c5
-rw-r--r--arch/x86/events/intel/rapl.c2
-rw-r--r--arch/x86/events/intel/uncore.c2
-rw-r--r--arch/x86/events/intel/uncore_nhmex.c12
-rw-r--r--arch/x86/events/intel/uncore_snb.c6
-rw-r--r--arch/x86/events/intel/uncore_snbep.c93
-rw-r--r--arch/x86/events/perf_event.h10
-rw-r--r--arch/x86/include/asm/cpufeatures.h4
-rw-r--r--arch/x86/include/asm/elf.h4
-rw-r--r--arch/x86/include/asm/entry_arch.h2
-rw-r--r--arch/x86/include/asm/fpu/internal.h6
-rw-r--r--arch/x86/include/asm/hardirq.h1
-rw-r--r--arch/x86/include/asm/hw_irq.h2
-rw-r--r--arch/x86/include/asm/hypervisor.h10
-rw-r--r--arch/x86/include/asm/irq_vectors.h3
-rw-r--r--arch/x86/include/asm/kvm_host.h3
-rw-r--r--arch/x86/include/asm/mmu_context.h4
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c43
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c2
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c18
-rw-r--r--arch/x86/kernel/early-quirks.c1
-rw-r--r--arch/x86/kernel/head64.c7
-rw-r--r--arch/x86/kernel/hpet.c27
-rw-r--r--arch/x86/kernel/irq.c19
-rw-r--r--arch/x86/kernel/irqinit.c2
-rw-r--r--arch/x86/kernel/kprobes/core.c10
-rw-r--r--arch/x86/kernel/kprobes/opt.c9
-rw-r--r--arch/x86/kernel/ksysfs.c4
-rw-r--r--arch/x86/kernel/kvm.c6
-rw-r--r--arch/x86/kernel/reboot.c6
-rw-r--r--arch/x86/kernel/smpboot.c30
-rw-r--r--arch/x86/kernel/unwind_frame.c2
-rw-r--r--arch/x86/kvm/cpuid.c2
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h5
-rw-r--r--arch/x86/kvm/lapic.c17
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/svm.c19
-rw-r--r--arch/x86/kvm/vmx.c287
-rw-r--r--arch/x86/kvm/x86.c52
-rw-r--r--arch/x86/mm/init.c3
-rw-r--r--arch/x86/mm/mmap.c7
-rw-r--r--arch/x86/platform/uv/tlb_uv.c4
-rw-r--r--arch/x86/um/user-offsets.c2
-rw-r--r--arch/x86/xen/enlighten_hvm.c59
60 files changed, 841 insertions, 406 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 781521b7cf9e..323cb065be5e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -100,6 +100,7 @@ config X86
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select GENERIC_TIME_VSYSCALL
+ select HARDLOCKUP_CHECK_TIMESTAMP if X86_64
select HAVE_ACPI_APEI if ACPI
select HAVE_ACPI_APEI_NMI if ACPI
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
@@ -163,7 +164,7 @@ config X86
select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
select HAVE_PERF_EVENTS_NMI
- select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI
+ select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 2c860ad4fe06..8a958274b54c 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -34,6 +34,7 @@ KBUILD_CFLAGS += $(cflags-y)
KBUILD_CFLAGS += -mno-mmx -mno-sse
KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
+KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index a0838ab929f2..c14217cd0155 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -116,8 +116,7 @@ void __putstr(const char *s)
}
}
- if (boot_params->screen_info.orig_video_mode == 0 &&
- lines == 0 && cols == 0)
+ if (lines == 0 || cols == 0)
return;
x = boot_params->screen_info.orig_x;
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 2ed8f0c25def..1bb08ecffd24 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -520,8 +520,14 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
# the description in lib/decompressor_xxx.c for specific information.
#
# extra_bytes = (uncompressed_size >> 12) + 65536 + 128
+#
+# LZ4 is even worse: data that cannot be further compressed grows by 0.4%,
+# or one byte per 256 bytes. OTOH, we can safely get rid of the +128 as
+# the size-dependent part now grows so fast.
+#
+# extra_bytes = (uncompressed_size >> 8) + 65536
-#define ZO_z_extra_bytes ((ZO_z_output_len >> 12) + 65536 + 128)
+#define ZO_z_extra_bytes ((ZO_z_output_len >> 8) + 65536)
#if ZO_z_output_len > ZO_z_input_len
# define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \
ZO_z_input_len)
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 630e3664906b..16f49123d747 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -16,6 +16,15 @@
#include "ctype.h"
#include "string.h"
+/*
+ * Undef these macros so that the functions that we provide
+ * here will have the correct names regardless of how string.h
+ * may have chosen to #define them.
+ */
+#undef memcpy
+#undef memset
+#undef memcmp
+
int memcmp(const void *s1, const void *s2, size_t len)
{
bool diff;
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 1cd792db15ef..1eab79c9ac48 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -117,11 +117,10 @@
.set T1, REG_T1
.endm
-#define K_BASE %r8
#define HASH_PTR %r9
+#define BLOCKS_CTR %r8
#define BUFFER_PTR %r10
#define BUFFER_PTR2 %r13
-#define BUFFER_END %r11
#define PRECALC_BUF %r14
#define WK_BUF %r15
@@ -205,14 +204,14 @@
* blended AVX2 and ALU instruction scheduling
* 1 vector iteration per 8 rounds
*/
- vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
+ vmovdqu (i * 2)(BUFFER_PTR), W_TMP
.elseif ((i & 7) == 1)
- vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
+ vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
WY_TMP, WY_TMP
.elseif ((i & 7) == 2)
vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
.elseif ((i & 7) == 4)
- vpaddd K_XMM(K_BASE), WY, WY_TMP
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
.elseif ((i & 7) == 7)
vmovdqu WY_TMP, PRECALC_WK(i&~7)
@@ -255,7 +254,7 @@
vpxor WY, WY_TMP, WY_TMP
.elseif ((i & 7) == 7)
vpxor WY_TMP2, WY_TMP, WY
- vpaddd K_XMM(K_BASE), WY, WY_TMP
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
@@ -291,7 +290,7 @@
vpsrld $30, WY, WY
vpor WY, WY_TMP, WY
.elseif ((i & 7) == 7)
- vpaddd K_XMM(K_BASE), WY, WY_TMP
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
@@ -446,6 +445,16 @@
.endm
+/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
+ * %1 + %2 >= %3 ? %4 : 0
+ */
+.macro ADD_IF_GE a, b, c, d
+ mov \a, RTA
+ add $\d, RTA
+ cmp $\c, \b
+ cmovge RTA, \a
+.endm
+
/*
* macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
*/
@@ -463,13 +472,16 @@
lea (2*4*80+32)(%rsp), WK_BUF
# Precalc WK for first 2 blocks
- PRECALC_OFFSET = 0
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
.set i, 0
.rept 160
PRECALC i
.set i, i + 1
.endr
- PRECALC_OFFSET = 128
+
+ /* Go to next block if needed */
+ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
xchg WK_BUF, PRECALC_BUF
.align 32
@@ -479,8 +491,8 @@ _loop:
* we use K_BASE value as a signal of a last block,
* it is set below by: cmovae BUFFER_PTR, K_BASE
*/
- cmp K_BASE, BUFFER_PTR
- jne _begin
+ test BLOCKS_CTR, BLOCKS_CTR
+ jnz _begin
.align 32
jmp _end
.align 32
@@ -512,10 +524,10 @@ _loop0:
.set j, j+2
.endr
- add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */
- cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */
- cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */
-
+ /* Update Counter */
+ sub $1, BLOCKS_CTR
+ /* Move to the next block only if needed*/
+ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
/*
* rounds
* 60,62,64,66,68
@@ -532,8 +544,8 @@ _loop0:
UPDATE_HASH 12(HASH_PTR), D
UPDATE_HASH 16(HASH_PTR), E
- cmp K_BASE, BUFFER_PTR /* is current block the last one? */
- je _loop
+ test BLOCKS_CTR, BLOCKS_CTR
+ jz _loop
mov TB, B
@@ -575,10 +587,10 @@ _loop2:
.set j, j+2
.endr
- add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */
-
- cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */
- cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */
+ /* update counter */
+ sub $1, BLOCKS_CTR
+ /* Move to the next block only if needed*/
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
jmp _loop3
_loop3:
@@ -641,19 +653,12 @@ _loop3:
avx2_zeroupper
- lea K_XMM_AR(%rip), K_BASE
-
+ /* Setup initial values */
mov CTX, HASH_PTR
mov BUF, BUFFER_PTR
- lea 64(BUF), BUFFER_PTR2
-
- shl $6, CNT /* mul by 64 */
- add BUF, CNT
- add $64, CNT
- mov CNT, BUFFER_END
- cmp BUFFER_END, BUFFER_PTR2
- cmovae K_BASE, BUFFER_PTR2
+ mov BUF, BUFFER_PTR2
+ mov CNT, BLOCKS_CTR
xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index f960a043cdeb..fc61739150e7 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -201,7 +201,7 @@ asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
static bool avx2_usable(void)
{
- if (false && avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
+ if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
&& boot_cpu_has(X86_FEATURE_BMI1)
&& boot_cpu_has(X86_FEATURE_BMI2))
return true;
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a9a8027a6c0e..ce8dc33dd640 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -675,13 +675,8 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym)
#endif
/* Make sure APIC interrupt handlers end up in the irqentry section: */
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
-# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
-# define POP_SECTION_IRQENTRY .popsection
-#else
-# define PUSH_SECTION_IRQENTRY
-# define POP_SECTION_IRQENTRY
-#endif
+#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
+#define POP_SECTION_IRQENTRY .popsection
.macro apicinterrupt num sym do_sym
PUSH_SECTION_IRQENTRY
@@ -705,6 +700,7 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi
#ifdef CONFIG_HAVE_KVM
apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi
+apicinterrupt3 POSTED_INTR_NESTED_VECTOR kvm_posted_intr_nested_ipi smp_kvm_posted_intr_nested_ipi
#endif
#ifdef CONFIG_X86_MCE_THRESHOLD
@@ -1210,6 +1206,8 @@ ENTRY(nmi)
* other IST entries.
*/
+ ASM_CLAC
+
/* Use %rdx as our temp variable throughout */
pushq %rdx
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index ad44af0dd667..f5cbbba99283 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -400,11 +400,24 @@ static int amd_uncore_cpu_starting(unsigned int cpu)
if (amd_uncore_llc) {
unsigned int apicid = cpu_data(cpu).apicid;
- unsigned int nshared;
+ unsigned int nshared, subleaf, prev_eax = 0;
uncore = *per_cpu_ptr(amd_uncore_llc, cpu);
- cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
- nshared = ((eax >> 14) & 0xfff) + 1;
+ /*
+ * Iterate over Cache Topology Definition leaves until no
+ * more cache descriptions are available.
+ */
+ for (subleaf = 0; subleaf < 5; subleaf++) {
+ cpuid_count(0x8000001d, subleaf, &eax, &ebx, &ecx, &edx);
+
+ /* EAX[0:4] gives type of cache */
+ if (!(eax & 0x1f))
+ break;
+
+ prev_eax = eax;
+ }
+ nshared = ((prev_eax >> 14) & 0xfff) + 1;
+
uncore->id = apicid - (apicid % nshared);
uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc);
@@ -555,7 +568,7 @@ static int __init amd_uncore_init(void)
ret = 0;
}
- if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) {
+ if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) {
amd_uncore_llc = alloc_percpu(struct amd_uncore *);
if (!amd_uncore_llc) {
ret = -ENOMEM;
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 8e3db8f642a7..80534d3c2480 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -487,22 +487,28 @@ static inline int precise_br_compat(struct perf_event *event)
return m == b;
}
-int x86_pmu_hw_config(struct perf_event *event)
+int x86_pmu_max_precise(void)
{
- if (event->attr.precise_ip) {
- int precise = 0;
+ int precise = 0;
- /* Support for constant skid */
- if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
+ /* Support for constant skid */
+ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
+ precise++;
+
+ /* Support for IP fixup */
+ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
precise++;
- /* Support for IP fixup */
- if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
- precise++;
+ if (x86_pmu.pebs_prec_dist)
+ precise++;
+ }
+ return precise;
+}
- if (x86_pmu.pebs_prec_dist)
- precise++;
- }
+int x86_pmu_hw_config(struct perf_event *event)
+{
+ if (event->attr.precise_ip) {
+ int precise = x86_pmu_max_precise();
if (event->attr.precise_ip > precise)
return -EOPNOTSUPP;
@@ -1751,6 +1757,7 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
}
static struct attribute_group x86_pmu_attr_group;
+static struct attribute_group x86_pmu_caps_group;
static int __init init_hw_perf_events(void)
{
@@ -1799,6 +1806,14 @@ static int __init init_hw_perf_events(void)
x86_pmu_format_group.attrs = x86_pmu.format_attrs;
+ if (x86_pmu.caps_attrs) {
+ struct attribute **tmp;
+
+ tmp = merge_attr(x86_pmu_caps_group.attrs, x86_pmu.caps_attrs);
+ if (!WARN_ON(!tmp))
+ x86_pmu_caps_group.attrs = tmp;
+ }
+
if (x86_pmu.event_attrs)
x86_pmu_events_group.attrs = x86_pmu.event_attrs;
@@ -2114,7 +2129,7 @@ static void refresh_pce(void *ignored)
load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
}
-static void x86_pmu_event_mapped(struct perf_event *event)
+static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
return;
@@ -2129,22 +2144,20 @@ static void x86_pmu_event_mapped(struct perf_event *event)
* For now, this can't happen because all callers hold mmap_sem
* for write. If this changes, we'll need a different solution.
*/
- lockdep_assert_held_exclusive(&current->mm->mmap_sem);
+ lockdep_assert_held_exclusive(&mm->mmap_sem);
- if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
- on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+ if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
+ on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1);
}
-static void x86_pmu_event_unmapped(struct perf_event *event)
+static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
{
- if (!current->mm)
- return;
if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
return;
- if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
- on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+ if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
+ on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1);
}
static int x86_pmu_event_idx(struct perf_event *event)
@@ -2215,10 +2228,30 @@ static struct attribute_group x86_pmu_attr_group = {
.attrs = x86_pmu_attrs,
};
+static ssize_t max_precise_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
+}
+
+static DEVICE_ATTR_RO(max_precise);
+
+static struct attribute *x86_pmu_caps_attrs[] = {
+ &dev_attr_max_precise.attr,
+ NULL
+};
+
+static struct attribute_group x86_pmu_caps_group = {
+ .name = "caps",
+ .attrs = x86_pmu_caps_attrs,
+};
+
static const struct attribute_group *x86_pmu_attr_groups[] = {
&x86_pmu_attr_group,
&x86_pmu_format_group,
&x86_pmu_events_group,
+ &x86_pmu_caps_group,
NULL,
};
@@ -2337,12 +2370,9 @@ static unsigned long get_segment_base(unsigned int segment)
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
- if (idx > LDT_ENTRIES)
- return 0;
-
/* IRQs are off, so this synchronizes with smp_store_release */
ldt = lockless_dereference(current->active_mm->context.ldt);
- if (!ldt || idx > ldt->nr_entries)
+ if (!ldt || idx >= ldt->nr_entries)
return 0;
desc = &ldt->entries[idx];
@@ -2350,7 +2380,7 @@ static unsigned long get_segment_base(unsigned int segment)
return 0;
#endif
} else {
- if (idx > GDT_ENTRIES)
+ if (idx >= GDT_ENTRIES)
return 0;
desc = raw_cpu_ptr(gdt_page.gdt) + idx;
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 8ae8c5ce3a1f..16076eb34699 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -69,7 +69,7 @@ struct bts_buffer {
struct bts_phys buf[0];
};
-struct pmu bts_pmu;
+static struct pmu bts_pmu;
static size_t buf_size(struct page *page)
{
@@ -268,7 +268,7 @@ static void bts_event_start(struct perf_event *event, int flags)
bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
- event->hw.itrace_started = 1;
+ perf_event_itrace_started(event);
event->hw.state = 0;
__bts_event_start(event);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 98b0f0729527..829e89cfcee2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3415,12 +3415,26 @@ static struct attribute *intel_arch3_formats_attr[] = {
&format_attr_any.attr,
&format_attr_inv.attr,
&format_attr_cmask.attr,
+ NULL,
+};
+
+static struct attribute *hsw_format_attr[] = {
&format_attr_in_tx.attr,
&format_attr_in_tx_cp.attr,
+ &format_attr_offcore_rsp.attr,
+ &format_attr_ldlat.attr,
+ NULL
+};
- &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
- &format_attr_ldlat.attr, /* PEBS load latency */
- NULL,
+static struct attribute *nhm_format_attr[] = {
+ &format_attr_offcore_rsp.attr,
+ &format_attr_ldlat.attr,
+ NULL
+};
+
+static struct attribute *slm_format_attr[] = {
+ &format_attr_offcore_rsp.attr,
+ NULL
};
static struct attribute *skl_format_attr[] = {
@@ -3781,6 +3795,36 @@ done:
static DEVICE_ATTR_RW(freeze_on_smi);
+static ssize_t branches_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr);
+}
+
+static DEVICE_ATTR_RO(branches);
+
+static struct attribute *lbr_attrs[] = {
+ &dev_attr_branches.attr,
+ NULL
+};
+
+static char pmu_name_str[30];
+
+static ssize_t pmu_name_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%s\n", pmu_name_str);
+}
+
+static DEVICE_ATTR_RO(pmu_name);
+
+static struct attribute *intel_pmu_caps_attrs[] = {
+ &dev_attr_pmu_name.attr,
+ NULL
+};
+
static struct attribute *intel_pmu_attrs[] = {
&dev_attr_freeze_on_smi.attr,
NULL,
@@ -3795,6 +3839,8 @@ __init int intel_pmu_init(void)
unsigned int unused;
struct extra_reg *er;
int version, i;
+ struct attribute **extra_attr = NULL;
+ char *name;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
switch (boot_cpu_data.x86) {
@@ -3862,6 +3908,7 @@ __init int intel_pmu_init(void)
switch (boot_cpu_data.x86_model) {
case INTEL_FAM6_CORE_YONAH:
pr_cont("Core events, ");
+ name = "core";
break;
case INTEL_FAM6_CORE2_MEROM:
@@ -3877,6 +3924,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_core2_event_constraints;
x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
pr_cont("Core2 events, ");
+ name = "core2";
break;
case INTEL_FAM6_NEHALEM:
@@ -3905,8 +3953,11 @@ __init int intel_pmu_init(void)
intel_pmu_pebs_data_source_nhm();
x86_add_quirk(intel_nehalem_quirk);
+ x86_pmu.pebs_no_tlb = 1;
+ extra_attr = nhm_format_attr;
pr_cont("Nehalem events, ");
+ name = "nehalem";
break;
case INTEL_FAM6_ATOM_PINEVIEW:
@@ -3923,6 +3974,7 @@ __init int intel_pmu_init(void)
x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
pr_cont("Atom events, ");
+ name = "bonnell";
break;
case INTEL_FAM6_ATOM_SILVERMONT1:
@@ -3940,7 +3992,9 @@ __init int intel_pmu_init(void)
x86_pmu.extra_regs = intel_slm_extra_regs;
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.cpu_events = slm_events_attrs;
+ extra_attr = slm_format_attr;
pr_cont("Silvermont events, ");
+ name = "silvermont";
break;
case INTEL_FAM6_ATOM_GOLDMONT:
@@ -3965,7 +4019,9 @@ __init int intel_pmu_init(void)
x86_pmu.lbr_pt_coexist = true;
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.cpu_events = glm_events_attrs;
+ extra_attr = slm_format_attr;
pr_cont("Goldmont events, ");
+ name = "goldmont";
break;
case INTEL_FAM6_ATOM_GEMINI_LAKE:
@@ -3991,7 +4047,9 @@ __init int intel_pmu_init(void)
x86_pmu.cpu_events = glm_events_attrs;
/* Goldmont Plus has 4-wide pipeline */
event_attr_td_total_slots_scale_glm.event_str = "4";
+ extra_attr = slm_format_attr;
pr_cont("Goldmont plus events, ");
+ name = "goldmont_plus";
break;
case INTEL_FAM6_WESTMERE:
@@ -4020,7 +4078,9 @@ __init int intel_pmu_init(void)
X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
intel_pmu_pebs_data_source_nhm();
+ extra_attr = nhm_format_attr;
pr_cont("Westmere events, ");
+ name = "westmere";
break;
case INTEL_FAM6_SANDYBRIDGE:
@@ -4056,7 +4116,10 @@ __init int intel_pmu_init(void)
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
+ extra_attr = nhm_format_attr;
+
pr_cont("SandyBridge events, ");
+ name = "sandybridge";
break;
case INTEL_FAM6_IVYBRIDGE:
@@ -4090,7 +4153,10 @@ __init int intel_pmu_init(void)
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+ extra_attr = nhm_format_attr;
+
pr_cont("IvyBridge events, ");
+ name = "ivybridge";
break;
@@ -4118,7 +4184,10 @@ __init int intel_pmu_init(void)
x86_pmu.get_event_constraints = hsw_get_event_constraints;
x86_pmu.cpu_events = hsw_events_attrs;
x86_pmu.lbr_double_abort = true;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
pr_cont("Haswell events, ");
+ name = "haswell";
break;
case INTEL_FAM6_BROADWELL_CORE:
@@ -4154,7 +4223,10 @@ __init int intel_pmu_init(void)
x86_pmu.get_event_constraints = hsw_get_event_constraints;
x86_pmu.cpu_events = hsw_events_attrs;
x86_pmu.limit_period = bdw_limit_period;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
pr_cont("Broadwell events, ");
+ name = "broadwell";
break;
case INTEL_FAM6_XEON_PHI_KNL:
@@ -4172,8 +4244,9 @@ __init int intel_pmu_init(void)
/* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
+ extra_attr = slm_format_attr;
pr_cont("Knights Landing/Mill events, ");
+ name = "knights-landing";
break;
case INTEL_FAM6_SKYLAKE_MOBILE:
@@ -4203,11 +4276,14 @@ __init int intel_pmu_init(void)
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
- x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
- skl_format_attr);
- WARN_ON(!x86_pmu.format_attrs);
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
+ extra_attr = merge_attr(extra_attr, skl_format_attr);
x86_pmu.cpu_events = hsw_events_attrs;
+ intel_pmu_pebs_data_source_skl(
+ boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
pr_cont("Skylake events, ");
+ name = "skylake";
break;
default:
@@ -4215,6 +4291,7 @@ __init int intel_pmu_init(void)
case 1:
x86_pmu.event_constraints = intel_v1_event_constraints;
pr_cont("generic architected perfmon v1, ");
+ name = "generic_arch_v1";
break;
default:
/*
@@ -4222,10 +4299,19 @@ __init int intel_pmu_init(void)
*/
x86_pmu.event_constraints = intel_gen_event_constraints;
pr_cont("generic architected perfmon, ");
+ name = "generic_arch_v2+";
break;
}
}
+ snprintf(pmu_name_str, sizeof pmu_name_str, "%s", name);
+
+ if (version >= 2 && extra_attr) {
+ x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
+ extra_attr);
+ WARN_ON(!x86_pmu.format_attrs);
+ }
+
if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
@@ -4272,8 +4358,13 @@ __init int intel_pmu_init(void)
x86_pmu.lbr_nr = 0;
}
- if (x86_pmu.lbr_nr)
+ x86_pmu.caps_attrs = intel_pmu_caps_attrs;
+
+ if (x86_pmu.lbr_nr) {
+ x86_pmu.caps_attrs = merge_attr(x86_pmu.caps_attrs, lbr_attrs);
pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
+ }
+
/*
* Access extra MSR may cause #GP under certain circumstances.
* E.g. KVM doesn't support offcore event
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index a322fed5f8ed..e1965e5ff570 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -49,34 +49,47 @@ union intel_x86_pebs_dse {
*/
#define P(a, b) PERF_MEM_S(a, b)
#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
+#define LEVEL(x) P(LVLNUM, x)
+#define REM P(REMOTE, REMOTE)
#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
/* Version for Sandy Bridge and later */
static u64 pebs_data_source[] = {
- P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
- OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */
- OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
- OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */
- OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */
- OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */
- OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */
- OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */
- OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */
- OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
- OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */
- OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */
- OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */
- OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */
- OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */
- OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
+ P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
+ OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 local */
+ OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
+ OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x03: L2 hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x04: L3 hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */
+ OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */
+ OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
+ OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */
+ OP_LH | P(LVL, REM_RAM1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */
+ OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | SNOOP_NONE_MISS, /* 0x0c: L3 miss, excl */
+ OP_LH | P(LVL, REM_RAM1) | LEVEL(RAM) | REM | SNOOP_NONE_MISS, /* 0x0d: L3 miss, excl */
+ OP_LH | P(LVL, IO) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0e: I/O */
+ OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */
};
/* Patch up minor differences in the bits */
void __init intel_pmu_pebs_data_source_nhm(void)
{
- pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT);
- pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM);
- pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM);
+ pebs_data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT);
+ pebs_data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+ pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+}
+
+void __init intel_pmu_pebs_data_source_skl(bool pmem)
+{
+ u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4);
+
+ pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
+ pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT);
+ pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
+ pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD);
+ pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM);
}
static u64 precise_store_data(u64 status)
@@ -149,8 +162,6 @@ static u64 load_latency_data(u64 status)
{
union intel_x86_pebs_dse dse;
u64 val;
- int model = boot_cpu_data.x86_model;
- int fam = boot_cpu_data.x86;
dse.val = status;
@@ -162,8 +173,7 @@ static u64 load_latency_data(u64 status)
/*
* Nehalem models do not support TLB, Lock infos
*/
- if (fam == 0x6 && (model == 26 || model == 30
- || model == 31 || model == 46)) {
+ if (x86_pmu.pebs_no_tlb) {
val |= P(TLB, NA) | P(LOCK, NA);
return val;
}
@@ -1175,7 +1185,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
else
regs->flags &= ~PERF_EFLAGS_EXACT;
- if ((sample_type & PERF_SAMPLE_ADDR) &&
+ if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) &&
x86_pmu.intel_cap.pebs_format >= 1)
data->addr = pebs->dla;
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 955457a30197..8a6bbacd17dc 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -109,6 +109,9 @@ enum {
X86_BR_ZERO_CALL = 1 << 15,/* zero length call */
X86_BR_CALL_STACK = 1 << 16,/* call stack */
X86_BR_IND_JMP = 1 << 17,/* indirect jump */
+
+ X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */
+
};
#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -514,6 +517,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
cpuc->lbr_entries[i].in_tx = 0;
cpuc->lbr_entries[i].abort = 0;
cpuc->lbr_entries[i].cycles = 0;
+ cpuc->lbr_entries[i].type = 0;
cpuc->lbr_entries[i].reserved = 0;
}
cpuc->lbr_stack.nr = i;
@@ -600,6 +604,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
cpuc->lbr_entries[out].in_tx = in_tx;
cpuc->lbr_entries[out].abort = abort;
cpuc->lbr_entries[out].cycles = cycles;
+ cpuc->lbr_entries[out].type = 0;
cpuc->lbr_entries[out].reserved = 0;
out++;
}
@@ -677,6 +682,10 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
if (br_type & PERF_SAMPLE_BRANCH_CALL)
mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE)
+ mask |= X86_BR_TYPE_SAVE;
+
/*
* stash actual user request into reg, it may
* be used by fixup code for some CPU
@@ -930,6 +939,43 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
return ret;
}
+#define X86_BR_TYPE_MAP_MAX 16
+
+static int branch_map[X86_BR_TYPE_MAP_MAX] = {
+ PERF_BR_CALL, /* X86_BR_CALL */
+ PERF_BR_RET, /* X86_BR_RET */
+ PERF_BR_SYSCALL, /* X86_BR_SYSCALL */
+ PERF_BR_SYSRET, /* X86_BR_SYSRET */
+ PERF_BR_UNKNOWN, /* X86_BR_INT */
+ PERF_BR_UNKNOWN, /* X86_BR_IRET */
+ PERF_BR_COND, /* X86_BR_JCC */
+ PERF_BR_UNCOND, /* X86_BR_JMP */
+ PERF_BR_UNKNOWN, /* X86_BR_IRQ */
+ PERF_BR_IND_CALL, /* X86_BR_IND_CALL */
+ PERF_BR_UNKNOWN, /* X86_BR_ABORT */
+ PERF_BR_UNKNOWN, /* X86_BR_IN_TX */
+ PERF_BR_UNKNOWN, /* X86_BR_NO_TX */
+ PERF_BR_CALL, /* X86_BR_ZERO_CALL */
+ PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */
+ PERF_BR_IND, /* X86_BR_IND_JMP */
+};
+
+static int
+common_branch_type(int type)
+{
+ int i;
+
+ type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
+
+ if (type) {
+ i = __ffs(type);
+ if (i < X86_BR_TYPE_MAP_MAX)
+ return branch_map[i];
+ }
+
+ return PERF_BR_UNKNOWN;
+}
+
/*
* implement actual branch filter based on user demand.
* Hardware may not exactly satisfy that request, thus
@@ -946,7 +992,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
bool compress = false;
/* if sampling all branches, then nothing to filter */
- if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
+ if (((br_sel & X86_BR_ALL) == X86_BR_ALL) &&
+ ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
return;
for (i = 0; i < cpuc->lbr_stack.nr; i++) {
@@ -967,6 +1014,9 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
cpuc->lbr_entries[i].from = 0;
compress = true;
}
+
+ if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE)
+ cpuc->lbr_entries[i].type = common_branch_type(type);
}
if (!compress)
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index eb0533558c2b..d32c0eed38ca 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -587,7 +587,7 @@ static __initconst const u64 p4_hw_cache_event_ids
* P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are
* either up to date automatically or not applicable at all.
*/
-struct p4_event_alias {
+static struct p4_event_alias {
u64 original;
u64 alternative;
} p4_event_aliases[] = {
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index ae8324d65e61..81fd41d5a0d9 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -471,8 +471,9 @@ static void pt_config(struct perf_event *event)
struct pt *pt = this_cpu_ptr(&pt_ctx);
u64 reg;
- if (!event->hw.itrace_started) {
- event->hw.itrace_started = 1;
+ /* First round: clear STATUS, in particular the PSB byte counter. */
+ if (!event->hw.config) {
+ perf_event_itrace_started(event);
wrmsrl(MSR_IA32_RTIT_STATUS, 0);
}
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index a45e2114a846..8e2457cb6b4a 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -559,7 +559,7 @@ static struct attribute_group rapl_pmu_format_group = {
.attrs = rapl_formats_attr,
};
-const struct attribute_group *rapl_attr_groups[] = {
+static const struct attribute_group *rapl_attr_groups[] = {
&rapl_pmu_attr_group,
&rapl_pmu_format_group,
&rapl_pmu_events_group,
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 44ec523287f6..1c5390f1cf09 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -721,7 +721,7 @@ static struct attribute *uncore_pmu_attrs[] = {
NULL,
};
-static struct attribute_group uncore_pmu_attr_group = {
+static const struct attribute_group uncore_pmu_attr_group = {
.attrs = uncore_pmu_attrs,
};
diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c
index cda569332005..6a5cbe90f859 100644
--- a/arch/x86/events/intel/uncore_nhmex.c
+++ b/arch/x86/events/intel/uncore_nhmex.c
@@ -272,7 +272,7 @@ static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
NULL,
};
-static struct attribute_group nhmex_uncore_ubox_format_group = {
+static const struct attribute_group nhmex_uncore_ubox_format_group = {
.name = "format",
.attrs = nhmex_uncore_ubox_formats_attr,
};
@@ -299,7 +299,7 @@ static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
NULL,
};
-static struct attribute_group nhmex_uncore_cbox_format_group = {
+static const struct attribute_group nhmex_uncore_cbox_format_group = {
.name = "format",
.attrs = nhmex_uncore_cbox_formats_attr,
};
@@ -407,7 +407,7 @@ static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
NULL,
};
-static struct attribute_group nhmex_uncore_bbox_format_group = {
+static const struct attribute_group nhmex_uncore_bbox_format_group = {
.name = "format",
.attrs = nhmex_uncore_bbox_formats_attr,
};
@@ -484,7 +484,7 @@ static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
NULL,
};
-static struct attribute_group nhmex_uncore_sbox_format_group = {
+static const struct attribute_group nhmex_uncore_sbox_format_group = {
.name = "format",
.attrs = nhmex_uncore_sbox_formats_attr,
};
@@ -898,7 +898,7 @@ static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
NULL,
};
-static struct attribute_group nhmex_uncore_mbox_format_group = {
+static const struct attribute_group nhmex_uncore_mbox_format_group = {
.name = "format",
.attrs = nhmex_uncore_mbox_formats_attr,
};
@@ -1163,7 +1163,7 @@ static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
NULL,
};
-static struct attribute_group nhmex_uncore_rbox_format_group = {
+static const struct attribute_group nhmex_uncore_rbox_format_group = {
.name = "format",
.attrs = nhmex_uncore_rbox_formats_attr,
};
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index a3dcc12bef4a..db1127ce685e 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -130,7 +130,7 @@ static struct attribute *snb_uncore_formats_attr[] = {
NULL,
};
-static struct attribute_group snb_uncore_format_group = {
+static const struct attribute_group snb_uncore_format_group = {
.name = "format",
.attrs = snb_uncore_formats_attr,
};
@@ -289,7 +289,7 @@ static struct attribute *snb_uncore_imc_formats_attr[] = {
NULL,
};
-static struct attribute_group snb_uncore_imc_format_group = {
+static const struct attribute_group snb_uncore_imc_format_group = {
.name = "format",
.attrs = snb_uncore_imc_formats_attr,
};
@@ -769,7 +769,7 @@ static struct attribute *nhm_uncore_formats_attr[] = {
NULL,
};
-static struct attribute_group nhm_uncore_format_group = {
+static const struct attribute_group nhm_uncore_format_group = {
.name = "format",
.attrs = nhm_uncore_formats_attr,
};
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index dae2fedc1601..db1fe377e6dd 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -316,7 +316,7 @@
#define SKX_UPI_PCI_PMON_CTL0 0x350
#define SKX_UPI_PCI_PMON_CTR0 0x318
#define SKX_UPI_PCI_PMON_BOX_CTL 0x378
-#define SKX_PMON_CTL_UMASK_EXT 0xff
+#define SKX_UPI_CTL_UMASK_EXT 0xffefff
/* SKX M2M */
#define SKX_M2M_PCI_PMON_CTL0 0x228
@@ -328,7 +328,7 @@ DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-39");
+DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-43,45-55");
DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
@@ -351,7 +351,6 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
-DEFINE_UNCORE_FORMAT_ATTR(filter_link4, filter_link, "config1:9-12");
DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
@@ -603,27 +602,27 @@ static struct uncore_event_desc snbep_uncore_qpi_events[] = {
{ /* end: all zeroes */ },
};
-static struct attribute_group snbep_uncore_format_group = {
+static const struct attribute_group snbep_uncore_format_group = {
.name = "format",
.attrs = snbep_uncore_formats_attr,
};
-static struct attribute_group snbep_uncore_ubox_format_group = {
+static const struct attribute_group snbep_uncore_ubox_format_group = {
.name = "format",
.attrs = snbep_uncore_ubox_formats_attr,
};
-static struct attribute_group snbep_uncore_cbox_format_group = {
+static const struct attribute_group snbep_uncore_cbox_format_group = {
.name = "format",
.attrs = snbep_uncore_cbox_formats_attr,
};
-static struct attribute_group snbep_uncore_pcu_format_group = {
+static const struct attribute_group snbep_uncore_pcu_format_group = {
.name = "format",
.attrs = snbep_uncore_pcu_formats_attr,
};
-static struct attribute_group snbep_uncore_qpi_format_group = {
+static const struct attribute_group snbep_uncore_qpi_format_group = {
.name = "format",
.attrs = snbep_uncore_qpi_formats_attr,
};
@@ -1432,27 +1431,27 @@ static struct attribute *ivbep_uncore_qpi_formats_attr[] = {
NULL,
};
-static struct attribute_group ivbep_uncore_format_group = {
+static const struct attribute_group ivbep_uncore_format_group = {
.name = "format",
.attrs = ivbep_uncore_formats_attr,
};
-static struct attribute_group ivbep_uncore_ubox_format_group = {
+static const struct attribute_group ivbep_uncore_ubox_format_group = {
.name = "format",
.attrs = ivbep_uncore_ubox_formats_attr,
};
-static struct attribute_group ivbep_uncore_cbox_format_group = {
+static const struct attribute_group ivbep_uncore_cbox_format_group = {
.name = "format",
.attrs = ivbep_uncore_cbox_formats_attr,
};
-static struct attribute_group ivbep_uncore_pcu_format_group = {
+static const struct attribute_group ivbep_uncore_pcu_format_group = {
.name = "format",
.attrs = ivbep_uncore_pcu_formats_attr,
};
-static struct attribute_group ivbep_uncore_qpi_format_group = {
+static const struct attribute_group ivbep_uncore_qpi_format_group = {
.name = "format",
.attrs = ivbep_uncore_qpi_formats_attr,
};
@@ -1888,7 +1887,7 @@ static struct attribute *knl_uncore_ubox_formats_attr[] = {
NULL,
};
-static struct attribute_group knl_uncore_ubox_format_group = {
+static const struct attribute_group knl_uncore_ubox_format_group = {
.name = "format",
.attrs = knl_uncore_ubox_formats_attr,
};
@@ -1928,7 +1927,7 @@ static struct attribute *knl_uncore_cha_formats_attr[] = {
NULL,
};
-static struct attribute_group knl_uncore_cha_format_group = {
+static const struct attribute_group knl_uncore_cha_format_group = {
.name = "format",
.attrs = knl_uncore_cha_formats_attr,
};
@@ -2038,7 +2037,7 @@ static struct attribute *knl_uncore_pcu_formats_attr[] = {
NULL,
};
-static struct attribute_group knl_uncore_pcu_format_group = {
+static const struct attribute_group knl_uncore_pcu_format_group = {
.name = "format",
.attrs = knl_uncore_pcu_formats_attr,
};
@@ -2188,7 +2187,7 @@ static struct attribute *knl_uncore_irp_formats_attr[] = {
NULL,
};
-static struct attribute_group knl_uncore_irp_format_group = {
+static const struct attribute_group knl_uncore_irp_format_group = {
.name = "format",
.attrs = knl_uncore_irp_formats_attr,
};
@@ -2386,7 +2385,7 @@ static struct attribute *hswep_uncore_ubox_formats_attr[] = {
NULL,
};
-static struct attribute_group hswep_uncore_ubox_format_group = {
+static const struct attribute_group hswep_uncore_ubox_format_group = {
.name = "format",
.attrs = hswep_uncore_ubox_formats_attr,
};
@@ -2440,7 +2439,7 @@ static struct attribute *hswep_uncore_cbox_formats_attr[] = {
NULL,
};
-static struct attribute_group hswep_uncore_cbox_format_group = {
+static const struct attribute_group hswep_uncore_cbox_format_group = {
.name = "format",
.attrs = hswep_uncore_cbox_formats_attr,
};
@@ -2622,7 +2621,7 @@ static struct attribute *hswep_uncore_sbox_formats_attr[] = {
NULL,
};
-static struct attribute_group hswep_uncore_sbox_format_group = {
+static const struct attribute_group hswep_uncore_sbox_format_group = {
.name = "format",
.attrs = hswep_uncore_sbox_formats_attr,
};
@@ -3302,7 +3301,6 @@ static struct attribute *skx_uncore_cha_formats_attr[] = {
&format_attr_inv.attr,
&format_attr_thresh8.attr,
&format_attr_filter_tid4.attr,
- &format_attr_filter_link4.attr,
&format_attr_filter_state5.attr,
&format_attr_filter_rem.attr,
&format_attr_filter_loc.attr,
@@ -3312,12 +3310,11 @@ static struct attribute *skx_uncore_cha_formats_attr[] = {
&format_attr_filter_opc_0.attr,
&format_attr_filter_opc_1.attr,
&format_attr_filter_nc.attr,
- &format_attr_filter_c6.attr,
&format_attr_filter_isoc.attr,
NULL,
};
-static struct attribute_group skx_uncore_chabox_format_group = {
+static const struct attribute_group skx_uncore_chabox_format_group = {
.name = "format",
.attrs = skx_uncore_cha_formats_attr,
};
@@ -3333,8 +3330,11 @@ static struct extra_reg skx_uncore_cha_extra_regs[] = {
SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x8134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x3134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x9134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x8),
+ EVENT_EXTRA_END
};
static u64 skx_cha_filter_mask(int fields)
@@ -3347,6 +3347,17 @@ static u64 skx_cha_filter_mask(int fields)
mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LINK;
if (fields & 0x4)
mask |= SKX_CHA_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x8) {
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_REM;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LOC;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_ALL_OPC;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NM;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NOT_NM;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_OPC0;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_OPC1;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NC;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_ISOC;
+ }
return mask;
}
@@ -3416,7 +3427,7 @@ static struct attribute *skx_uncore_iio_formats_attr[] = {
NULL,
};
-static struct attribute_group skx_uncore_iio_format_group = {
+static const struct attribute_group skx_uncore_iio_format_group = {
.name = "format",
.attrs = skx_uncore_iio_formats_attr,
};
@@ -3473,7 +3484,7 @@ static struct attribute *skx_uncore_formats_attr[] = {
NULL,
};
-static struct attribute_group skx_uncore_format_group = {
+static const struct attribute_group skx_uncore_format_group = {
.name = "format",
.attrs = skx_uncore_formats_attr,
};
@@ -3492,6 +3503,26 @@ static struct intel_uncore_type skx_uncore_irp = {
.format_group = &skx_uncore_format_group,
};
+static struct attribute *skx_uncore_pcu_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge_det.attr,
+ &format_attr_filter_band0.attr,
+ &format_attr_filter_band1.attr,
+ &format_attr_filter_band2.attr,
+ &format_attr_filter_band3.attr,
+ NULL,
+};
+
+static struct attribute_group skx_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_pcu_formats_attr,
+};
+
static struct intel_uncore_ops skx_uncore_pcu_ops = {
IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
.hw_config = hswep_pcu_hw_config,
@@ -3510,7 +3541,7 @@ static struct intel_uncore_type skx_uncore_pcu = {
.box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL,
.num_shared_regs = 1,
.ops = &skx_uncore_pcu_ops,
- .format_group = &snbep_uncore_pcu_format_group,
+ .format_group = &skx_uncore_pcu_format_group,
};
static struct intel_uncore_type *skx_msr_uncores[] = {
@@ -3574,7 +3605,7 @@ static struct attribute *skx_upi_uncore_formats_attr[] = {
NULL,
};
-static struct attribute_group skx_upi_uncore_format_group = {
+static const struct attribute_group skx_upi_uncore_format_group = {
.name = "format",
.attrs = skx_upi_uncore_formats_attr,
};
@@ -3603,8 +3634,8 @@ static struct intel_uncore_type skx_uncore_upi = {
.perf_ctr_bits = 48,
.perf_ctr = SKX_UPI_PCI_PMON_CTR0,
.event_ctl = SKX_UPI_PCI_PMON_CTL0,
- .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
- .event_mask_ext = SKX_PMON_CTL_UMASK_EXT,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SKX_UPI_CTL_UMASK_EXT,
.box_ctl = SKX_UPI_PCI_PMON_BOX_CTL,
.ops = &skx_upi_uncore_pci_ops,
.format_group = &skx_upi_uncore_format_group,
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 476aec3a4cab..4196f81ec0e1 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -91,7 +91,7 @@ struct amd_nb {
(PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
- PERF_SAMPLE_TRANSACTION)
+ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
/*
* A debug store configuration.
@@ -558,6 +558,7 @@ struct x86_pmu {
int attr_rdpmc;
struct attribute **format_attrs;
struct attribute **event_attrs;
+ struct attribute **caps_attrs;
ssize_t (*events_sysfs_show)(char *page, u64 config);
struct attribute **cpu_events;
@@ -591,7 +592,8 @@ struct x86_pmu {
pebs :1,
pebs_active :1,
pebs_broken :1,
- pebs_prec_dist :1;
+ pebs_prec_dist :1,
+ pebs_no_tlb :1;
int pebs_record_size;
int pebs_buffer_size;
void (*drain_pebs)(struct pt_regs *regs);
@@ -741,6 +743,8 @@ int x86_reserve_hardware(void);
void x86_release_hardware(void);
+int x86_pmu_max_precise(void);
+
void hw_perf_lbr_event_destroy(struct perf_event *event);
int x86_setup_perfctr(struct perf_event *event);
@@ -947,6 +951,8 @@ void intel_pmu_lbr_init_knl(void);
void intel_pmu_pebs_data_source_nhm(void);
+void intel_pmu_pebs_data_source_skl(bool pmem);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index ca3c48c0872f..8ea315a11fe0 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -177,7 +177,7 @@
#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */
#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */
-#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */
+#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */
#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
/*
@@ -286,7 +286,7 @@
#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
-#define X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE (15*32+15) /* Virtual VMLOAD VMSAVE */
+#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 1c18d83d3f09..9aeb91935ce0 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -247,11 +247,11 @@ extern int force_personality32;
/*
* This is the base location for PIE (ET_DYN with INTERP) loads. On
- * 64-bit, this is raised to 4GB to leave the entire 32-bit address
+ * 64-bit, this is above 4GB to leave the entire 32-bit address
* space open for things that want to use the area for 32-bit pointers.
*/
#define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \
- 0x100000000UL)
+ (TASK_SIZE / 3 * 2))
/* This yields a mask that user programs can use to figure out what
instruction set this CPU supports. This could be done in user space,
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index df002992d8fd..07b06955a05d 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -25,6 +25,8 @@ BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR,
smp_kvm_posted_intr_ipi)
BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR,
smp_kvm_posted_intr_wakeup_ipi)
+BUILD_INTERRUPT3(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR,
+ smp_kvm_posted_intr_nested_ipi)
#endif
/*
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 255645f60ca2..554cdb205d17 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -450,10 +450,10 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
return 0;
}
-static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate)
+static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask)
{
if (use_xsave()) {
- copy_kernel_to_xregs(&fpstate->xsave, -1);
+ copy_kernel_to_xregs(&fpstate->xsave, mask);
} else {
if (use_fxsr())
copy_kernel_to_fxregs(&fpstate->fxsave);
@@ -477,7 +477,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
: : [addr] "m" (fpstate));
}
- __copy_kernel_to_fpregs(fpstate);
+ __copy_kernel_to_fpregs(fpstate, -1);
}
extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 9b76cd331990..ad1ed531febc 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -15,6 +15,7 @@ typedef struct {
#ifdef CONFIG_HAVE_KVM
unsigned int kvm_posted_intr_ipis;
unsigned int kvm_posted_intr_wakeup_ipis;
+ unsigned int kvm_posted_intr_nested_ipis;
#endif
unsigned int x86_platform_ipis; /* arch dependent */
unsigned int apic_perf_irqs;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b90e1053049b..d6dbafbd4207 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -30,6 +30,7 @@ extern asmlinkage void apic_timer_interrupt(void);
extern asmlinkage void x86_platform_ipi(void);
extern asmlinkage void kvm_posted_intr_ipi(void);
extern asmlinkage void kvm_posted_intr_wakeup_ipi(void);
+extern asmlinkage void kvm_posted_intr_nested_ipi(void);
extern asmlinkage void error_interrupt(void);
extern asmlinkage void irq_work_interrupt(void);
@@ -62,6 +63,7 @@ extern void trace_call_function_single_interrupt(void);
#define trace_reboot_interrupt reboot_interrupt
#define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi
#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi
+#define trace_kvm_posted_intr_nested_ipi kvm_posted_intr_nested_ipi
#endif /* CONFIG_TRACING */
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 21126155a739..0ead9dbb9130 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -43,6 +43,9 @@ struct hypervisor_x86 {
/* pin current vcpu to specified physical cpu (run rarely) */
void (*pin_vcpu)(int);
+
+ /* called during init_mem_mapping() to setup early mappings. */
+ void (*init_mem_mapping)(void);
};
extern const struct hypervisor_x86 *x86_hyper;
@@ -57,8 +60,15 @@ extern const struct hypervisor_x86 x86_hyper_kvm;
extern void init_hypervisor_platform(void);
extern bool hypervisor_x2apic_available(void);
extern void hypervisor_pin_vcpu(int cpu);
+
+static inline void hypervisor_init_mem_mapping(void)
+{
+ if (x86_hyper && x86_hyper->init_mem_mapping)
+ x86_hyper->init_mem_mapping();
+}
#else
static inline void init_hypervisor_platform(void) { }
static inline bool hypervisor_x2apic_available(void) { return false; }
+static inline void hypervisor_init_mem_mapping(void) { }
#endif /* CONFIG_HYPERVISOR_GUEST */
#endif /* _ASM_X86_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6ca9fd6234e1..aaf8d28b5d00 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -83,7 +83,6 @@
*/
#define X86_PLATFORM_IPI_VECTOR 0xf7
-#define POSTED_INTR_WAKEUP_VECTOR 0xf1
/*
* IRQ work vector:
*/
@@ -98,6 +97,8 @@
/* Vector for KVM to deliver posted interrupt IPI */
#ifdef CONFIG_HAVE_KVM
#define POSTED_INTR_VECTOR 0xf2
+#define POSTED_INTR_WAKEUP_VECTOR 0xf1
+#define POSTED_INTR_NESTED_VECTOR 0xf0
#endif
/*
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 87ac4fba6d8e..92c9032502d8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -492,6 +492,7 @@ struct kvm_vcpu_arch {
unsigned long cr4;
unsigned long cr4_guest_owned_bits;
unsigned long cr8;
+ u32 pkru;
u32 hflags;
u64 efer;
u64 apic_base;
@@ -1374,8 +1375,6 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
-void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
- unsigned long address);
void kvm_define_shared_msr(unsigned index, u32 msr);
int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 265c907d7d4c..7a234be7e298 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -140,9 +140,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.execute_only_pkey = -1;
}
#endif
- init_new_context_ldt(tsk, mm);
-
- return 0;
+ return init_new_context_ldt(tsk, mm);
}
static inline void destroy_context(struct mm_struct *mm)
{
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index d869c8671e36..0ee83321a313 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -8,20 +8,25 @@
* This file is licensed under GPLv2.
*/
-#include <linux/jiffies.h>
+#include <linux/delay.h>
+#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/smp.h>
struct aperfmperf_sample {
unsigned int khz;
- unsigned long jiffies;
+ ktime_t time;
u64 aperf;
u64 mperf;
};
static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
+#define APERFMPERF_CACHE_THRESHOLD_MS 10
+#define APERFMPERF_REFRESH_DELAY_MS 20
+#define APERFMPERF_STALE_THRESHOLD_MS 1000
+
/*
* aperfmperf_snapshot_khz()
* On the current CPU, snapshot APERF, MPERF, and jiffies
@@ -33,13 +38,18 @@ static void aperfmperf_snapshot_khz(void *dummy)
u64 aperf, aperf_delta;
u64 mperf, mperf_delta;
struct aperfmperf_sample *s = this_cpu_ptr(&samples);
+ ktime_t now = ktime_get();
+ s64 time_delta = ktime_ms_delta(now, s->time);
+ unsigned long flags;
- /* Don't bother re-computing within 10 ms */
- if (time_before(jiffies, s->jiffies + HZ/100))
+ /* Don't bother re-computing within the cache threshold time. */
+ if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
return;
+ local_irq_save(flags);
rdmsrl(MSR_IA32_APERF, aperf);
rdmsrl(MSR_IA32_MPERF, mperf);
+ local_irq_restore(flags);
aperf_delta = aperf - s->aperf;
mperf_delta = mperf - s->mperf;
@@ -51,22 +61,21 @@ static void aperfmperf_snapshot_khz(void *dummy)
if (mperf_delta == 0)
return;
- /*
- * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then
- * khz = (cpu_khz * aperf_delta) / mperf_delta
- */
- if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta)
- s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
- else /* khz = aperf_delta / (mperf_delta / cpu_khz) */
- s->khz = div64_u64(aperf_delta,
- div64_u64(mperf_delta, cpu_khz));
- s->jiffies = jiffies;
+ s->time = now;
s->aperf = aperf;
s->mperf = mperf;
+
+ /* If the previous iteration was too long ago, discard it. */
+ if (time_delta > APERFMPERF_STALE_THRESHOLD_MS)
+ s->khz = 0;
+ else
+ s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
}
unsigned int arch_freq_get_on_cpu(int cpu)
{
+ unsigned int khz;
+
if (!cpu_khz)
return 0;
@@ -74,6 +83,12 @@ unsigned int arch_freq_get_on_cpu(int cpu)
return 0;
smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
+ khz = per_cpu(samples.khz, cpu);
+ if (khz)
+ return khz;
+
+ msleep(APERFMPERF_REFRESH_DELAY_MS);
+ smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
return per_cpu(samples.khz, cpu);
}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index d7cc190ae457..f7370abd33c6 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -122,7 +122,7 @@ static struct attribute *thermal_throttle_attrs[] = {
NULL
};
-static struct attribute_group thermal_attr_group = {
+static const struct attribute_group thermal_attr_group = {
.attrs = thermal_throttle_attrs,
.name = "thermal_throttle"
};
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 9cb98ee103db..86e8f0b2537b 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -561,7 +561,7 @@ static struct attribute *mc_default_attrs[] = {
NULL
};
-static struct attribute_group mc_attr_group = {
+static const struct attribute_group mc_attr_group = {
.attrs = mc_default_attrs,
.name = "microcode",
};
@@ -707,7 +707,7 @@ static struct attribute *cpu_root_microcode_attrs[] = {
NULL
};
-static struct attribute_group cpu_root_microcode_group = {
+static const struct attribute_group cpu_root_microcode_group = {
.name = "microcode",
.attrs = cpu_root_microcode_attrs,
};
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index c5bb63be4ba1..40d5a8a75212 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -237,6 +237,18 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
}
+static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type)
+{
+ struct set_mtrr_data data = { .smp_reg = reg,
+ .smp_base = base,
+ .smp_size = size,
+ .smp_type = type
+ };
+
+ stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask);
+}
+
static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type)
{
@@ -370,7 +382,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
/* Search for an empty MTRR */
i = mtrr_if->get_free_region(base, size, replace);
if (i >= 0) {
- set_mtrr(i, base, size, type);
+ set_mtrr_cpuslocked(i, base, size, type);
if (likely(replace < 0)) {
mtrr_usage_table[i] = 1;
} else {
@@ -378,7 +390,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
if (increment)
mtrr_usage_table[i]++;
if (unlikely(replace != i)) {
- set_mtrr(replace, 0, 0, 0);
+ set_mtrr_cpuslocked(replace, 0, 0, 0);
mtrr_usage_table[replace] = 0;
}
}
@@ -506,7 +518,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
goto out;
}
if (--mtrr_usage_table[reg] < 1)
- set_mtrr(reg, 0, 0, 0);
+ set_mtrr_cpuslocked(reg, 0, 0, 0);
error = reg;
out:
mutex_unlock(&mtrr_mutex);
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index d907c3d8633f..a4516ca4c4f3 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -527,6 +527,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_BXT_IDS(&gen9_early_ops),
INTEL_KBL_IDS(&gen9_early_ops),
INTEL_GLK_IDS(&gen9_early_ops),
+ INTEL_CNL_IDS(&gen9_early_ops),
};
static void __init
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 46c3c73e7f43..9ba79543d9ee 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -53,6 +53,7 @@ void __head __startup_64(unsigned long physaddr)
pudval_t *pud;
pmdval_t *pmd, pmd_entry;
int i;
+ unsigned int *next_pgt_ptr;
/* Is the address too large? */
if (physaddr >> MAX_PHYSMEM_BITS)
@@ -91,9 +92,9 @@ void __head __startup_64(unsigned long physaddr)
* creates a bunch of nonsense entries but that is fine --
* it avoids problems around wraparound.
*/
-
- pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
- pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+ next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);
+ pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
+ pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 16f82a3aaec7..8ce4212e2b8d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -345,21 +345,10 @@ static int hpet_shutdown(struct clock_event_device *evt, int timer)
return 0;
}
-static int hpet_resume(struct clock_event_device *evt, int timer)
-{
- if (!timer) {
- hpet_enable_legacy_int();
- } else {
- struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
-
- irq_domain_deactivate_irq(irq_get_irq_data(hdev->irq));
- irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
- disable_hardirq(hdev->irq);
- irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
- enable_irq(hdev->irq);
- }
+static int hpet_resume(struct clock_event_device *evt)
+{
+ hpet_enable_legacy_int();
hpet_print_config();
-
return 0;
}
@@ -417,7 +406,7 @@ static int hpet_legacy_set_periodic(struct clock_event_device *evt)
static int hpet_legacy_resume(struct clock_event_device *evt)
{
- return hpet_resume(evt, 0);
+ return hpet_resume(evt);
}
static int hpet_legacy_next_event(unsigned long delta,
@@ -510,8 +499,14 @@ static int hpet_msi_set_periodic(struct clock_event_device *evt)
static int hpet_msi_resume(struct clock_event_device *evt)
{
struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+ struct irq_data *data = irq_get_irq_data(hdev->irq);
+ struct msi_msg msg;
- return hpet_resume(evt, hdev->num);
+ /* Restore the MSI msg and unmask the interrupt */
+ irq_chip_compose_msi_msg(data, &msg);
+ hpet_msi_write(hdev, &msg);
+ hpet_msi_unmask(data);
+ return 0;
}
static int hpet_msi_next_event(unsigned long delta,
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 4aa03c5a14c9..4ed0aba8dbc8 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -155,6 +155,12 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis);
seq_puts(p, " Posted-interrupt notification event\n");
+ seq_printf(p, "%*s: ", prec, "NPI");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->kvm_posted_intr_nested_ipis);
+ seq_puts(p, " Nested posted-interrupt event\n");
+
seq_printf(p, "%*s: ", prec, "PIW");
for_each_online_cpu(j)
seq_printf(p, "%10u ",
@@ -313,6 +319,19 @@ __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
exiting_irq();
set_irq_regs(old_regs);
}
+
+/*
+ * Handler for POSTED_INTERRUPT_NESTED_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ entering_ack_irq();
+ inc_irq_stat(kvm_posted_intr_nested_ipis);
+ exiting_irq();
+ set_irq_regs(old_regs);
+}
#endif
__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs)
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 7468c6987547..c7fd18526c3e 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -150,6 +150,8 @@ static void __init apic_intr_init(void)
alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
/* IPI for KVM to deliver interrupt to wake up tasks */
alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi);
+ /* IPI for KVM to deliver nested posted interrupt */
+ alloc_intr_gate(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi);
#endif
/* IPI vectors for APIC spurious and error interrupts */
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 6b877807598b..f0153714ddac 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -457,6 +457,8 @@ static int arch_copy_kprobe(struct kprobe *p)
int arch_prepare_kprobe(struct kprobe *p)
{
+ int ret;
+
if (alternatives_text_reserved(p->addr, p->addr))
return -EINVAL;
@@ -467,7 +469,13 @@ int arch_prepare_kprobe(struct kprobe *p)
if (!p->ainsn.insn)
return -ENOMEM;
- return arch_copy_kprobe(p);
+ ret = arch_copy_kprobe(p);
+ if (ret) {
+ free_insn_slot(p->ainsn.insn, 0);
+ p->ainsn.insn = NULL;
+ }
+
+ return ret;
}
void arch_arm_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 69ea0bc1cfa3..4f98aad38237 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -39,6 +39,7 @@
#include <asm/insn.h>
#include <asm/debugreg.h>
#include <asm/set_memory.h>
+#include <asm/sections.h>
#include "common.h"
@@ -251,10 +252,12 @@ static int can_optimize(unsigned long paddr)
/*
* Do not optimize in the entry code due to the unstable
- * stack handling.
+ * stack handling and registers setup.
*/
- if ((paddr >= (unsigned long)__entry_text_start) &&
- (paddr < (unsigned long)__entry_text_end))
+ if (((paddr >= (unsigned long)__entry_text_start) &&
+ (paddr < (unsigned long)__entry_text_end)) ||
+ ((paddr >= (unsigned long)__irqentry_text_start) &&
+ (paddr < (unsigned long)__irqentry_text_end)))
return 0;
/* Check there is enough space for a relative jump. */
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index 4afc67f5facc..06e1ff5562c0 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -55,7 +55,7 @@ static struct bin_attribute *boot_params_data_attrs[] = {
NULL,
};
-static struct attribute_group boot_params_attr_group = {
+static const struct attribute_group boot_params_attr_group = {
.attrs = boot_params_version_attrs,
.bin_attrs = boot_params_data_attrs,
};
@@ -202,7 +202,7 @@ static struct bin_attribute *setup_data_data_attrs[] = {
NULL,
};
-static struct attribute_group setup_data_attr_group = {
+static const struct attribute_group setup_data_attr_group = {
.attrs = setup_data_type_attrs,
.bin_attrs = setup_data_data_attrs,
};
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 71c17a5be983..d04e30e3c0ff 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -151,6 +151,8 @@ void kvm_async_pf_task_wait(u32 token)
if (hlist_unhashed(&n.link))
break;
+ rcu_irq_exit();
+
if (!n.halted) {
local_irq_enable();
schedule();
@@ -159,11 +161,11 @@ void kvm_async_pf_task_wait(u32 token)
/*
* We cannot reschedule. So halt.
*/
- rcu_irq_exit();
native_safe_halt();
local_irq_disable();
- rcu_irq_enter();
}
+
+ rcu_irq_enter();
}
if (!n.halted)
finish_swait(&n.wq, &wait);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 67393fc88353..a56bf6051f4e 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -471,12 +471,12 @@ static int __init reboot_init(void)
/*
* The DMI quirks table takes precedence. If no quirks entry
- * matches and the ACPI Hardware Reduced bit is set, force EFI
- * reboot.
+ * matches and the ACPI Hardware Reduced bit is set and EFI
+ * runtime services are enabled, force EFI reboot.
*/
rv = dmi_check_system(reboot_dmi_table);
- if (!rv && efi_reboot_required())
+ if (!rv && efi_reboot_required() && !efi_runtime_disabled())
reboot_type = BOOT_EFI;
return 0;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b474c8de7fba..54b9e89d4d6b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -971,7 +971,8 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
* Returns zero if CPU booted OK, else error code from
* ->wakeup_secondary_cpu.
*/
-static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
+static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
+ int *cpu0_nmi_registered)
{
volatile u32 *trampoline_status =
(volatile u32 *) __va(real_mode_header->trampoline_status);
@@ -979,7 +980,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
unsigned long start_ip = real_mode_header->trampoline_start;
unsigned long boot_error = 0;
- int cpu0_nmi_registered = 0;
unsigned long timeout;
idle->thread.sp = (unsigned long)task_pt_regs(idle);
@@ -1035,7 +1035,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
else
boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
- &cpu0_nmi_registered);
+ cpu0_nmi_registered);
if (!boot_error) {
/*
@@ -1080,12 +1080,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
*/
smpboot_restore_warm_reset_vector();
}
- /*
- * Clean up the nmi handler. Do this after the callin and callout sync
- * to avoid impact of possible long unregister time.
- */
- if (cpu0_nmi_registered)
- unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
return boot_error;
}
@@ -1093,8 +1087,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
{
int apicid = apic->cpu_present_to_apicid(cpu);
+ int cpu0_nmi_registered = 0;
unsigned long flags;
- int err;
+ int err, ret = 0;
WARN_ON(irqs_disabled());
@@ -1131,10 +1126,11 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
common_cpu_up(cpu, tidle);
- err = do_boot_cpu(apicid, cpu, tidle);
+ err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered);
if (err) {
pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
- return -EIO;
+ ret = -EIO;
+ goto unreg_nmi;
}
/*
@@ -1150,7 +1146,15 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
touch_nmi_watchdog();
}
- return 0;
+unreg_nmi:
+ /*
+ * Clean up the nmi handler. Do this after the callin and callout sync
+ * to avoid impact of possible long unregister time.
+ */
+ if (cpu0_nmi_registered)
+ unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
+
+ return ret;
}
/**
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index b9389d72b2f7..c29e5bc7e9c9 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -91,10 +91,8 @@ static bool in_entry_code(unsigned long ip)
if (addr >= __entry_text_start && addr < __entry_text_end)
return true;
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
if (addr >= __irqentry_text_start && addr < __irqentry_text_end)
return true;
-#endif
return false;
}
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 59ca2eea522c..19adbb418443 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -469,7 +469,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
cpuid_mask(&entry->ecx, CPUID_7_ECX);
/* PKU is not yet implemented for shadow paging. */
- if (!tdp_enabled)
+ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
entry->ecx &= ~F(PKU);
entry->edx &= kvm_cpuid_7_0_edx_x86_features;
entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 762cdf2595f9..e1e89ee4af75 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -84,11 +84,6 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
}
-static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu)
-{
- return kvm_x86_ops->get_pkru(vcpu);
-}
-
static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
{
vcpu->arch.hflags |= HF_GUEST_MASK;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2819d4c123eb..589dcc117086 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1495,11 +1495,10 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
static void cancel_hv_timer(struct kvm_lapic *apic)
{
+ WARN_ON(preemptible());
WARN_ON(!apic->lapic_timer.hv_timer_in_use);
- preempt_disable();
kvm_x86_ops->cancel_hv_timer(apic->vcpu);
apic->lapic_timer.hv_timer_in_use = false;
- preempt_enable();
}
static bool start_hv_timer(struct kvm_lapic *apic)
@@ -1507,6 +1506,7 @@ static bool start_hv_timer(struct kvm_lapic *apic)
struct kvm_timer *ktimer = &apic->lapic_timer;
int r;
+ WARN_ON(preemptible());
if (!kvm_x86_ops->set_hv_timer)
return false;
@@ -1538,6 +1538,8 @@ static bool start_hv_timer(struct kvm_lapic *apic)
static void start_sw_timer(struct kvm_lapic *apic)
{
struct kvm_timer *ktimer = &apic->lapic_timer;
+
+ WARN_ON(preemptible());
if (apic->lapic_timer.hv_timer_in_use)
cancel_hv_timer(apic);
if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
@@ -1552,15 +1554,20 @@ static void start_sw_timer(struct kvm_lapic *apic)
static void restart_apic_timer(struct kvm_lapic *apic)
{
+ preempt_disable();
if (!start_hv_timer(apic))
start_sw_timer(apic);
+ preempt_enable();
}
void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ preempt_disable();
+ /* If the preempt notifier has already run, it also called apic_timer_expired */
+ if (!apic->lapic_timer.hv_timer_in_use)
+ goto out;
WARN_ON(swait_active(&vcpu->wq));
cancel_hv_timer(apic);
apic_timer_expired(apic);
@@ -1569,6 +1576,8 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
advance_periodic_target_expiration(apic);
restart_apic_timer(apic);
}
+out:
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
@@ -1582,9 +1591,11 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ preempt_disable();
/* Possibly the TSC deadline timer is not enabled yet */
if (apic->lapic_timer.hv_timer_in_use)
start_sw_timer(apic);
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index d7d248a000dd..4b9a3ae6b725 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -185,7 +185,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
* index of the protection domain, so pte_pkey * 2 is
* is the index of the first bit for the domain.
*/
- pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3;
+ pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
offset = (pfec & ~1) +
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 4d8141e533c3..af256b786a70 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1100,7 +1100,7 @@ static __init int svm_hardware_setup(void)
if (vls) {
if (!npt_enabled ||
- !boot_cpu_has(X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE) ||
+ !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
!IS_ENABLED(CONFIG_X86_64)) {
vls = false;
} else {
@@ -1777,11 +1777,6 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_svm(vcpu)->vmcb->save.rflags = rflags;
}
-static u32 svm_get_pkru(struct kvm_vcpu *vcpu)
-{
- return 0;
-}
-
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
switch (reg) {
@@ -2430,6 +2425,16 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
svm->vmcb->control.exit_code_hi = 0;
svm->vmcb->control.exit_info_1 = error_code;
+
+ /*
+ * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
+ * The fix is to add the ancillary datum (CR2 or DR6) to structs
+ * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
+ * written only when inject_pending_event runs (DR6 would written here
+ * too). This should be conditional on a new capability---if the
+ * capability is disabled, kvm_multiple_exception would write the
+ * ancillary information to CR2 or DR6, for backwards ABI-compatibility.
+ */
if (svm->vcpu.arch.exception.nested_apf)
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
else
@@ -5403,8 +5408,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
- .get_pkru = svm_get_pkru,
-
.tlb_flush = svm_flush_tlb,
.run = svm_vcpu_run,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 29fd8af5c347..c6ef2940119b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -416,13 +416,10 @@ struct nested_vmx {
/* The guest-physical address of the current VMCS L1 keeps for L2 */
gpa_t current_vmptr;
- /* The host-usable pointer to the above */
- struct page *current_vmcs12_page;
- struct vmcs12 *current_vmcs12;
/*
* Cache of the guest's VMCS, existing outside of guest memory.
* Loaded from guest memory during VMPTRLD. Flushed to guest
- * memory during VMXOFF, VMCLEAR, VMPTRLD.
+ * memory during VMCLEAR and VMPTRLD.
*/
struct vmcs12 *cached_vmcs12;
/*
@@ -563,7 +560,6 @@ struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
u8 fail;
- bool nmi_known_unmasked;
u32 exit_intr_info;
u32 idt_vectoring_info;
ulong rflags;
@@ -640,8 +636,6 @@ struct vcpu_vmx {
u64 current_tsc_ratio;
- bool guest_pkru_valid;
- u32 guest_pkru;
u32 host_pkru;
/*
@@ -928,6 +922,10 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
static int alloc_identity_pagetable(struct kvm *kvm);
+static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
+static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+ u16 error_code);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -2383,11 +2381,6 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
}
-static u32 vmx_get_pkru(struct kvm_vcpu *vcpu)
-{
- return to_vmx(vcpu)->guest_pkru;
-}
-
static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@ -2429,6 +2422,30 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
vmx_set_interrupt_shadow(vcpu, 0);
}
+static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
+ unsigned long exit_qual)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned int nr = vcpu->arch.exception.nr;
+ u32 intr_info = nr | INTR_INFO_VALID_MASK;
+
+ if (vcpu->arch.exception.has_error_code) {
+ vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
+ intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+ }
+
+ if (kvm_exception_is_soft(nr))
+ intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+ else
+ intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+ if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
+ vmx_get_nmi_mask(vcpu))
+ intr_info |= INTR_INFO_UNBLOCK_NMI;
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
+}
+
/*
* KVM wants to inject page-faults which it got to the guest. This function
* checks whether in a nested guest, we need to inject them to L1 or L2.
@@ -2438,23 +2455,38 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu)
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
unsigned int nr = vcpu->arch.exception.nr;
- if (!((vmcs12->exception_bitmap & (1u << nr)) ||
- (nr == PF_VECTOR && vcpu->arch.exception.nested_apf)))
- return 0;
+ if (nr == PF_VECTOR) {
+ if (vcpu->arch.exception.nested_apf) {
+ nested_vmx_inject_exception_vmexit(vcpu,
+ vcpu->arch.apf.nested_apf_token);
+ return 1;
+ }
+ /*
+ * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
+ * The fix is to add the ancillary datum (CR2 or DR6) to structs
+ * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
+ * can be written only when inject_pending_event runs. This should be
+ * conditional on a new capability---if the capability is disabled,
+ * kvm_multiple_exception would write the ancillary information to
+ * CR2 or DR6, for backwards ABI-compatibility.
+ */
+ if (nested_vmx_is_page_fault_vmexit(vmcs12,
+ vcpu->arch.exception.error_code)) {
+ nested_vmx_inject_exception_vmexit(vcpu, vcpu->arch.cr2);
+ return 1;
+ }
+ } else {
+ unsigned long exit_qual = 0;
+ if (nr == DB_VECTOR)
+ exit_qual = vcpu->arch.dr6;
- if (vcpu->arch.exception.nested_apf) {
- vmcs_write32(VM_EXIT_INTR_ERROR_CODE, vcpu->arch.exception.error_code);
- nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
- PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
- INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
- vcpu->arch.apf.nested_apf_token);
- return 1;
+ if (vmcs12->exception_bitmap & (1u << nr)) {
+ nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+ return 1;
+ }
}
- nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
- vmcs_read32(VM_EXIT_INTR_INFO),
- vmcs_readl(EXIT_QUALIFICATION));
- return 1;
+ return 0;
}
static void vmx_queue_exception(struct kvm_vcpu *vcpu)
@@ -2668,7 +2700,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
* reason is that if one of these bits is necessary, it will appear
* in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
* fields of vmcs01 and vmcs02, will turn these bits off - and
- * nested_vmx_exit_handled() will not pass related exits to L1.
+ * nested_vmx_exit_reflected() will not pass related exits to L1.
* These rules have exceptions below.
*/
@@ -4956,6 +4988,28 @@ static bool vmx_get_enable_apicv(void)
return enable_apicv;
}
+static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ gfn_t gfn;
+
+ /*
+ * Don't need to mark the APIC access page dirty; it is never
+ * written to by the CPU during APIC virtualization.
+ */
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+ gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+}
+
+
static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4963,18 +5017,15 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
void *vapic_page;
u16 status;
- if (vmx->nested.pi_desc &&
- vmx->nested.pi_pending) {
- vmx->nested.pi_pending = false;
- if (!pi_test_and_clear_on(vmx->nested.pi_desc))
- return;
-
- max_irr = find_last_bit(
- (unsigned long *)vmx->nested.pi_desc->pir, 256);
+ if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
+ return;
- if (max_irr == 256)
- return;
+ vmx->nested.pi_pending = false;
+ if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+ return;
+ max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
+ if (max_irr != 256) {
vapic_page = kmap(vmx->nested.virtual_apic_page);
__kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
kunmap(vmx->nested.virtual_apic_page);
@@ -4986,11 +5037,16 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
vmcs_write16(GUEST_INTR_STATUS, status);
}
}
+
+ nested_mark_vmcs12_pages_dirty(vcpu);
}
-static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
+static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+ bool nested)
{
#ifdef CONFIG_SMP
+ int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
+
if (vcpu->mode == IN_GUEST_MODE) {
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5008,8 +5064,7 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
*/
WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
- apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
- POSTED_INTR_VECTOR);
+ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
return true;
}
#endif
@@ -5024,7 +5079,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
if (is_guest_mode(vcpu) &&
vector == vmx->nested.posted_intr_nv) {
/* the PIR and ON have been set by L1. */
- kvm_vcpu_trigger_posted_interrupt(vcpu);
+ kvm_vcpu_trigger_posted_interrupt(vcpu, true);
/*
* If a posted intr is not recognized by hardware,
* we will accomplish it in the next vmentry.
@@ -5058,7 +5113,7 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
if (pi_test_and_set_on(&vmx->pi_desc))
return;
- if (!kvm_vcpu_trigger_posted_interrupt(vcpu))
+ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
kvm_vcpu_kick(vcpu);
}
@@ -7133,34 +7188,32 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
return 1;
}
+static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
+{
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+ vmcs_write64(VMCS_LINK_POINTER, -1ull);
+}
+
static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
{
if (vmx->nested.current_vmptr == -1ull)
return;
- /* current_vmptr and current_vmcs12 are always set/reset together */
- if (WARN_ON(vmx->nested.current_vmcs12 == NULL))
- return;
-
if (enable_shadow_vmcs) {
/* copy to memory all shadowed fields in case
they were modified */
copy_shadow_to_vmcs12(vmx);
vmx->nested.sync_shadow_vmcs = false;
- vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_SHADOW_VMCS);
- vmcs_write64(VMCS_LINK_POINTER, -1ull);
+ vmx_disable_shadow_vmcs(vmx);
}
vmx->nested.posted_intr_nv = -1;
/* Flush VMCS12 to guest memory */
- memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12,
- VMCS12_SIZE);
+ kvm_vcpu_write_guest_page(&vmx->vcpu,
+ vmx->nested.current_vmptr >> PAGE_SHIFT,
+ vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
- kunmap(vmx->nested.current_vmcs12_page);
- nested_release_page(vmx->nested.current_vmcs12_page);
vmx->nested.current_vmptr = -1ull;
- vmx->nested.current_vmcs12 = NULL;
}
/*
@@ -7174,12 +7227,14 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->nested.vmxon = false;
free_vpid(vmx->nested.vpid02);
- nested_release_vmcs12(vmx);
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.current_vmptr = -1ull;
if (vmx->nested.msr_bitmap) {
free_page((unsigned long)vmx->nested.msr_bitmap);
vmx->nested.msr_bitmap = NULL;
}
if (enable_shadow_vmcs) {
+ vmx_disable_shadow_vmcs(vmx);
vmcs_clear(vmx->vmcs01.shadow_vmcs);
free_vmcs(vmx->vmcs01.shadow_vmcs);
vmx->vmcs01.shadow_vmcs = NULL;
@@ -7578,14 +7633,14 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
}
nested_release_vmcs12(vmx);
- vmx->nested.current_vmcs12 = new_vmcs12;
- vmx->nested.current_vmcs12_page = page;
/*
* Load VMCS12 from guest memory since it is not already
* cached.
*/
- memcpy(vmx->nested.cached_vmcs12,
- vmx->nested.current_vmcs12, VMCS12_SIZE);
+ memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
+ kunmap(page);
+ nested_release_page_clean(page);
+
set_current_vmptr(vmx, vmptr);
}
@@ -8018,12 +8073,11 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
* should handle it ourselves in L0 (and then continue L2). Only call this
* when in is_guest_mode (L2).
*/
-static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
+static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
{
u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- u32 exit_reason = vmx->exit_reason;
trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
vmcs_readl(EXIT_QUALIFICATION),
@@ -8032,6 +8086,18 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
KVM_ISA_VMX);
+ /*
+ * The host physical addresses of some pages of guest memory
+ * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
+ * may write to these pages via their host physical address while
+ * L2 is running, bypassing any address-translation-based dirty
+ * tracking (e.g. EPT write protection).
+ *
+ * Mark them dirty on every exit from L2 to prevent them from
+ * getting out of sync with dirty tracking.
+ */
+ nested_mark_vmcs12_pages_dirty(vcpu);
+
if (vmx->nested.nested_run_pending)
return false;
@@ -8168,6 +8234,29 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
}
}
+static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
+{
+ u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ /*
+ * At this point, the exit interruption info in exit_intr_info
+ * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT
+ * we need to query the in-kernel LAPIC.
+ */
+ WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
+ if ((exit_intr_info &
+ (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
+ (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ vmcs12->vm_exit_intr_error_code =
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ }
+
+ nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
+ vmcs_readl(EXIT_QUALIFICATION));
+ return 1;
+}
+
static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
{
*info1 = vmcs_readl(EXIT_QUALIFICATION);
@@ -8414,12 +8503,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
if (vmx->emulation_required)
return handle_invalid_guest_state(vcpu);
- if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
- nested_vmx_vmexit(vcpu, exit_reason,
- vmcs_read32(VM_EXIT_INTR_INFO),
- vmcs_readl(EXIT_QUALIFICATION));
- return 1;
- }
+ if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
+ return nested_vmx_reflect_vmexit(vcpu, exit_reason);
if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
dump_vmcs();
@@ -8928,8 +9013,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
- if (vmx->guest_pkru_valid)
- __write_pkru(vmx->guest_pkru);
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
+ vcpu->arch.pkru != vmx->host_pkru)
+ __write_pkru(vcpu->arch.pkru);
atomic_switch_perf_msrs(vmx);
debugctlmsr = get_debugctlmsr();
@@ -9077,13 +9164,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
* back on host, so it is safe to read guest PKRU from current
* XSAVE.
*/
- if (boot_cpu_has(X86_FEATURE_OSPKE)) {
- vmx->guest_pkru = __read_pkru();
- if (vmx->guest_pkru != vmx->host_pkru) {
- vmx->guest_pkru_valid = true;
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
+ vcpu->arch.pkru = __read_pkru();
+ if (vcpu->arch.pkru != vmx->host_pkru)
__write_pkru(vmx->host_pkru);
- } else
- vmx->guest_pkru_valid = false;
}
/*
@@ -9222,7 +9307,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
- vmx->nested.current_vmcs12 = NULL;
vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
@@ -9508,12 +9592,15 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
WARN_ON(!is_guest_mode(vcpu));
- if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
- nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
- vmcs_read32(VM_EXIT_INTR_INFO),
- vmcs_readl(EXIT_QUALIFICATION));
- else
+ if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) {
+ vmcs12->vm_exit_intr_error_code = fault->error_code;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+ PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
+ INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
+ fault->address);
+ } else {
kvm_inject_page_fault(vcpu, fault);
+ }
}
static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
@@ -10041,6 +10128,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->vm_entry_instruction_len);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
vmcs12->guest_interruptibility_info);
+ vmx->loaded_vmcs->nmi_known_unmasked =
+ !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
} else {
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
}
@@ -10065,13 +10154,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
/* Posted interrupts setting is only taken from vmcs12. */
if (nested_cpu_has_posted_intr(vmcs12)) {
- /*
- * Note that we use L0's vector here and in
- * vmx_deliver_nested_posted_interrupt.
- */
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
vmx->nested.pi_pending = false;
- vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
} else {
exec_control &= ~PIN_BASED_POSTED_INTR;
}
@@ -10095,12 +10180,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
* vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
* !enable_ept, EB.PF is 1, so the "or" will always be 1.
- *
- * A problem with this approach (when !enable_ept) is that L1 may be
- * injected with more page faults than it asked for. This could have
- * caused problems, but in practice existing hypervisors don't care.
- * To fix this, we will need to emulate the PFEC checking (on the L1
- * page tables), using walk_addr(), when injecting PFs to L1.
*/
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
enable_ept ? vmcs12->page_fault_error_code_mask : 0);
@@ -10848,13 +10927,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->vm_exit_reason = exit_reason;
vmcs12->exit_qualification = exit_qualification;
-
vmcs12->vm_exit_intr_info = exit_intr_info;
- if ((vmcs12->vm_exit_intr_info &
- (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
- (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
- vmcs12->vm_exit_intr_error_code =
- vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
vmcs12->idt_vectoring_info_field = 0;
vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
@@ -10942,7 +11016,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
*/
vmx_flush_tlb(vcpu);
}
-
+ /* Restore posted intr vector. */
+ if (nested_cpu_has_posted_intr(vmcs12))
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
@@ -11048,8 +11124,15 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
- && nested_exit_intr_ack_set(vcpu)) {
+ /*
+ * TODO: SDM says that with acknowledge interrupt on exit, bit 31 of
+ * the VM-exit interrupt information (valid interrupt) is always set to
+ * 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need
+ * kvm_cpu_has_interrupt(). See the commit message for details.
+ */
+ if (nested_exit_intr_ack_set(vcpu) &&
+ exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+ kvm_cpu_has_interrupt(vcpu)) {
int irq = kvm_cpu_get_interrupt(vcpu);
WARN_ON(irq < 0);
vmcs12->vm_exit_intr_info = irq |
@@ -11592,8 +11675,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
- .get_pkru = vmx_get_pkru,
-
.tlb_flush = vmx_flush_tlb,
.run = vmx_vcpu_run,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 82a63c59f77b..272320eb328c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -597,8 +597,8 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu)
(unsigned long *)&vcpu->arch.regs_avail))
return true;
- gfn = (kvm_read_cr3(vcpu) & ~31ul) >> PAGE_SHIFT;
- offset = (kvm_read_cr3(vcpu) & ~31ul) & (PAGE_SIZE - 1);
+ gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
+ offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
PFERR_USER_MASK | PFERR_WRITE_MASK);
if (r < 0)
@@ -3159,15 +3159,18 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
kvm_set_hflags(vcpu, hflags);
vcpu->arch.smi_pending = events->smi.pending;
- if (events->smi.smm_inside_nmi)
- vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
- else
- vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
- if (lapic_in_kernel(vcpu)) {
- if (events->smi.latched_init)
- set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+
+ if (events->smi.smm) {
+ if (events->smi.smm_inside_nmi)
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
- clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
+ if (lapic_in_kernel(vcpu)) {
+ if (events->smi.latched_init)
+ set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ else
+ clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ }
}
}
@@ -3242,7 +3245,12 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
u32 size, offset, ecx, edx;
cpuid_count(XSTATE_CPUID, index,
&size, &offset, &ecx, &edx);
- memcpy(dest + offset, src, size);
+ if (feature == XFEATURE_MASK_PKRU)
+ memcpy(dest + offset, &vcpu->arch.pkru,
+ sizeof(vcpu->arch.pkru));
+ else
+ memcpy(dest + offset, src, size);
+
}
valid -= feature;
@@ -3280,7 +3288,11 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
u32 size, offset, ecx, edx;
cpuid_count(XSTATE_CPUID, index,
&size, &offset, &ecx, &edx);
- memcpy(dest, src + offset, size);
+ if (feature == XFEATURE_MASK_PKRU)
+ memcpy(&vcpu->arch.pkru, src + offset,
+ sizeof(vcpu->arch.pkru));
+ else
+ memcpy(dest, src + offset, size);
}
valid -= feature;
@@ -6215,6 +6227,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
lapic_irq.shorthand = 0;
lapic_irq.dest_mode = 0;
+ lapic_irq.level = 0;
lapic_irq.dest_id = apicid;
lapic_irq.msi_redir_hint = false;
@@ -6721,17 +6734,6 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
-void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
- unsigned long address)
-{
- /*
- * The physical address of apic access page is stored in the VMCS.
- * Update it when it becomes invalid.
- */
- if (address == gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT))
- kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
-}
-
/*
* Returns 1 to let vcpu_run() continue the guest execution loop without
* exiting to the userspace. Otherwise, the value will be returned to the
@@ -7629,7 +7631,9 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
*/
vcpu->guest_fpu_loaded = 1;
__kernel_fpu_begin();
- __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
+ /* PKRU is separately restored in kvm_x86_ops->run. */
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
+ ~XFEATURE_MASK_PKRU);
trace_kvm_fpu(1);
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 673541eb3b3f..bf3f1065d6ad 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -18,6 +18,7 @@
#include <asm/dma.h> /* for MAX_DMA_PFN */
#include <asm/microcode.h>
#include <asm/kaslr.h>
+#include <asm/hypervisor.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -636,6 +637,8 @@ void __init init_mem_mapping(void)
load_cr3(swapper_pg_dir);
__flush_tlb_all();
+ hypervisor_init_mem_mapping();
+
early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 229d04a83f85..a88cfbfbd078 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -50,8 +50,7 @@ unsigned long tasksize_64bit(void)
static unsigned long stack_maxrandom_size(unsigned long task_size)
{
unsigned long max = 0;
- if ((current->flags & PF_RANDOMIZE) &&
- !(current->personality & ADDR_NO_RANDOMIZE)) {
+ if (current->flags & PF_RANDOMIZE) {
max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit());
max <<= PAGE_SHIFT;
}
@@ -79,13 +78,13 @@ static int mmap_is_legacy(void)
static unsigned long arch_rnd(unsigned int rndbits)
{
+ if (!(current->flags & PF_RANDOMIZE))
+ return 0;
return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT;
}
unsigned long arch_mmap_rnd(void)
{
- if (!(current->flags & PF_RANDOMIZE))
- return 0;
return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits);
}
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 3e4bdb442fbc..f44c0bc95aa2 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -26,7 +26,7 @@
static struct bau_operations ops __ro_after_init;
/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
-static int timeout_base_ns[] = {
+static const int timeout_base_ns[] = {
20,
160,
1280,
@@ -1216,7 +1216,7 @@ static struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
* set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register.
* Such a message must be ignored.
*/
-void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
+static void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
{
unsigned long mmr_image;
unsigned char swack_vec;
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
index ae4cd58c0c7a..02250b2633b8 100644
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -50,7 +50,7 @@ void foo(void)
DEFINE(HOST_GS, GS);
DEFINE(HOST_ORIG_AX, ORIG_EAX);
#else
-#if defined(PTRACE_GETREGSET) && defined(PTRACE_SETREGSET)
+#ifdef FP_XSTATE_MAGIC1
DEFINE(HOST_FP_SIZE, sizeof(struct _xstate) / sizeof(unsigned long));
#else
DEFINE(HOST_FP_SIZE, sizeof(struct _fpstate) / sizeof(unsigned long));
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 87d791356ea9..de503c225ae1 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -12,6 +12,7 @@
#include <asm/setup.h>
#include <asm/hypervisor.h>
#include <asm/e820/api.h>
+#include <asm/early_ioremap.h>
#include <asm/xen/cpuid.h>
#include <asm/xen/hypervisor.h>
@@ -21,38 +22,50 @@
#include "mmu.h"
#include "smp.h"
-void __ref xen_hvm_init_shared_info(void)
+static unsigned long shared_info_pfn;
+
+void xen_hvm_init_shared_info(void)
{
struct xen_add_to_physmap xatp;
- u64 pa;
-
- if (HYPERVISOR_shared_info == &xen_dummy_shared_info) {
- /*
- * Search for a free page starting at 4kB physical address.
- * Low memory is preferred to avoid an EPT large page split up
- * by the mapping.
- * Starting below X86_RESERVE_LOW (usually 64kB) is fine as
- * the BIOS used for HVM guests is well behaved and won't
- * clobber memory other than the first 4kB.
- */
- for (pa = PAGE_SIZE;
- !e820__mapped_all(pa, pa + PAGE_SIZE, E820_TYPE_RAM) ||
- memblock_is_reserved(pa);
- pa += PAGE_SIZE)
- ;
-
- memblock_reserve(pa, PAGE_SIZE);
- HYPERVISOR_shared_info = __va(pa);
- }
xatp.domid = DOMID_SELF;
xatp.idx = 0;
xatp.space = XENMAPSPACE_shared_info;
- xatp.gpfn = virt_to_pfn(HYPERVISOR_shared_info);
+ xatp.gpfn = shared_info_pfn;
if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
BUG();
}
+static void __init reserve_shared_info(void)
+{
+ u64 pa;
+
+ /*
+ * Search for a free page starting at 4kB physical address.
+ * Low memory is preferred to avoid an EPT large page split up
+ * by the mapping.
+ * Starting below X86_RESERVE_LOW (usually 64kB) is fine as
+ * the BIOS used for HVM guests is well behaved and won't
+ * clobber memory other than the first 4kB.
+ */
+ for (pa = PAGE_SIZE;
+ !e820__mapped_all(pa, pa + PAGE_SIZE, E820_TYPE_RAM) ||
+ memblock_is_reserved(pa);
+ pa += PAGE_SIZE)
+ ;
+
+ shared_info_pfn = PHYS_PFN(pa);
+
+ memblock_reserve(pa, PAGE_SIZE);
+ HYPERVISOR_shared_info = early_memremap(pa, PAGE_SIZE);
+}
+
+static void __init xen_hvm_init_mem_mapping(void)
+{
+ early_memunmap(HYPERVISOR_shared_info, PAGE_SIZE);
+ HYPERVISOR_shared_info = __va(PFN_PHYS(shared_info_pfn));
+}
+
static void __init init_hvm_pv_info(void)
{
int major, minor;
@@ -153,6 +166,7 @@ static void __init xen_hvm_guest_init(void)
init_hvm_pv_info();
+ reserve_shared_info();
xen_hvm_init_shared_info();
/*
@@ -218,5 +232,6 @@ const struct hypervisor_x86 x86_hyper_xen_hvm = {
.init_platform = xen_hvm_guest_init,
.pin_vcpu = xen_pin_vcpu,
.x2apic_available = xen_x2apic_para_available,
+ .init_mem_mapping = xen_hvm_init_mem_mapping,
};
EXPORT_SYMBOL(x86_hyper_xen_hvm);
OpenPOWER on IntegriCloud