summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig40
-rw-r--r--arch/x86/boot/compressed/eboot.c6
-rw-r--r--arch/x86/boot/compressed/head_64.S79
-rw-r--r--arch/x86/boot/compressed/pgtable_64.c14
-rw-r--r--arch/x86/entry/entry_64_compat.S8
-rw-r--r--arch/x86/entry/vdso/vdso32/vdso-fakesections.c1
-rw-r--r--arch/x86/events/core.c8
-rw-r--r--arch/x86/events/intel/core.c9
-rw-r--r--arch/x86/events/intel/cstate.c2
-rw-r--r--arch/x86/events/intel/uncore_snbep.c37
-rw-r--r--arch/x86/events/msr.c9
-rw-r--r--arch/x86/include/asm/asm.h2
-rw-r--r--arch/x86/include/asm/cpufeature.h15
-rw-r--r--arch/x86/include/asm/cpufeatures.h21
-rw-r--r--arch/x86/include/asm/dma-mapping.h5
-rw-r--r--arch/x86/include/asm/ftrace.h19
-rw-r--r--arch/x86/include/asm/insn.h18
-rw-r--r--arch/x86/include/asm/irq_vectors.h7
-rw-r--r--arch/x86/include/asm/jailhouse_para.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h3
-rw-r--r--arch/x86/include/asm/mmu_context.h2
-rw-r--r--arch/x86/include/asm/msr-index.h9
-rw-r--r--arch/x86/include/asm/nospec-branch.h43
-rw-r--r--arch/x86/include/asm/pci.h3
-rw-r--r--arch/x86/include/asm/pgtable.h5
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h8
-rw-r--r--arch/x86/include/asm/pkeys.h18
-rw-r--r--arch/x86/include/asm/processor.h2
-rw-r--r--arch/x86/include/asm/spec-ctrl.h80
-rw-r--r--arch/x86/include/asm/thread_info.h4
-rw-r--r--arch/x86/include/uapi/asm/kvm_para.h2
-rw-r--r--arch/x86/include/uapi/asm/msgbuf.h31
-rw-r--r--arch/x86/include/uapi/asm/shmbuf.h42
-rw-r--r--arch/x86/kernel/acpi/boot.c4
-rw-r--r--arch/x86/kernel/amd_nb.c6
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c1
-rw-r--r--arch/x86/kernel/apm_32.c15
-rw-r--r--arch/x86/kernel/cpu/amd.c22
-rw-r--r--arch/x86/kernel/cpu/bugs.c397
-rw-r--r--arch/x86/kernel/cpu/common.c73
-rw-r--r--arch/x86/kernel/cpu/cpu.h2
-rw-r--r--arch/x86/kernel/cpu/intel.c6
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c29
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c6
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c2
-rw-r--r--arch/x86/kernel/head64.c10
-rw-r--r--arch/x86/kernel/jailhouse.c2
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c5
-rw-r--r--arch/x86/kernel/kprobes/core.c4
-rw-r--r--arch/x86/kernel/kvm.c8
-rw-r--r--arch/x86/kernel/ldt.c2
-rw-r--r--arch/x86/kernel/machine_kexec_32.c6
-rw-r--r--arch/x86/kernel/machine_kexec_64.c5
-rw-r--r--arch/x86/kernel/pci-dma.c58
-rw-r--r--arch/x86/kernel/pci-nommu.c90
-rw-r--r--arch/x86/kernel/process.c146
-rw-r--r--arch/x86/kernel/process_64.c1
-rw-r--r--arch/x86/kernel/setup.c6
-rw-r--r--arch/x86/kernel/smpboot.c52
-rw-r--r--arch/x86/kernel/tsc.c24
-rw-r--r--arch/x86/kernel/uprobes.c4
-rw-r--r--arch/x86/kvm/cpuid.c26
-rw-r--r--arch/x86/kvm/hyperv.c23
-rw-r--r--arch/x86/kvm/lapic.c53
-rw-r--r--arch/x86/kvm/svm.c97
-rw-r--r--arch/x86/kvm/vmx.c168
-rw-r--r--arch/x86/kvm/x86.c47
-rw-r--r--arch/x86/kvm/x86.h7
-rw-r--r--arch/x86/mm/dump_pagetables.c11
-rw-r--r--arch/x86/mm/pageattr.c44
-rw-r--r--arch/x86/mm/pkeys.c21
-rw-r--r--arch/x86/mm/pti.c26
-rw-r--r--arch/x86/net/bpf_jit_comp.c18
-rw-r--r--arch/x86/power/hibernate_64.c2
-rw-r--r--arch/x86/xen/enlighten_hvm.c13
-rw-r--r--arch/x86/xen/enlighten_pv.c86
-rw-r--r--arch/x86/xen/mmu.c4
-rw-r--r--arch/x86/xen/mmu_pv.c4
78 files changed, 1574 insertions, 616 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 00fcf81f2c56..f2ee6a8ffe65 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,6 +28,8 @@ config X86_64
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_SOFT_DIRTY
select MODULES_USE_ELF_RELA
+ select NEED_DMA_MAP_STATE
+ select SWIOTLB
select X86_DEV_DMA_OPS
select ARCH_HAS_SYSCALL_WRAPPER
@@ -52,6 +54,7 @@ config X86
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
+ select ARCH_HAS_FILTER_PGPROT
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_KCOV if X86_64
@@ -133,7 +136,6 @@ config X86
select HAVE_C_RECORDMCOUNT
select HAVE_DEBUG_KMEMLEAK
select HAVE_DEBUG_STACKOVERFLOW
- select HAVE_DMA_API_DEBUG
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
@@ -183,6 +185,7 @@ config X86
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_USER_RETURN_NOTIFIER
select IRQ_FORCED_THREADING
+ select NEED_SG_DMA_LENGTH
select PCI_LOCKLESS_CONFIG
select PERF_EVENTS
select RTC_LIB
@@ -235,13 +238,6 @@ config ARCH_MMAP_RND_COMPAT_BITS_MAX
config SBUS
bool
-config NEED_DMA_MAP_STATE
- def_bool y
- depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB
-
-config NEED_SG_DMA_LENGTH
- def_bool y
-
config GENERIC_ISA_DMA
def_bool y
depends on ISA_DMA_API
@@ -273,6 +269,9 @@ config ARCH_HAS_CPU_RELAX
config ARCH_HAS_CACHE_LINE_SIZE
def_bool y
+config ARCH_HAS_FILTER_PGPROT
+ def_bool y
+
config HAVE_SETUP_PER_CPU_AREA
def_bool y
@@ -871,6 +870,7 @@ config DMI
config GART_IOMMU
bool "Old AMD GART IOMMU support"
+ select IOMMU_HELPER
select SWIOTLB
depends on X86_64 && PCI && AMD_NB
---help---
@@ -892,6 +892,7 @@ config GART_IOMMU
config CALGARY_IOMMU
bool "IBM Calgary IOMMU support"
+ select IOMMU_HELPER
select SWIOTLB
depends on X86_64 && PCI
---help---
@@ -919,20 +920,6 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
Calgary anyway, pass 'iommu=calgary' on the kernel command line.
If unsure, say Y.
-# need this always selected by IOMMU for the VIA workaround
-config SWIOTLB
- def_bool y if X86_64
- ---help---
- Support for software bounce buffers used on x86-64 systems
- which don't have a hardware IOMMU. Using this PCI devices
- which can only access 32-bits of memory can be used on systems
- with more than 3 GB of memory.
- If unsure, say Y.
-
-config IOMMU_HELPER
- def_bool y
- depends on CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU
-
config MAXSMP
bool "Enable Maximum number of SMP Processors and NUMA Nodes"
depends on X86_64 && SMP && DEBUG_KERNEL
@@ -1454,6 +1441,7 @@ config HIGHMEM
config X86_PAE
bool "PAE (Physical Address Extension) Support"
depends on X86_32 && !HIGHMEM4G
+ select PHYS_ADDR_T_64BIT
select SWIOTLB
---help---
PAE is required for NX support, and furthermore enables
@@ -1481,14 +1469,6 @@ config X86_5LEVEL
Say N if unsure.
-config ARCH_PHYS_ADDR_T_64BIT
- def_bool y
- depends on X86_64 || X86_PAE
-
-config ARCH_DMA_ADDR_T_64BIT
- def_bool y
- depends on X86_64 || HIGHMEM64G
-
config X86_DIRECT_GBPAGES
def_bool y
depends on X86_64 && !DEBUG_PAGEALLOC
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 47d3efff6805..09f36c0d9d4f 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -163,7 +163,8 @@ __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom)
if (status != EFI_SUCCESS)
goto free_struct;
- memcpy(rom->romdata, pci->romimage, pci->romsize);
+ memcpy(rom->romdata, (void *)(unsigned long)pci->romimage,
+ pci->romsize);
return status;
free_struct:
@@ -269,7 +270,8 @@ __setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom)
if (status != EFI_SUCCESS)
goto free_struct;
- memcpy(rom->romdata, pci->romimage, pci->romsize);
+ memcpy(rom->romdata, (void *)(unsigned long)pci->romimage,
+ pci->romsize);
return status;
free_struct:
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index fca012baba19..8169e8b7a4dc 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -306,6 +306,25 @@ ENTRY(startup_64)
leaq boot_stack_end(%rbx), %rsp
/*
+ * paging_prepare() and cleanup_trampoline() below can have GOT
+ * references. Adjust the table with address we are running at.
+ *
+ * Zero RAX for adjust_got: the GOT was not adjusted before;
+ * there's no adjustment to undo.
+ */
+ xorq %rax, %rax
+
+ /*
+ * Calculate the address the binary is loaded at and use it as
+ * a GOT adjustment.
+ */
+ call 1f
+1: popq %rdi
+ subq $1b, %rdi
+
+ call adjust_got
+
+ /*
* At this point we are in long mode with 4-level paging enabled,
* but we might want to enable 5-level paging or vice versa.
*
@@ -370,10 +389,14 @@ trampoline_return:
/*
* cleanup_trampoline() would restore trampoline memory.
*
+ * RDI is address of the page table to use instead of page table
+ * in trampoline memory (if required).
+ *
* RSI holds real mode data and needs to be preserved across
* this function call.
*/
pushq %rsi
+ leaq top_pgtable(%rbx), %rdi
call cleanup_trampoline
popq %rsi
@@ -381,6 +404,21 @@ trampoline_return:
pushq $0
popfq
+ /*
+ * Previously we've adjusted the GOT with address the binary was
+ * loaded at. Now we need to re-adjust for relocation address.
+ *
+ * Calculate the address the binary is loaded at, so that we can
+ * undo the previous GOT adjustment.
+ */
+ call 1f
+1: popq %rax
+ subq $1b, %rax
+
+ /* The new adjustment is the relocation address */
+ movq %rbx, %rdi
+ call adjust_got
+
/*
* Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe.
@@ -482,19 +520,6 @@ relocated:
rep stosq
/*
- * Adjust our own GOT
- */
- leaq _got(%rip), %rdx
- leaq _egot(%rip), %rcx
-1:
- cmpq %rcx, %rdx
- jae 2f
- addq %rbx, (%rdx)
- addq $8, %rdx
- jmp 1b
-2:
-
-/*
* Do the extraction, and jump to the new kernel..
*/
pushq %rsi /* Save the real mode argument */
@@ -512,6 +537,27 @@ relocated:
*/
jmp *%rax
+/*
+ * Adjust the global offset table
+ *
+ * RAX is the previous adjustment of the table to undo (use 0 if it's the
+ * first time we touch GOT).
+ * RDI is the new adjustment to apply.
+ */
+adjust_got:
+ /* Walk through the GOT adding the address to the entries */
+ leaq _got(%rip), %rdx
+ leaq _egot(%rip), %rcx
+1:
+ cmpq %rcx, %rdx
+ jae 2f
+ subq %rax, (%rdx) /* Undo previous adjustment */
+ addq %rdi, (%rdx) /* Apply the new adjustment */
+ addq $8, %rdx
+ jmp 1b
+2:
+ ret
+
.code32
/*
* This is the 32-bit trampoline that will be copied over to low memory.
@@ -649,3 +695,10 @@ boot_stack_end:
.balign 4096
pgtable:
.fill BOOT_PGT_SIZE, 1, 0
+
+/*
+ * The page table is going to be used instead of page table in the trampoline
+ * memory.
+ */
+top_pgtable:
+ .fill PAGE_SIZE, 1, 0
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index 32af1cbcd903..a362fa0b849c 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -23,14 +23,6 @@ struct paging_config {
static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
/*
- * The page table is going to be used instead of page table in the trampoline
- * memory.
- *
- * It must not be in BSS as BSS is cleared after cleanup_trampoline().
- */
-static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data);
-
-/*
* Trampoline address will be printed by extract_kernel() for debugging
* purposes.
*
@@ -134,7 +126,7 @@ out:
return paging_config;
}
-void cleanup_trampoline(void)
+void cleanup_trampoline(void *pgtable)
{
void *trampoline_pgtable;
@@ -145,8 +137,8 @@ void cleanup_trampoline(void)
* if it's there.
*/
if ((void *)__native_read_cr3() == trampoline_pgtable) {
- memcpy(top_pgtable, trampoline_pgtable, PAGE_SIZE);
- native_write_cr3((unsigned long)top_pgtable);
+ memcpy(pgtable, trampoline_pgtable, PAGE_SIZE);
+ native_write_cr3((unsigned long)pgtable);
}
/* Restore trampoline memory */
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 9af927e59d49..9de7f1e1dede 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -84,13 +84,13 @@ ENTRY(entry_SYSENTER_compat)
pushq %rdx /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax */
- pushq $0 /* pt_regs->r8 = 0 */
+ pushq %r8 /* pt_regs->r8 */
xorl %r8d, %r8d /* nospec r8 */
- pushq $0 /* pt_regs->r9 = 0 */
+ pushq %r9 /* pt_regs->r9 */
xorl %r9d, %r9d /* nospec r9 */
- pushq $0 /* pt_regs->r10 = 0 */
+ pushq %r10 /* pt_regs->r10 */
xorl %r10d, %r10d /* nospec r10 */
- pushq $0 /* pt_regs->r11 = 0 */
+ pushq %r11 /* pt_regs->r11 */
xorl %r11d, %r11d /* nospec r11 */
pushq %rbx /* pt_regs->rbx */
xorl %ebx, %ebx /* nospec rbx */
diff --git a/arch/x86/entry/vdso/vdso32/vdso-fakesections.c b/arch/x86/entry/vdso/vdso32/vdso-fakesections.c
deleted file mode 100644
index 541468e25265..000000000000
--- a/arch/x86/entry/vdso/vdso32/vdso-fakesections.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../vdso-fakesections.c"
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index a6006e7bb729..45b2b1c93d04 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -27,6 +27,7 @@
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/device.h>
+#include <linux/nospec.h>
#include <asm/apic.h>
#include <asm/stacktrace.h>
@@ -304,17 +305,20 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
config = attr->config;
- cache_type = (config >> 0) & 0xff;
+ cache_type = (config >> 0) & 0xff;
if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
return -EINVAL;
+ cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX);
cache_op = (config >> 8) & 0xff;
if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
return -EINVAL;
+ cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX);
cache_result = (config >> 16) & 0xff;
if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
return -EINVAL;
+ cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);
val = hw_cache_event_ids[cache_type][cache_op][cache_result];
@@ -421,6 +425,8 @@ int x86_setup_perfctr(struct perf_event *event)
if (attr->config >= x86_pmu.max_events)
return -EINVAL;
+ attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events);
+
/*
* The generic map:
*/
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 607bf565a90c..707b2a96e516 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3339,7 +3339,8 @@ static void intel_pmu_cpu_starting(int cpu)
cpuc->lbr_sel = NULL;
- flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
+ if (x86_pmu.version > 1)
+ flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
if (!cpuc->shared_regs)
return;
@@ -3502,6 +3503,8 @@ static __initconst const struct x86_pmu core_pmu = {
.cpu_dying = intel_pmu_cpu_dying,
};
+static struct attribute *intel_pmu_attrs[];
+
static __initconst const struct x86_pmu intel_pmu = {
.name = "Intel",
.handle_irq = intel_pmu_handle_irq,
@@ -3533,6 +3536,8 @@ static __initconst const struct x86_pmu intel_pmu = {
.format_attrs = intel_arch3_formats_attr,
.events_sysfs_show = intel_event_sysfs_show,
+ .attrs = intel_pmu_attrs,
+
.cpu_prepare = intel_pmu_cpu_prepare,
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
@@ -3911,8 +3916,6 @@ __init int intel_pmu_init(void)
x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
-
- x86_pmu.attrs = intel_pmu_attrs;
/*
* Quirk: v2 perfmon does not report fixed-purpose events, so
* assume at least 3 events, when not running in a hypervisor:
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 9aca448bb8e6..9f8084f18d58 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -92,6 +92,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
+#include <linux/nospec.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include "../perf_event.h"
@@ -302,6 +303,7 @@ static int cstate_pmu_event_init(struct perf_event *event)
} else if (event->pmu == &cstate_pkg_pmu) {
if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
return -EINVAL;
+ cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX);
if (!pkg_msr[cfg].attr)
return -EINVAL;
event->hw.event_base = pkg_msr[cfg].msr;
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index c98b943e58b4..77076a102e34 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -3028,10 +3028,27 @@ static struct intel_uncore_type bdx_uncore_cbox = {
.format_group = &hswep_uncore_cbox_format_group,
};
+static struct intel_uncore_type bdx_uncore_sbox = {
+ .name = "sbox",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_S0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_S0_MSR_PMON_CTR0,
+ .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_SBOX_MSR_OFFSET,
+ .ops = &hswep_uncore_sbox_msr_ops,
+ .format_group = &hswep_uncore_sbox_format_group,
+};
+
+#define BDX_MSR_UNCORE_SBOX 3
+
static struct intel_uncore_type *bdx_msr_uncores[] = {
&bdx_uncore_ubox,
&bdx_uncore_cbox,
&hswep_uncore_pcu,
+ &bdx_uncore_sbox,
NULL,
};
@@ -3043,10 +3060,25 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = {
void bdx_uncore_cpu_init(void)
{
+ int pkg = topology_phys_to_logical_pkg(0);
+
if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
uncore_msr_uncores = bdx_msr_uncores;
+ /* BDX-DE doesn't have SBOX */
+ if (boot_cpu_data.x86_model == 86) {
+ uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
+ /* Detect systems with no SBOXes */
+ } else if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) {
+ struct pci_dev *pdev;
+ u32 capid4;
+
+ pdev = uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3];
+ pci_read_config_dword(pdev, 0x94, &capid4);
+ if (((capid4 >> 6) & 0x3) == 0)
+ bdx_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
+ }
hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints;
}
@@ -3264,6 +3296,11 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2),
},
+ { /* PCU.3 (for Capability registers) */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fc0),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ HSWEP_PCI_PCU_3),
+ },
{ /* end: all zeroes */ }
};
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index e7edf19e64c2..b4771a6ddbc1 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/perf_event.h>
+#include <linux/nospec.h>
#include <asm/intel-family.h>
enum perf_msr_id {
@@ -158,9 +159,6 @@ static int msr_event_init(struct perf_event *event)
if (event->attr.type != event->pmu->type)
return -ENOENT;
- if (cfg >= PERF_MSR_EVENT_MAX)
- return -EINVAL;
-
/* unsupported modes and filters */
if (event->attr.exclude_user ||
event->attr.exclude_kernel ||
@@ -171,6 +169,11 @@ static int msr_event_init(struct perf_event *event)
event->attr.sample_period) /* no sampling */
return -EINVAL;
+ if (cfg >= PERF_MSR_EVENT_MAX)
+ return -EINVAL;
+
+ cfg = array_index_nospec((unsigned long)cfg, PERF_MSR_EVENT_MAX);
+
if (!msr[cfg].attr)
return -EINVAL;
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 386a6900e206..219faaec51df 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -136,7 +136,6 @@
#endif
#ifndef __ASSEMBLY__
-#ifndef __BPF__
/*
* This output constraint should be used for any inline asm which has a "call"
* instruction. Otherwise the asm may be inserted before the frame pointer
@@ -146,6 +145,5 @@
register unsigned long current_stack_pointer asm(_ASM_SP);
#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
#endif
-#endif
#endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index b27da9602a6d..aced6c9290d6 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -140,6 +140,20 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
+#if defined(__clang__) && !defined(CC_HAVE_ASM_GOTO)
+
+/*
+ * Workaround for the sake of BPF compilation which utilizes kernel
+ * headers, but clang does not support ASM GOTO and fails the build.
+ */
+#ifndef __BPF_TRACING__
+#warning "Compiler lacks ASM_GOTO support. Add -D __BPF_TRACING__ to your compiler arguments"
+#endif
+
+#define static_cpu_has(bit) boot_cpu_has(bit)
+
+#else
+
/*
* Static testing of CPU features. Used the same as boot_cpu_has().
* These will statically patch the target code for additional
@@ -195,6 +209,7 @@ t_no:
boot_cpu_has(bit) : \
_static_cpu_has(bit) \
)
+#endif
#define cpu_has_bug(c, bit) cpu_has(c, (bit))
#define set_cpu_bug(c, bit) set_cpu_cap(c, (bit))
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index d554c11e01ff..fb00a2fca990 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -198,7 +198,6 @@
#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
-
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
@@ -207,13 +206,19 @@
#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
#define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */
-
+#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */
+#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */
#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
#define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */
-
#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */
+#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */
+#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation via LS_CFG MSR */
+#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */
+#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
+#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
@@ -274,9 +279,10 @@
#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */
#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */
-#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
-#define X86_FEATURE_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
-#define X86_FEATURE_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */
+#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */
+#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
@@ -320,6 +326,7 @@
#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
+#define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */
/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */
#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */
@@ -333,6 +340,7 @@
#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
+#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
/*
* BUG word(s)
@@ -362,5 +370,6 @@
#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
+#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 89ce4bfd241f..ce4d176b3d13 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -30,10 +30,7 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
return dma_ops;
}
-int arch_dma_supported(struct device *dev, u64 mask);
-#define arch_dma_supported arch_dma_supported
-
-bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
+bool arch_dma_alloc_attrs(struct device **dev);
#define arch_dma_alloc_attrs arch_dma_alloc_attrs
#endif
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 09ad88572746..cc8f8fcf9b4a 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -46,7 +46,21 @@ int ftrace_int3_handler(struct pt_regs *regs);
#endif /* CONFIG_FUNCTION_TRACER */
-#if !defined(__ASSEMBLY__) && !defined(COMPILE_OFFSETS)
+#ifndef __ASSEMBLY__
+
+#define ARCH_HAS_SYSCALL_MATCH_SYM_NAME
+static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
+{
+ /*
+ * Compare the symbol name with the system call name. Skip the
+ * "__x64_sys", "__ia32_sys" or simple "sys" prefix.
+ */
+ return !strcmp(sym + 3, name + 3) ||
+ (!strncmp(sym, "__x64_", 6) && !strcmp(sym + 9, name + 3)) ||
+ (!strncmp(sym, "__ia32_", 7) && !strcmp(sym + 10, name + 3));
+}
+
+#ifndef COMPILE_OFFSETS
#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_IA32_EMULATION)
#include <asm/compat.h>
@@ -67,6 +81,7 @@ static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
return false;
}
#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_IA32_EMULATION */
-#endif /* !__ASSEMBLY__ && !COMPILE_OFFSETS */
+#endif /* !COMPILE_OFFSETS */
+#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_FTRACE_H */
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index b3e32b010ab1..c2c01f84df75 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -208,4 +208,22 @@ static inline int insn_offset_immediate(struct insn *insn)
return insn_offset_displacement(insn) + insn->displacement.nbytes;
}
+#define POP_SS_OPCODE 0x1f
+#define MOV_SREG_OPCODE 0x8e
+
+/*
+ * Intel SDM Vol.3A 6.8.3 states;
+ * "Any single-step trap that would be delivered following the MOV to SS
+ * instruction or POP to SS instruction (because EFLAGS.TF is 1) is
+ * suppressed."
+ * This function returns true if @insn is MOV SS or POP SS. On these
+ * instructions, single stepping is suppressed.
+ */
+static inline int insn_masking_exception(struct insn *insn)
+{
+ return insn->opcode.bytes[0] == POP_SS_OPCODE ||
+ (insn->opcode.bytes[0] == MOV_SREG_OPCODE &&
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 2);
+}
+
#endif /* _ASM_X86_INSN_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 404c5fdff859..548d90bbf919 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -34,11 +34,6 @@
* (0x80 is the syscall vector, 0x30-0x3f are for ISA)
*/
#define FIRST_EXTERNAL_VECTOR 0x20
-/*
- * We start allocating at 0x21 to spread out vectors evenly between
- * priority levels. (0x80 is the syscall vector)
- */
-#define VECTOR_OFFSET_START 1
/*
* Reserve the lowest usable vector (and hence lowest priority) 0x20 for
@@ -119,8 +114,6 @@
#define FIRST_SYSTEM_VECTOR NR_VECTORS
#endif
-#define FPU_IRQ 13
-
/*
* Size the maximum number of interrupts.
*
diff --git a/arch/x86/include/asm/jailhouse_para.h b/arch/x86/include/asm/jailhouse_para.h
index b885a961a150..a34897aef2c2 100644
--- a/arch/x86/include/asm/jailhouse_para.h
+++ b/arch/x86/include/asm/jailhouse_para.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL2.0 */
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Jailhouse paravirt detection
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 949c977bc4c9..f4b2588865e9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -924,7 +924,7 @@ struct kvm_x86_ops {
int (*hardware_setup)(void); /* __init */
void (*hardware_unsetup)(void); /* __exit */
bool (*cpu_has_accelerated_tpr)(void);
- bool (*cpu_has_high_real_mode_segbase)(void);
+ bool (*has_emulated_msr)(int index);
void (*cpuid_update)(struct kvm_vcpu *vcpu);
struct kvm *(*vm_alloc)(void);
@@ -1013,6 +1013,7 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
+ u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 57e3785d0d26..cf9911b5a53c 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -193,7 +193,7 @@ static inline int init_new_context(struct task_struct *tsk,
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
- /* pkey 0 is the default and always allocated */
+ /* pkey 0 is the default and allocated implicitly */
mm->context.pkey_allocation_map = 0x1;
/* -1 means unallocated or invalid */
mm->context.execute_only_pkey = -1;
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 53d5b1b9255e..fda2114197b3 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -42,6 +42,8 @@
#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */
#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */
#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */
+#define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */
+#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */
@@ -68,6 +70,11 @@
#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */
#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */
+#define ARCH_CAP_SSB_NO (1 << 4) /*
+ * Not susceptible to Speculative Store Bypass
+ * attack, so no Speculative Store Bypass
+ * control required.
+ */
#define MSR_IA32_BBL_CR_CTL 0x00000119
#define MSR_IA32_BBL_CR_CTL3 0x0000011e
@@ -340,6 +347,8 @@
#define MSR_AMD64_SEV_ENABLED_BIT 0
#define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
+#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
+
/* Fam 17h MSRs */
#define MSR_F17H_IRPERF 0xc00000e9
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index f928ad9b143f..8b38df98548e 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -217,6 +217,14 @@ enum spectre_v2_mitigation {
SPECTRE_V2_IBRS,
};
+/* The Speculative Store Bypass disable variants */
+enum ssb_mitigation {
+ SPEC_STORE_BYPASS_NONE,
+ SPEC_STORE_BYPASS_DISABLE,
+ SPEC_STORE_BYPASS_PRCTL,
+ SPEC_STORE_BYPASS_SECCOMP,
+};
+
extern char __indirect_thunk_start[];
extern char __indirect_thunk_end[];
@@ -241,22 +249,27 @@ static inline void vmexit_fill_RSB(void)
#endif
}
-#define alternative_msr_write(_msr, _val, _feature) \
- asm volatile(ALTERNATIVE("", \
- "movl %[msr], %%ecx\n\t" \
- "movl %[val], %%eax\n\t" \
- "movl $0, %%edx\n\t" \
- "wrmsr", \
- _feature) \
- : : [msr] "i" (_msr), [val] "i" (_val) \
- : "eax", "ecx", "edx", "memory")
+static __always_inline
+void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
+{
+ asm volatile(ALTERNATIVE("", "wrmsr", %c[feature])
+ : : "c" (msr),
+ "a" ((u32)val),
+ "d" ((u32)(val >> 32)),
+ [feature] "i" (feature)
+ : "memory");
+}
static inline void indirect_branch_prediction_barrier(void)
{
- alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB,
- X86_FEATURE_USE_IBPB);
+ u64 val = PRED_CMD_IBPB;
+
+ alternative_msr_write(MSR_IA32_PRED_CMD, val, X86_FEATURE_USE_IBPB);
}
+/* The Intel SPEC CTRL MSR base value cache */
+extern u64 x86_spec_ctrl_base;
+
/*
* With retpoline, we must use IBRS to restrict branch prediction
* before calling into firmware.
@@ -265,14 +278,18 @@ static inline void indirect_branch_prediction_barrier(void)
*/
#define firmware_restrict_branch_speculation_start() \
do { \
+ u64 val = x86_spec_ctrl_base | SPEC_CTRL_IBRS; \
+ \
preempt_disable(); \
- alternative_msr_write(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS, \
+ alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \
X86_FEATURE_USE_IBRS_FW); \
} while (0)
#define firmware_restrict_branch_speculation_end() \
do { \
- alternative_msr_write(MSR_IA32_SPEC_CTRL, 0, \
+ u64 val = x86_spec_ctrl_base; \
+ \
+ alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \
X86_FEATURE_USE_IBRS_FW); \
preempt_enable(); \
} while (0)
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d32175e30259..662963681ea6 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -117,9 +117,6 @@ void native_restore_msi_irqs(struct pci_dev *dev);
#define native_setup_msi_irqs NULL
#define native_teardown_msi_irq NULL
#endif
-
-#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
-
#endif /* __KERNEL__ */
#ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5f49b4ff0c24..f1633de5a675 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -601,6 +601,11 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
#define canon_pgprot(p) __pgprot(massage_pgprot(p))
+static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
+{
+ return canon_pgprot(prot);
+}
+
static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
enum page_cache_mode pcm,
enum page_cache_mode new_pcm)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index d5c21a382475..adb47552e6bb 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -105,14 +105,14 @@ extern unsigned int ptrs_per_p4d;
#define LDT_PGD_ENTRY (pgtable_l5_enabled ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
-#define __VMALLOC_BASE_L4 0xffffc90000000000
-#define __VMALLOC_BASE_L5 0xffa0000000000000
+#define __VMALLOC_BASE_L4 0xffffc90000000000UL
+#define __VMALLOC_BASE_L5 0xffa0000000000000UL
#define VMALLOC_SIZE_TB_L4 32UL
#define VMALLOC_SIZE_TB_L5 12800UL
-#define __VMEMMAP_BASE_L4 0xffffea0000000000
-#define __VMEMMAP_BASE_L5 0xffd4000000000000
+#define __VMEMMAP_BASE_L4 0xffffea0000000000UL
+#define __VMEMMAP_BASE_L5 0xffd4000000000000UL
#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
# define VMALLOC_START vmalloc_base
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index a0ba1ffda0df..851c04b7a092 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -2,6 +2,8 @@
#ifndef _ASM_X86_PKEYS_H
#define _ASM_X86_PKEYS_H
+#define ARCH_DEFAULT_PKEY 0
+
#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)
extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
@@ -15,7 +17,7 @@ extern int __execute_only_pkey(struct mm_struct *mm);
static inline int execute_only_pkey(struct mm_struct *mm)
{
if (!boot_cpu_has(X86_FEATURE_OSPKE))
- return 0;
+ return ARCH_DEFAULT_PKEY;
return __execute_only_pkey(mm);
}
@@ -49,13 +51,21 @@ bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
{
/*
* "Allocated" pkeys are those that have been returned
- * from pkey_alloc(). pkey 0 is special, and never
- * returned from pkey_alloc().
+ * from pkey_alloc() or pkey 0 which is allocated
+ * implicitly when the mm is created.
*/
- if (pkey <= 0)
+ if (pkey < 0)
return false;
if (pkey >= arch_max_pkey())
return false;
+ /*
+ * The exec-only pkey is set in the allocation map, but
+ * is not available to any of the user interfaces like
+ * mprotect_pkey().
+ */
+ if (pkey == mm->context.execute_only_pkey)
+ return false;
+
return mm_pkey_allocation_map(mm) & (1U << pkey);
}
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4fa4206029e3..21a114914ba4 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -749,13 +749,11 @@ enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
extern void enable_sep_cpu(void);
extern int sysenter_setup(void);
-extern void early_trap_init(void);
void early_trap_pf_init(void);
/* Defined in head.S */
extern struct desc_ptr early_gdt_descr;
-extern void cpu_set_gdt(int);
extern void switch_to_new_gdt(int);
extern void load_direct_gdt(int);
extern void load_fixmap_gdt(int);
diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
new file mode 100644
index 000000000000..ae7c2c5cd7f0
--- /dev/null
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SPECCTRL_H_
+#define _ASM_X86_SPECCTRL_H_
+
+#include <linux/thread_info.h>
+#include <asm/nospec-branch.h>
+
+/*
+ * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR
+ * the guest has, while on VMEXIT we restore the host view. This
+ * would be easier if SPEC_CTRL were architecturally maskable or
+ * shadowable for guests but this is not (currently) the case.
+ * Takes the guest view of SPEC_CTRL MSR as a parameter and also
+ * the guest's version of VIRT_SPEC_CTRL, if emulated.
+ */
+extern void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool guest);
+
+/**
+ * x86_spec_ctrl_set_guest - Set speculation control registers for the guest
+ * @guest_spec_ctrl: The guest content of MSR_SPEC_CTRL
+ * @guest_virt_spec_ctrl: The guest controlled bits of MSR_VIRT_SPEC_CTRL
+ * (may get translated to MSR_AMD64_LS_CFG bits)
+ *
+ * Avoids writing to the MSR if the content/bits are the same
+ */
+static inline
+void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl)
+{
+ x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, true);
+}
+
+/**
+ * x86_spec_ctrl_restore_host - Restore host speculation control registers
+ * @guest_spec_ctrl: The guest content of MSR_SPEC_CTRL
+ * @guest_virt_spec_ctrl: The guest controlled bits of MSR_VIRT_SPEC_CTRL
+ * (may get translated to MSR_AMD64_LS_CFG bits)
+ *
+ * Avoids writing to the MSR if the content/bits are the same
+ */
+static inline
+void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl)
+{
+ x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, false);
+}
+
+/* AMD specific Speculative Store Bypass MSR data */
+extern u64 x86_amd_ls_cfg_base;
+extern u64 x86_amd_ls_cfg_ssbd_mask;
+
+static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn)
+{
+ BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT);
+ return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
+}
+
+static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl)
+{
+ BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT);
+ return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
+}
+
+static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn)
+{
+ return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL;
+}
+
+#ifdef CONFIG_SMP
+extern void speculative_store_bypass_ht_init(void);
+#else
+static inline void speculative_store_bypass_ht_init(void) { }
+#endif
+
+extern void speculative_store_bypass_update(unsigned long tif);
+
+static inline void speculative_store_bypass_update_current(void)
+{
+ speculative_store_bypass_update(current_thread_info()->flags);
+}
+
+#endif
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a5d9521bb2cb..2ff2a30a264f 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -79,6 +79,7 @@ struct thread_info {
#define TIF_SIGPENDING 2 /* signal pending */
#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
+#define TIF_SSBD 5 /* Reduced data speculation */
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
@@ -105,6 +106,7 @@ struct thread_info {
#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
+#define _TIF_SSBD (1 << TIF_SSBD)
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
@@ -144,7 +146,7 @@ struct thread_info {
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
- (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP)
+ (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_SSBD)
#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 4c851ebb3ceb..0ede697c3961 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -29,7 +29,7 @@
#define KVM_FEATURE_PV_TLB_FLUSH 9
#define KVM_FEATURE_ASYNC_PF_VMEXIT 10
-#define KVM_HINTS_DEDICATED 0
+#define KVM_HINTS_REALTIME 0
/* The last 8 bits are used to indicate how to interpret the flags field
* in pvclock structure. If no bits are set, all flags are ignored.
diff --git a/arch/x86/include/uapi/asm/msgbuf.h b/arch/x86/include/uapi/asm/msgbuf.h
index 809134c644a6..90ab9a795b49 100644
--- a/arch/x86/include/uapi/asm/msgbuf.h
+++ b/arch/x86/include/uapi/asm/msgbuf.h
@@ -1 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_X64_MSGBUF_H
+#define __ASM_X64_MSGBUF_H
+
+#if !defined(__x86_64__) || !defined(__ILP32__)
#include <asm-generic/msgbuf.h>
+#else
+/*
+ * The msqid64_ds structure for x86 architecture with x32 ABI.
+ *
+ * On x86-32 and x86-64 we can just use the generic definition, but
+ * x32 uses the same binary layout as x86_64, which is differnet
+ * from other 32-bit architectures.
+ */
+
+struct msqid64_ds {
+ struct ipc64_perm msg_perm;
+ __kernel_time_t msg_stime; /* last msgsnd time */
+ __kernel_time_t msg_rtime; /* last msgrcv time */
+ __kernel_time_t msg_ctime; /* last change time */
+ __kernel_ulong_t msg_cbytes; /* current number of bytes on queue */
+ __kernel_ulong_t msg_qnum; /* number of messages in queue */
+ __kernel_ulong_t msg_qbytes; /* max number of bytes on queue */
+ __kernel_pid_t msg_lspid; /* pid of last msgsnd */
+ __kernel_pid_t msg_lrpid; /* last receive pid */
+ __kernel_ulong_t __unused4;
+ __kernel_ulong_t __unused5;
+};
+
+#endif
+
+#endif /* __ASM_GENERIC_MSGBUF_H */
diff --git a/arch/x86/include/uapi/asm/shmbuf.h b/arch/x86/include/uapi/asm/shmbuf.h
index 83c05fc2de38..644421f3823b 100644
--- a/arch/x86/include/uapi/asm/shmbuf.h
+++ b/arch/x86/include/uapi/asm/shmbuf.h
@@ -1 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_X86_SHMBUF_H
+#define __ASM_X86_SHMBUF_H
+
+#if !defined(__x86_64__) || !defined(__ILP32__)
#include <asm-generic/shmbuf.h>
+#else
+/*
+ * The shmid64_ds structure for x86 architecture with x32 ABI.
+ *
+ * On x86-32 and x86-64 we can just use the generic definition, but
+ * x32 uses the same binary layout as x86_64, which is differnet
+ * from other 32-bit architectures.
+ */
+
+struct shmid64_ds {
+ struct ipc64_perm shm_perm; /* operation perms */
+ size_t shm_segsz; /* size of segment (bytes) */
+ __kernel_time_t shm_atime; /* last attach time */
+ __kernel_time_t shm_dtime; /* last detach time */
+ __kernel_time_t shm_ctime; /* last change time */
+ __kernel_pid_t shm_cpid; /* pid of creator */
+ __kernel_pid_t shm_lpid; /* pid of last operator */
+ __kernel_ulong_t shm_nattch; /* no. of current attaches */
+ __kernel_ulong_t __unused4;
+ __kernel_ulong_t __unused5;
+};
+
+struct shminfo64 {
+ __kernel_ulong_t shmmax;
+ __kernel_ulong_t shmmin;
+ __kernel_ulong_t shmmni;
+ __kernel_ulong_t shmseg;
+ __kernel_ulong_t shmall;
+ __kernel_ulong_t __unused1;
+ __kernel_ulong_t __unused2;
+ __kernel_ulong_t __unused3;
+ __kernel_ulong_t __unused4;
+};
+
+#endif
+
+#endif /* __ASM_X86_SHMBUF_H */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index dde444f932c1..3b20607d581b 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -215,6 +215,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
apic_id = processor->local_apic_id;
enabled = processor->lapic_flags & ACPI_MADT_ENABLED;
+ /* Ignore invalid ID */
+ if (apic_id == 0xffffffff)
+ return 0;
+
/*
* We need to register disabled CPU as well to permit
* counting disabled CPUs. This allows us to size
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index c88e0b127810..b481b95bd8f6 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -14,8 +14,11 @@
#include <asm/amd_nb.h>
#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
+#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0
#define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463
#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
+#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F3 0x15eb
+#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
/* Protect the PCI config register pairs used for SMN and DF indirect access. */
static DEFINE_MUTEX(smn_mutex);
@@ -24,6 +27,7 @@ static u32 *flush_words;
static const struct pci_device_id amd_root_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },
{}
};
@@ -39,6 +43,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
{}
};
@@ -51,6 +56,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
{}
};
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8b04234e010b..7685444a106b 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -116,6 +116,7 @@ static void init_x2apic_ldr(void)
goto update;
}
cmsk = cluster_hotplug_mask;
+ cmsk->clusterid = cluster;
cluster_hotplug_mask = NULL;
update:
this_cpu_write(cluster_masks, cmsk);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index dfcbe6924eaf..cadeafabf167 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1715,19 +1715,6 @@ static int proc_apm_show(struct seq_file *m, void *v)
return 0;
}
-static int proc_apm_open(struct inode *inode, struct file *file)
-{
- return single_open(file, proc_apm_show, NULL);
-}
-
-static const struct file_operations apm_file_ops = {
- .owner = THIS_MODULE,
- .open = proc_apm_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int apm(void *unused)
{
unsigned short bx;
@@ -2360,7 +2347,7 @@ static int __init apm_init(void)
set_desc_base(&gdt[APM_DS >> 3],
(unsigned long)__va((unsigned long)apm_info.bios.dseg << 4));
- proc_create("apm", 0, NULL, &apm_file_ops);
+ proc_create_single("apm", 0, NULL, proc_apm_show);
kapmd_task = kthread_create(apm, NULL, "kapmd");
if (IS_ERR(kapmd_task)) {
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 12bc0a1139da..1b18be3f35a8 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -10,6 +10,7 @@
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cpu.h>
+#include <asm/spec-ctrl.h>
#include <asm/smp.h>
#include <asm/pci-direct.h>
#include <asm/delay.h>
@@ -554,6 +555,26 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
rdmsrl(MSR_FAM10H_NODE_ID, value);
nodes_per_socket = ((value >> 3) & 7) + 1;
}
+
+ if (c->x86 >= 0x15 && c->x86 <= 0x17) {
+ unsigned int bit;
+
+ switch (c->x86) {
+ case 0x15: bit = 54; break;
+ case 0x16: bit = 33; break;
+ case 0x17: bit = 10; break;
+ default: return;
+ }
+ /*
+ * Try to cache the base value so further operations can
+ * avoid RMW. If that faults, do not enable SSBD.
+ */
+ if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+ setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD);
+ setup_force_cpu_cap(X86_FEATURE_SSBD);
+ x86_amd_ls_cfg_ssbd_mask = 1ULL << bit;
+ }
+ }
}
static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
@@ -791,6 +812,7 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
static void init_amd_zn(struct cpuinfo_x86 *c)
{
+ set_cpu_cap(c, X86_FEATURE_ZEN);
/*
* Fix erratum 1076: CPB feature bit not being set in CPUID. It affects
* all up to and including B1.
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index bfca937bdcc3..7416fc206b4a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -12,8 +12,10 @@
#include <linux/utsname.h>
#include <linux/cpu.h>
#include <linux/module.h>
+#include <linux/nospec.h>
+#include <linux/prctl.h>
-#include <asm/nospec-branch.h>
+#include <asm/spec-ctrl.h>
#include <asm/cmdline.h>
#include <asm/bugs.h>
#include <asm/processor.h>
@@ -27,6 +29,27 @@
#include <asm/intel-family.h>
static void __init spectre_v2_select_mitigation(void);
+static void __init ssb_select_mitigation(void);
+
+/*
+ * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
+ * writes to SPEC_CTRL contain whatever reserved bits have been set.
+ */
+u64 __ro_after_init x86_spec_ctrl_base;
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
+
+/*
+ * The vendor and possibly platform specific bits which can be modified in
+ * x86_spec_ctrl_base.
+ */
+static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS;
+
+/*
+ * AMD specific MSR info for Speculative Store Bypass control.
+ * x86_amd_ls_cfg_ssbd_mask is initialized in identify_boot_cpu().
+ */
+u64 __ro_after_init x86_amd_ls_cfg_base;
+u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;
void __init check_bugs(void)
{
@@ -37,9 +60,27 @@ void __init check_bugs(void)
print_cpu_info(&boot_cpu_data);
}
+ /*
+ * Read the SPEC_CTRL MSR to account for reserved bits which may
+ * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
+ * init code as it is not enumerated and depends on the family.
+ */
+ if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
+ rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
+ /* Allow STIBP in MSR_SPEC_CTRL if supported */
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ x86_spec_ctrl_mask |= SPEC_CTRL_STIBP;
+
/* Select the proper spectre mitigation before patching alternatives */
spectre_v2_select_mitigation();
+ /*
+ * Select proper mitigation for any exposure to the Speculative Store
+ * Bypass vulnerability.
+ */
+ ssb_select_mitigation();
+
#ifdef CONFIG_X86_32
/*
* Check whether we are able to run this kernel safely on SMP.
@@ -93,7 +134,76 @@ static const char *spectre_v2_strings[] = {
#undef pr_fmt
#define pr_fmt(fmt) "Spectre V2 : " fmt
-static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
+static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
+ SPECTRE_V2_NONE;
+
+void
+x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
+{
+ u64 msrval, guestval, hostval = x86_spec_ctrl_base;
+ struct thread_info *ti = current_thread_info();
+
+ /* Is MSR_SPEC_CTRL implemented ? */
+ if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
+ /*
+ * Restrict guest_spec_ctrl to supported values. Clear the
+ * modifiable bits in the host base value and or the
+ * modifiable bits from the guest value.
+ */
+ guestval = hostval & ~x86_spec_ctrl_mask;
+ guestval |= guest_spec_ctrl & x86_spec_ctrl_mask;
+
+ /* SSBD controlled in MSR_SPEC_CTRL */
+ if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
+ hostval |= ssbd_tif_to_spec_ctrl(ti->flags);
+
+ if (hostval != guestval) {
+ msrval = setguest ? guestval : hostval;
+ wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
+ }
+ }
+
+ /*
+ * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
+ * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported.
+ */
+ if (!static_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+ !static_cpu_has(X86_FEATURE_VIRT_SSBD))
+ return;
+
+ /*
+ * If the host has SSBD mitigation enabled, force it in the host's
+ * virtual MSR value. If its not permanently enabled, evaluate
+ * current's TIF_SSBD thread flag.
+ */
+ if (static_cpu_has(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE))
+ hostval = SPEC_CTRL_SSBD;
+ else
+ hostval = ssbd_tif_to_spec_ctrl(ti->flags);
+
+ /* Sanitize the guest value */
+ guestval = guest_virt_spec_ctrl & SPEC_CTRL_SSBD;
+
+ if (hostval != guestval) {
+ unsigned long tif;
+
+ tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) :
+ ssbd_spec_ctrl_to_tif(hostval);
+
+ speculative_store_bypass_update(tif);
+ }
+}
+EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl);
+
+static void x86_amd_ssb_disable(void)
+{
+ u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask;
+
+ if (boot_cpu_has(X86_FEATURE_VIRT_SSBD))
+ wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD);
+ else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+ wrmsrl(MSR_AMD64_LS_CFG, msrval);
+}
#ifdef RETPOLINE
static bool spectre_v2_bad_module;
@@ -312,32 +422,289 @@ retpoline_auto:
}
#undef pr_fmt
+#define pr_fmt(fmt) "Speculative Store Bypass: " fmt
+
+static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE;
+
+/* The kernel command line selection */
+enum ssb_mitigation_cmd {
+ SPEC_STORE_BYPASS_CMD_NONE,
+ SPEC_STORE_BYPASS_CMD_AUTO,
+ SPEC_STORE_BYPASS_CMD_ON,
+ SPEC_STORE_BYPASS_CMD_PRCTL,
+ SPEC_STORE_BYPASS_CMD_SECCOMP,
+};
+
+static const char *ssb_strings[] = {
+ [SPEC_STORE_BYPASS_NONE] = "Vulnerable",
+ [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled",
+ [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl",
+ [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp",
+};
+
+static const struct {
+ const char *option;
+ enum ssb_mitigation_cmd cmd;
+} ssb_mitigation_options[] = {
+ { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */
+ { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */
+ { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */
+ { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */
+ { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */
+};
+
+static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
+{
+ enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO;
+ char arg[20];
+ int ret, i;
+
+ if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) {
+ return SPEC_STORE_BYPASS_CMD_NONE;
+ } else {
+ ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
+ arg, sizeof(arg));
+ if (ret < 0)
+ return SPEC_STORE_BYPASS_CMD_AUTO;
+
+ for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) {
+ if (!match_option(arg, ret, ssb_mitigation_options[i].option))
+ continue;
+
+ cmd = ssb_mitigation_options[i].cmd;
+ break;
+ }
+
+ if (i >= ARRAY_SIZE(ssb_mitigation_options)) {
+ pr_err("unknown option (%s). Switching to AUTO select\n", arg);
+ return SPEC_STORE_BYPASS_CMD_AUTO;
+ }
+ }
+
+ return cmd;
+}
+
+static enum ssb_mitigation __init __ssb_select_mitigation(void)
+{
+ enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE;
+ enum ssb_mitigation_cmd cmd;
+
+ if (!boot_cpu_has(X86_FEATURE_SSBD))
+ return mode;
+
+ cmd = ssb_parse_cmdline();
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) &&
+ (cmd == SPEC_STORE_BYPASS_CMD_NONE ||
+ cmd == SPEC_STORE_BYPASS_CMD_AUTO))
+ return mode;
+
+ switch (cmd) {
+ case SPEC_STORE_BYPASS_CMD_AUTO:
+ case SPEC_STORE_BYPASS_CMD_SECCOMP:
+ /*
+ * Choose prctl+seccomp as the default mode if seccomp is
+ * enabled.
+ */
+ if (IS_ENABLED(CONFIG_SECCOMP))
+ mode = SPEC_STORE_BYPASS_SECCOMP;
+ else
+ mode = SPEC_STORE_BYPASS_PRCTL;
+ break;
+ case SPEC_STORE_BYPASS_CMD_ON:
+ mode = SPEC_STORE_BYPASS_DISABLE;
+ break;
+ case SPEC_STORE_BYPASS_CMD_PRCTL:
+ mode = SPEC_STORE_BYPASS_PRCTL;
+ break;
+ case SPEC_STORE_BYPASS_CMD_NONE:
+ break;
+ }
+
+ /*
+ * We have three CPU feature flags that are in play here:
+ * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
+ * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass
+ * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation
+ */
+ if (mode == SPEC_STORE_BYPASS_DISABLE) {
+ setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
+ /*
+ * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses
+ * a completely different MSR and bit dependent on family.
+ */
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
+ x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
+ wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ break;
+ case X86_VENDOR_AMD:
+ x86_amd_ssb_disable();
+ break;
+ }
+ }
+
+ return mode;
+}
+
+static void ssb_select_mitigation(void)
+{
+ ssb_mode = __ssb_select_mitigation();
+
+ if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ pr_info("%s\n", ssb_strings[ssb_mode]);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Speculation prctl: " fmt
+
+static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+ bool update;
+
+ if (ssb_mode != SPEC_STORE_BYPASS_PRCTL &&
+ ssb_mode != SPEC_STORE_BYPASS_SECCOMP)
+ return -ENXIO;
+
+ switch (ctrl) {
+ case PR_SPEC_ENABLE:
+ /* If speculation is force disabled, enable is not allowed */
+ if (task_spec_ssb_force_disable(task))
+ return -EPERM;
+ task_clear_spec_ssb_disable(task);
+ update = test_and_clear_tsk_thread_flag(task, TIF_SSBD);
+ break;
+ case PR_SPEC_DISABLE:
+ task_set_spec_ssb_disable(task);
+ update = !test_and_set_tsk_thread_flag(task, TIF_SSBD);
+ break;
+ case PR_SPEC_FORCE_DISABLE:
+ task_set_spec_ssb_disable(task);
+ task_set_spec_ssb_force_disable(task);
+ update = !test_and_set_tsk_thread_flag(task, TIF_SSBD);
+ break;
+ default:
+ return -ERANGE;
+ }
+
+ /*
+ * If being set on non-current task, delay setting the CPU
+ * mitigation until it is next scheduled.
+ */
+ if (task == current && update)
+ speculative_store_bypass_update_current();
+
+ return 0;
+}
+
+int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
+ unsigned long ctrl)
+{
+ switch (which) {
+ case PR_SPEC_STORE_BYPASS:
+ return ssb_prctl_set(task, ctrl);
+ default:
+ return -ENODEV;
+ }
+}
+
+#ifdef CONFIG_SECCOMP
+void arch_seccomp_spec_mitigate(struct task_struct *task)
+{
+ if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP)
+ ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
+}
+#endif
+
+static int ssb_prctl_get(struct task_struct *task)
+{
+ switch (ssb_mode) {
+ case SPEC_STORE_BYPASS_DISABLE:
+ return PR_SPEC_DISABLE;
+ case SPEC_STORE_BYPASS_SECCOMP:
+ case SPEC_STORE_BYPASS_PRCTL:
+ if (task_spec_ssb_force_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+ if (task_spec_ssb_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+ default:
+ if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ return PR_SPEC_ENABLE;
+ return PR_SPEC_NOT_AFFECTED;
+ }
+}
+
+int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
+{
+ switch (which) {
+ case PR_SPEC_STORE_BYPASS:
+ return ssb_prctl_get(task);
+ default:
+ return -ENODEV;
+ }
+}
+
+void x86_spec_ctrl_setup_ap(void)
+{
+ if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
+ wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
+ if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
+ x86_amd_ssb_disable();
+}
#ifdef CONFIG_SYSFS
-ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
+
+static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
+ char *buf, unsigned int bug)
{
- if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+ if (!boot_cpu_has_bug(bug))
return sprintf(buf, "Not affected\n");
- if (boot_cpu_has(X86_FEATURE_PTI))
- return sprintf(buf, "Mitigation: PTI\n");
+
+ switch (bug) {
+ case X86_BUG_CPU_MELTDOWN:
+ if (boot_cpu_has(X86_FEATURE_PTI))
+ return sprintf(buf, "Mitigation: PTI\n");
+
+ break;
+
+ case X86_BUG_SPECTRE_V1:
+ return sprintf(buf, "Mitigation: __user pointer sanitization\n");
+
+ case X86_BUG_SPECTRE_V2:
+ return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
+ boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
+ boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+ spectre_v2_module_string());
+
+ case X86_BUG_SPEC_STORE_BYPASS:
+ return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
+
+ default:
+ break;
+ }
+
return sprintf(buf, "Vulnerable\n");
}
+ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN);
+}
+
ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf)
{
- if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
- return sprintf(buf, "Not affected\n");
- return sprintf(buf, "Mitigation: __user pointer sanitization\n");
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1);
}
ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf)
{
- if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
- return sprintf(buf, "Not affected\n");
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2);
+}
- return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
- boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
- boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
- spectre_v2_module_string());
+ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS);
}
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 8a5b185735e1..38276f58d3bf 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -757,17 +757,32 @@ static void init_speculation_control(struct cpuinfo_x86 *c)
* and they also have a different bit for STIBP support. Also,
* a hypervisor might have set the individual AMD bits even on
* Intel CPUs, for finer-grained selection of what's available.
- *
- * We use the AMD bits in 0x8000_0008 EBX as the generic hardware
- * features, which are visible in /proc/cpuinfo and used by the
- * kernel. So set those accordingly from the Intel bits.
*/
if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
set_cpu_cap(c, X86_FEATURE_IBRS);
set_cpu_cap(c, X86_FEATURE_IBPB);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
}
+
if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
set_cpu_cap(c, X86_FEATURE_STIBP);
+
+ if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD) ||
+ cpu_has(c, X86_FEATURE_VIRT_SSBD))
+ set_cpu_cap(c, X86_FEATURE_SSBD);
+
+ if (cpu_has(c, X86_FEATURE_AMD_IBRS)) {
+ set_cpu_cap(c, X86_FEATURE_IBRS);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ }
+
+ if (cpu_has(c, X86_FEATURE_AMD_IBPB))
+ set_cpu_cap(c, X86_FEATURE_IBPB);
+
+ if (cpu_has(c, X86_FEATURE_AMD_STIBP)) {
+ set_cpu_cap(c, X86_FEATURE_STIBP);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ }
}
void get_cpu_cap(struct cpuinfo_x86 *c)
@@ -848,6 +863,11 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_power = edx;
}
+ if (c->extended_cpuid_level >= 0x80000008) {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[CPUID_8000_0008_EBX] = ebx;
+ }
+
if (c->extended_cpuid_level >= 0x8000000a)
c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
@@ -871,7 +891,6 @@ static void get_cpu_address_sizes(struct cpuinfo_x86 *c)
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
- c->x86_capability[CPUID_8000_0008_EBX] = ebx;
}
#ifdef CONFIG_X86_32
else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
@@ -923,21 +942,47 @@ static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
{}
};
-static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c)
+/* Only list CPUs which speculate but are non susceptible to SSB */
+static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
+ { X86_VENDOR_AMD, 0x12, },
+ { X86_VENDOR_AMD, 0x11, },
+ { X86_VENDOR_AMD, 0x10, },
+ { X86_VENDOR_AMD, 0xf, },
+ {}
+};
+
+static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
{
u64 ia32_cap = 0;
- if (x86_match_cpu(cpu_no_meltdown))
- return false;
+ if (x86_match_cpu(cpu_no_speculation))
+ return;
+
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+ if (!x86_match_cpu(cpu_no_spec_store_bypass) &&
+ !(ia32_cap & ARCH_CAP_SSB_NO))
+ setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
+
+ if (x86_match_cpu(cpu_no_meltdown))
+ return;
+
/* Rogue Data Cache Load? No! */
if (ia32_cap & ARCH_CAP_RDCL_NO)
- return false;
+ return;
- return true;
+ setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
}
/*
@@ -988,12 +1033,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
- if (!x86_match_cpu(cpu_no_speculation)) {
- if (cpu_vulnerable_to_meltdown(c))
- setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
- setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
- setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
- }
+ cpu_set_bug_bits(c);
fpu__init_system(c);
@@ -1355,6 +1395,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
#endif
mtrr_ap_init();
validate_apic_and_package_id(c);
+ x86_spec_ctrl_setup_ap();
}
static __init int setup_noclflush(char *arg)
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index e806b11a99af..37672d299e35 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -50,4 +50,6 @@ extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
+extern void x86_spec_ctrl_setup_ap(void);
+
#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b9693b80fc21..577e7f7ae273 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -188,7 +188,10 @@ static void early_init_intel(struct cpuinfo_x86 *c)
setup_clear_cpu_cap(X86_FEATURE_IBPB);
setup_clear_cpu_cap(X86_FEATURE_STIBP);
setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
+ setup_clear_cpu_cap(X86_FEATURE_MSR_SPEC_CTRL);
setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
+ setup_clear_cpu_cap(X86_FEATURE_SSBD);
+ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL_SSBD);
}
/*
@@ -835,6 +838,9 @@ static const struct _tlb_table intel_tlb_table[] = {
{ 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
{ 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" },
{ 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" },
+ { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" },
+ { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" },
+ { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" },
{ 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
{ 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
{ 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f7666eef4a87..c8e038800591 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -94,6 +94,11 @@ static struct smca_bank_name smca_names[] = {
[SMCA_SMU] = { "smu", "System Management Unit" },
};
+static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init =
+{
+ [0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 }
+};
+
const char *smca_get_name(enum smca_bank_types t)
{
if (t >= N_SMCA_BANK_TYPES)
@@ -443,20 +448,26 @@ static u32 smca_get_block_address(unsigned int cpu, unsigned int bank,
if (!block)
return MSR_AMD64_SMCA_MCx_MISC(bank);
+ /* Check our cache first: */
+ if (smca_bank_addrs[bank][block] != -1)
+ return smca_bank_addrs[bank][block];
+
/*
* For SMCA enabled processors, BLKPTR field of the first MISC register
* (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).
*/
if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
- return addr;
+ goto out;
if (!(low & MCI_CONFIG_MCAX))
- return addr;
+ goto out;
if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
(low & MASK_BLKPTR_LO))
- return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+ addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+out:
+ smca_bank_addrs[bank][block] = addr;
return addr;
}
@@ -468,18 +479,6 @@ static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 hi
if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
return addr;
- /* Get address from already initialized block. */
- if (per_cpu(threshold_banks, cpu)) {
- struct threshold_bank *bankp = per_cpu(threshold_banks, cpu)[bank];
-
- if (bankp && bankp->blocks) {
- struct threshold_block *blockp = &bankp->blocks[block];
-
- if (blockp)
- return blockp->address;
- }
- }
-
if (mce_flags.smca)
return smca_get_block_address(cpu, bank, block);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 10c4fc2c91f8..77e201301528 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -564,14 +564,12 @@ static int __reload_late(void *info)
apply_microcode_local(&err);
spin_unlock(&update_lock);
+ /* siblings return UCODE_OK because their engine got updated already */
if (err > UCODE_NFOUND) {
pr_warn("Error reloading microcode on CPU %d\n", cpu);
- return -1;
- /* siblings return UCODE_OK because their engine got updated already */
+ ret = -1;
} else if (err == UCODE_UPDATED || err == UCODE_OK) {
ret = 1;
- } else {
- return ret;
}
/*
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 32b8e5724f96..1c2cfa0644aa 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -485,7 +485,6 @@ static void show_saved_mc(void)
*/
static void save_mc_for_early(u8 *mc, unsigned int size)
{
-#ifdef CONFIG_HOTPLUG_CPU
/* Synchronization during CPU hotplug. */
static DEFINE_MUTEX(x86_cpu_microcode_mutex);
@@ -495,7 +494,6 @@ static void save_mc_for_early(u8 *mc, unsigned int size)
show_saved_mc();
mutex_unlock(&x86_cpu_microcode_mutex);
-#endif
}
static bool load_builtin_intel_microcode(struct cpio_data *cp)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 0c408f8c4ed4..2d29e47c056e 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -104,6 +104,12 @@ static bool __head check_la57_support(unsigned long physaddr)
}
#endif
+/* Code in __startup_64() can be relocated during execution, but the compiler
+ * doesn't have to generate PC-relative relocations when accessing globals from
+ * that function. Clang actually does not generate them, which leads to
+ * boot-time crashes. To work around this problem, every global pointer must
+ * be adjusted using fixup_pointer().
+ */
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
@@ -113,6 +119,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
p4dval_t *p4d;
pudval_t *pud;
pmdval_t *pmd, pmd_entry;
+ pteval_t *mask_ptr;
bool la57;
int i;
unsigned int *next_pgt_ptr;
@@ -196,7 +203,8 @@ unsigned long __head __startup_64(unsigned long physaddr,
pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
/* Filter out unsupported __PAGE_KERNEL_* bits: */
- pmd_entry &= __supported_pte_mask;
+ mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr);
+ pmd_entry &= *mask_ptr;
pmd_entry += sme_get_me_mask();
pmd_entry += physaddr;
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index fa183a131edc..a15fe0e92cf9 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL2.0
+// SPDX-License-Identifier: GPL-2.0
/*
* Jailhouse paravirt_ops implementation
*
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 3182908b7e6c..7326078eaa7a 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -398,11 +398,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
* little bit simple
*/
efi_map_sz = efi_get_runtime_map_size();
- efi_map_sz = ALIGN(efi_map_sz, 16);
params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
MAX_ELFCOREHDR_STR_LEN;
params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
- kbuf.bufsz = params_cmdline_sz + efi_map_sz +
+ kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) +
sizeof(struct setup_data) +
sizeof(struct efi_setup_data);
@@ -410,7 +409,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
if (!params)
return ERR_PTR(-ENOMEM);
efi_map_offset = params_cmdline_sz;
- efi_setup_data_offset = efi_map_offset + efi_map_sz;
+ efi_setup_data_offset = efi_map_offset + ALIGN(efi_map_sz, 16);
/* Copy setup header onto bootparams. Documentation/x86/boot.txt */
setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset;
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 0715f827607c..6f4d42377fe5 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -370,6 +370,10 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
return 0;
+ /* We should not singlestep on the exception masking instructions */
+ if (insn_masking_exception(insn))
+ return 0;
+
#ifdef CONFIG_X86_64
/* Only x86_64 has RIP relative instructions */
if (insn_rip_relative(insn)) {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 7867417cfaff..5b2300b818af 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -457,7 +457,7 @@ static void __init sev_map_percpu_data(void)
static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
{
native_smp_prepare_cpus(max_cpus);
- if (kvm_para_has_hint(KVM_HINTS_DEDICATED))
+ if (kvm_para_has_hint(KVM_HINTS_REALTIME))
static_branch_disable(&virt_spin_lock_key);
}
@@ -553,7 +553,7 @@ static void __init kvm_guest_init(void)
}
if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
- !kvm_para_has_hint(KVM_HINTS_DEDICATED) &&
+ !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
@@ -649,7 +649,7 @@ static __init int kvm_setup_pv_tlb_flush(void)
int cpu;
if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
- !kvm_para_has_hint(KVM_HINTS_DEDICATED) &&
+ !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
for_each_possible_cpu(cpu) {
zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
@@ -745,7 +745,7 @@ void __init kvm_spinlock_init(void)
if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
return;
- if (kvm_para_has_hint(KVM_HINTS_DEDICATED))
+ if (kvm_para_has_hint(KVM_HINTS_REALTIME))
return;
__pv_init_lock_hash();
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index d41d896481b8..c9b14020f4dd 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -166,7 +166,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
*/
pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL);
/* Filter out unsuppored __PAGE_KERNEL* bits: */
- pgprot_val(pte_prot) |= __supported_pte_mask;
+ pgprot_val(pte_prot) &= __supported_pte_mask;
pte = pfn_pte(pfn, pte_prot);
set_pte_at(mm, va, ptep, pte);
pte_unmap_unlock(ptep, ptl);
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 60cdec6628b0..d1ab07ec8c9a 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -57,12 +57,17 @@ static void load_segments(void)
static void machine_kexec_free_page_tables(struct kimage *image)
{
free_page((unsigned long)image->arch.pgd);
+ image->arch.pgd = NULL;
#ifdef CONFIG_X86_PAE
free_page((unsigned long)image->arch.pmd0);
+ image->arch.pmd0 = NULL;
free_page((unsigned long)image->arch.pmd1);
+ image->arch.pmd1 = NULL;
#endif
free_page((unsigned long)image->arch.pte0);
+ image->arch.pte0 = NULL;
free_page((unsigned long)image->arch.pte1);
+ image->arch.pte1 = NULL;
}
static int machine_kexec_alloc_page_tables(struct kimage *image)
@@ -79,7 +84,6 @@ static int machine_kexec_alloc_page_tables(struct kimage *image)
!image->arch.pmd0 || !image->arch.pmd1 ||
#endif
!image->arch.pte0 || !image->arch.pte1) {
- machine_kexec_free_page_tables(image);
return -ENOMEM;
}
return 0;
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index a5e55d832d0a..6010449ca6d2 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -39,9 +39,13 @@ const struct kexec_file_ops * const kexec_file_loaders[] = {
static void free_transition_pgtable(struct kimage *image)
{
free_page((unsigned long)image->arch.p4d);
+ image->arch.p4d = NULL;
free_page((unsigned long)image->arch.pud);
+ image->arch.pud = NULL;
free_page((unsigned long)image->arch.pmd);
+ image->arch.pmd = NULL;
free_page((unsigned long)image->arch.pte);
+ image->arch.pte = NULL;
}
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
@@ -91,7 +95,6 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
return 0;
err:
- free_transition_pgtable(image);
return result;
}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 77625b60a510..ab5d9dd668d2 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -15,13 +15,11 @@
#include <asm/x86_init.h>
#include <asm/iommu_table.h>
-static int forbid_dac __read_mostly;
+static bool disable_dac_quirk __read_mostly;
const struct dma_map_ops *dma_ops = &dma_direct_ops;
EXPORT_SYMBOL(dma_ops);
-static int iommu_sac_force __read_mostly;
-
#ifdef CONFIG_IOMMU_DEBUG
int panic_on_overflow __read_mostly = 1;
int force_iommu __read_mostly = 1;
@@ -55,9 +53,6 @@ struct device x86_dma_fallback_dev = {
};
EXPORT_SYMBOL(x86_dma_fallback_dev);
-/* Number of entries preallocated for DMA-API debugging */
-#define PREALLOC_DMA_DEBUG_ENTRIES 65536
-
void __init pci_iommu_alloc(void)
{
struct iommu_table_entry *p;
@@ -76,7 +71,7 @@ void __init pci_iommu_alloc(void)
}
}
-bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp)
+bool arch_dma_alloc_attrs(struct device **dev)
{
if (!*dev)
*dev = &x86_dma_fallback_dev;
@@ -125,13 +120,13 @@ static __init int iommu_setup(char *p)
if (!strncmp(p, "nomerge", 7))
iommu_merge = 0;
if (!strncmp(p, "forcesac", 8))
- iommu_sac_force = 1;
+ pr_warn("forcesac option ignored.\n");
if (!strncmp(p, "allowdac", 8))
- forbid_dac = 0;
+ pr_warn("allowdac option ignored.\n");
if (!strncmp(p, "nodac", 5))
- forbid_dac = 1;
+ pr_warn("nodac option ignored.\n");
if (!strncmp(p, "usedac", 6)) {
- forbid_dac = -1;
+ disable_dac_quirk = true;
return 1;
}
#ifdef CONFIG_SWIOTLB
@@ -156,40 +151,9 @@ static __init int iommu_setup(char *p)
}
early_param("iommu", iommu_setup);
-int arch_dma_supported(struct device *dev, u64 mask)
-{
-#ifdef CONFIG_PCI
- if (mask > 0xffffffff && forbid_dac > 0) {
- dev_info(dev, "PCI: Disallowing DAC for device\n");
- return 0;
- }
-#endif
-
- /* Tell the device to use SAC when IOMMU force is on. This
- allows the driver to use cheaper accesses in some cases.
-
- Problem with this is that if we overflow the IOMMU area and
- return DAC as fallback address the device may not handle it
- correctly.
-
- As a special case some controllers have a 39bit address
- mode that is as efficient as 32bit (aic79xx). Don't force
- SAC for these. Assume all masks <= 40 bits are of this
- type. Normally this doesn't make any difference, but gives
- more gentle handling of IOMMU overflow. */
- if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) {
- dev_info(dev, "Force SAC with mask %Lx\n", mask);
- return 0;
- }
-
- return 1;
-}
-EXPORT_SYMBOL(arch_dma_supported);
-
static int __init pci_iommu_init(void)
{
struct iommu_table_entry *p;
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
#ifdef CONFIG_PCI
dma_debug_add_bus(&pci_bus_type);
@@ -209,11 +173,17 @@ rootfs_initcall(pci_iommu_init);
#ifdef CONFIG_PCI
/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
+static int via_no_dac_cb(struct pci_dev *pdev, void *data)
+{
+ pdev->dev.dma_32bit_limit = true;
+ return 0;
+}
+
static void via_no_dac(struct pci_dev *dev)
{
- if (forbid_dac == 0) {
+ if (!disable_dac_quirk) {
dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
- forbid_dac = 1;
+ pci_walk_bus(dev->subordinate, via_no_dac_cb, NULL);
}
}
DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID,
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
deleted file mode 100644
index ac7ea3a8242f..000000000000
--- a/arch/x86/kernel/pci-nommu.c
+++ /dev/null
@@ -1,90 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Fallback functions when the main IOMMU code is not compiled in. This
- code is roughly equivalent to i386. */
-#include <linux/dma-direct.h>
-#include <linux/scatterlist.h>
-#include <linux/string.h>
-#include <linux/gfp.h>
-#include <linux/pci.h>
-#include <linux/mm.h>
-
-#include <asm/processor.h>
-#include <asm/iommu.h>
-#include <asm/dma.h>
-
-#define NOMMU_MAPPING_ERROR 0
-
-static int
-check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
-{
- if (hwdev && !dma_capable(hwdev, bus, size)) {
- if (*hwdev->dma_mask >= DMA_BIT_MASK(32))
- printk(KERN_ERR
- "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
- name, (long long)bus, size,
- (long long)*hwdev->dma_mask);
- return 0;
- }
- return 1;
-}
-
-static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- unsigned long attrs)
-{
- dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset;
- WARN_ON(size == 0);
- if (!check_addr("map_single", dev, bus, size))
- return NOMMU_MAPPING_ERROR;
- return bus;
-}
-
-/* Map a set of buffers described by scatterlist in streaming
- * mode for DMA. This is the scatter-gather version of the
- * above pci_map_single interface. Here the scatter gather list
- * elements are each tagged with the appropriate dma address
- * and length. They are obtained via sg_dma_{address,length}(SG).
- *
- * NOTE: An implementation may be able to use a smaller number of
- * DMA address/length pairs than there are SG table elements.
- * (for example via virtual mapping capabilities)
- * The routine returns the number of addr/length pairs actually
- * used, at most nents.
- *
- * Device ownership issues as mentioned above for pci_map_single are
- * the same here.
- */
-static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
- int nents, enum dma_data_direction dir,
- unsigned long attrs)
-{
- struct scatterlist *s;
- int i;
-
- WARN_ON(nents == 0 || sg[0].length == 0);
-
- for_each_sg(sg, s, nents, i) {
- BUG_ON(!sg_page(s));
- s->dma_address = sg_phys(s);
- if (!check_addr("map_sg", hwdev, s->dma_address, s->length))
- return 0;
- s->dma_length = s->length;
- }
- return nents;
-}
-
-static int nommu_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
- return dma_addr == NOMMU_MAPPING_ERROR;
-}
-
-const struct dma_map_ops nommu_dma_ops = {
- .alloc = dma_generic_alloc_coherent,
- .free = dma_generic_free_coherent,
- .map_sg = nommu_map_sg,
- .map_page = nommu_map_page,
- .is_phys = 1,
- .mapping_error = nommu_mapping_error,
- .dma_supported = x86_dma_supported,
-};
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 03408b942adb..30ca2d1a9231 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -38,6 +38,7 @@
#include <asm/switch_to.h>
#include <asm/desc.h>
#include <asm/prctl.h>
+#include <asm/spec-ctrl.h>
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -278,6 +279,148 @@ static inline void switch_to_bitmap(struct tss_struct *tss,
}
}
+#ifdef CONFIG_SMP
+
+struct ssb_state {
+ struct ssb_state *shared_state;
+ raw_spinlock_t lock;
+ unsigned int disable_state;
+ unsigned long local_state;
+};
+
+#define LSTATE_SSB 0
+
+static DEFINE_PER_CPU(struct ssb_state, ssb_state);
+
+void speculative_store_bypass_ht_init(void)
+{
+ struct ssb_state *st = this_cpu_ptr(&ssb_state);
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ st->local_state = 0;
+
+ /*
+ * Shared state setup happens once on the first bringup
+ * of the CPU. It's not destroyed on CPU hotunplug.
+ */
+ if (st->shared_state)
+ return;
+
+ raw_spin_lock_init(&st->lock);
+
+ /*
+ * Go over HT siblings and check whether one of them has set up the
+ * shared state pointer already.
+ */
+ for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) {
+ if (cpu == this_cpu)
+ continue;
+
+ if (!per_cpu(ssb_state, cpu).shared_state)
+ continue;
+
+ /* Link it to the state of the sibling: */
+ st->shared_state = per_cpu(ssb_state, cpu).shared_state;
+ return;
+ }
+
+ /*
+ * First HT sibling to come up on the core. Link shared state of
+ * the first HT sibling to itself. The siblings on the same core
+ * which come up later will see the shared state pointer and link
+ * themself to the state of this CPU.
+ */
+ st->shared_state = st;
+}
+
+/*
+ * Logic is: First HT sibling enables SSBD for both siblings in the core
+ * and last sibling to disable it, disables it for the whole core. This how
+ * MSR_SPEC_CTRL works in "hardware":
+ *
+ * CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL
+ */
+static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
+{
+ struct ssb_state *st = this_cpu_ptr(&ssb_state);
+ u64 msr = x86_amd_ls_cfg_base;
+
+ if (!static_cpu_has(X86_FEATURE_ZEN)) {
+ msr |= ssbd_tif_to_amd_ls_cfg(tifn);
+ wrmsrl(MSR_AMD64_LS_CFG, msr);
+ return;
+ }
+
+ if (tifn & _TIF_SSBD) {
+ /*
+ * Since this can race with prctl(), block reentry on the
+ * same CPU.
+ */
+ if (__test_and_set_bit(LSTATE_SSB, &st->local_state))
+ return;
+
+ msr |= x86_amd_ls_cfg_ssbd_mask;
+
+ raw_spin_lock(&st->shared_state->lock);
+ /* First sibling enables SSBD: */
+ if (!st->shared_state->disable_state)
+ wrmsrl(MSR_AMD64_LS_CFG, msr);
+ st->shared_state->disable_state++;
+ raw_spin_unlock(&st->shared_state->lock);
+ } else {
+ if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state))
+ return;
+
+ raw_spin_lock(&st->shared_state->lock);
+ st->shared_state->disable_state--;
+ if (!st->shared_state->disable_state)
+ wrmsrl(MSR_AMD64_LS_CFG, msr);
+ raw_spin_unlock(&st->shared_state->lock);
+ }
+}
+#else
+static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
+{
+ u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn);
+
+ wrmsrl(MSR_AMD64_LS_CFG, msr);
+}
+#endif
+
+static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
+{
+ /*
+ * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL,
+ * so ssbd_tif_to_spec_ctrl() just works.
+ */
+ wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
+}
+
+static __always_inline void intel_set_ssb_state(unsigned long tifn)
+{
+ u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn);
+
+ wrmsrl(MSR_IA32_SPEC_CTRL, msr);
+}
+
+static __always_inline void __speculative_store_bypass_update(unsigned long tifn)
+{
+ if (static_cpu_has(X86_FEATURE_VIRT_SSBD))
+ amd_set_ssb_virt_state(tifn);
+ else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+ amd_set_core_ssb_state(tifn);
+ else
+ intel_set_ssb_state(tifn);
+}
+
+void speculative_store_bypass_update(unsigned long tif)
+{
+ preempt_disable();
+ __speculative_store_bypass_update(tif);
+ preempt_enable();
+}
+
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss)
{
@@ -309,6 +452,9 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
if ((tifp ^ tifn) & _TIF_NOCPUID)
set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
+
+ if ((tifp ^ tifn) & _TIF_SSBD)
+ __speculative_store_bypass_update(tifn);
}
/*
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4b100fe0f508..12bb445fb98d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -542,6 +542,7 @@ void set_personality_64bit(void)
clear_thread_flag(TIF_X32);
/* Pretend that this comes from a 64bit execve */
task_pt_regs(current)->orig_ax = __NR_execve;
+ current_thread_info()->status &= ~TS_COMPAT;
/* Ensure the corresponding mm is not marked. */
if (current->mm)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6285697b6e56..5c623dfe39d1 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -50,6 +50,7 @@
#include <linux/init_ohci1394_dma.h>
#include <linux/kvm_para.h>
#include <linux/dma-contiguous.h>
+#include <xen/xen.h>
#include <linux/errno.h>
#include <linux/kernel.h>
@@ -534,6 +535,11 @@ static void __init reserve_crashkernel(void)
high = true;
}
+ if (xen_pv_domain()) {
+ pr_info("Ignoring crashkernel for a Xen PV domain\n");
+ return;
+ }
+
/* 0 means: find the address automatically */
if (crash_base <= 0) {
/*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ff99e2b6fc54..9dd324ae4832 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -77,6 +77,9 @@
#include <asm/i8259.h>
#include <asm/misc.h>
#include <asm/qspinlock.h>
+#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
+#include <asm/spec-ctrl.h>
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
@@ -242,6 +245,8 @@ static void notrace start_secondary(void *unused)
*/
check_tsc_sync_target();
+ speculative_store_bypass_ht_init();
+
/*
* Lock vector_lock, set CPU online and bring the vector
* allocator online. Online must be set with vector_lock held
@@ -390,15 +395,47 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false;
}
+/*
+ * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs.
+ *
+ * These are Intel CPUs that enumerate an LLC that is shared by
+ * multiple NUMA nodes. The LLC on these systems is shared for
+ * off-package data access but private to the NUMA node (half
+ * of the package) for on-package access.
+ *
+ * CPUID (the source of the information about the LLC) can only
+ * enumerate the cache as being shared *or* unshared, but not
+ * this particular configuration. The CPU in this case enumerates
+ * the cache to be shared across the entire package (spanning both
+ * NUMA nodes).
+ */
+
+static const struct x86_cpu_id snc_cpu[] = {
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X },
+ {}
+};
+
static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
- if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
- per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
- return topology_sane(c, o, "llc");
+ /* Do not match if we do not have a valid APICID for cpu: */
+ if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
+ return false;
- return false;
+ /* Do not match if LLC id does not match: */
+ if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2))
+ return false;
+
+ /*
+ * Allow the SNC topology without warning. Return of false
+ * means 'c' does not share the LLC of 'o'. This will be
+ * reflected to userspace.
+ */
+ if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu))
+ return false;
+
+ return topology_sane(c, o, "llc");
}
/*
@@ -456,7 +493,8 @@ static struct sched_domain_topology_level x86_topology[] = {
/*
* Set if a package/die has multiple NUMA nodes inside.
- * AMD Magny-Cours and Intel Cluster-on-Die have this.
+ * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
+ * Sub-NUMA Clustering have this.
*/
static bool x86_has_numa_in_package;
@@ -1257,6 +1295,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_mtrr_aps_delayed_init();
smp_quirk_init_udelay();
+
+ speculative_store_bypass_ht_init();
}
void arch_enable_nonboot_cpus_begin(void)
@@ -1536,6 +1576,8 @@ static inline void mwait_play_dead(void)
void *mwait_ptr;
int i;
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return;
if (!this_cpu_has(X86_FEATURE_MWAIT))
return;
if (!this_cpu_has(X86_FEATURE_CLFLUSH))
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index ef32297ff17e..74392d9d51e0 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -317,7 +317,7 @@ static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
hpet2 -= hpet1;
tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
do_div(tmp, 1000000);
- do_div(deltatsc, tmp);
+ deltatsc = div64_u64(deltatsc, tmp);
return (unsigned long) deltatsc;
}
@@ -1067,6 +1067,7 @@ static struct clocksource clocksource_tsc_early = {
.resume = tsc_resume,
.mark_unstable = tsc_cs_mark_unstable,
.tick_stable = tsc_cs_tick_stable,
+ .list = LIST_HEAD_INIT(clocksource_tsc_early.list),
};
/*
@@ -1086,6 +1087,7 @@ static struct clocksource clocksource_tsc = {
.resume = tsc_resume,
.mark_unstable = tsc_cs_mark_unstable,
.tick_stable = tsc_cs_tick_stable,
+ .list = LIST_HEAD_INIT(clocksource_tsc.list),
};
void mark_tsc_unstable(char *reason)
@@ -1098,13 +1100,9 @@ void mark_tsc_unstable(char *reason)
clear_sched_clock_stable();
disable_sched_clock_irqtime();
pr_info("Marking TSC unstable due to %s\n", reason);
- /* Change only the rating, when not registered */
- if (clocksource_tsc.mult) {
- clocksource_mark_unstable(&clocksource_tsc);
- } else {
- clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
- clocksource_tsc.rating = 0;
- }
+
+ clocksource_mark_unstable(&clocksource_tsc_early);
+ clocksource_mark_unstable(&clocksource_tsc);
}
EXPORT_SYMBOL_GPL(mark_tsc_unstable);
@@ -1244,7 +1242,7 @@ static void tsc_refine_calibration_work(struct work_struct *work)
/* Don't bother refining TSC on unstable systems */
if (tsc_unstable)
- return;
+ goto unreg;
/*
* Since the work is started early in boot, we may be
@@ -1297,11 +1295,12 @@ static void tsc_refine_calibration_work(struct work_struct *work)
out:
if (tsc_unstable)
- return;
+ goto unreg;
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
clocksource_register_khz(&clocksource_tsc, tsc_khz);
+unreg:
clocksource_unregister(&clocksource_tsc_early);
}
@@ -1311,8 +1310,8 @@ static int __init init_tsc_clocksource(void)
if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz)
return 0;
- if (check_tsc_unstable())
- return 0;
+ if (tsc_unstable)
+ goto unreg;
if (tsc_clocksource_reliable)
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
@@ -1328,6 +1327,7 @@ static int __init init_tsc_clocksource(void)
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
clocksource_register_khz(&clocksource_tsc, tsc_khz);
+unreg:
clocksource_unregister(&clocksource_tsc_early);
return 0;
}
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 85c7ef23d99f..c84bb5396958 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -299,6 +299,10 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
if (is_prefix_bad(insn))
return -ENOTSUPP;
+ /* We should not singlestep on the exception masking instructions */
+ if (insn_masking_exception(insn))
+ return -ENOTSUPP;
+
if (x86_64)
good_insns = good_insns_64;
else
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 82055b90a8b3..92bf2f2e7cdd 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -379,7 +379,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 0x80000008.ebx */
const u32 kvm_cpuid_8000_0008_ebx_x86_features =
- F(IBPB) | F(IBRS);
+ F(AMD_IBPB) | F(AMD_IBRS) | F(VIRT_SSBD);
/* cpuid 0xC0000001.edx */
const u32 kvm_cpuid_C000_0001_edx_x86_features =
@@ -408,7 +408,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
- F(ARCH_CAPABILITIES);
+ F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -495,6 +495,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ecx &= ~F(PKU);
entry->edx &= kvm_cpuid_7_0_edx_x86_features;
cpuid_mask(&entry->edx, CPUID_7_EDX);
+ /*
+ * We emulate ARCH_CAPABILITIES in software even
+ * if the host doesn't support it.
+ */
+ entry->edx |= F(ARCH_CAPABILITIES);
} else {
entry->ebx = 0;
entry->ecx = 0;
@@ -647,13 +652,20 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
g_phys_as = phys_as;
entry->eax = g_phys_as | (virt_as << 8);
entry->edx = 0;
- /* IBRS and IBPB aren't necessarily present in hardware cpuid */
- if (boot_cpu_has(X86_FEATURE_IBPB))
- entry->ebx |= F(IBPB);
- if (boot_cpu_has(X86_FEATURE_IBRS))
- entry->ebx |= F(IBRS);
+ /*
+ * IBRS, IBPB and VIRT_SSBD aren't necessarily present in
+ * hardware cpuid
+ */
+ if (boot_cpu_has(X86_FEATURE_AMD_IBPB))
+ entry->ebx |= F(AMD_IBPB);
+ if (boot_cpu_has(X86_FEATURE_AMD_IBRS))
+ entry->ebx |= F(AMD_IBRS);
+ if (boot_cpu_has(X86_FEATURE_VIRT_SSBD))
+ entry->ebx |= F(VIRT_SSBD);
entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+ entry->ebx |= F(VIRT_SSBD);
break;
}
case 0x80000019:
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 98618e397342..46ff64da44ca 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1260,12 +1260,16 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
}
}
-static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
{
- struct kvm_run *run = vcpu->run;
+ kvm_hv_hypercall_set_result(vcpu, result);
+ ++vcpu->stat.hypercalls;
+ return kvm_skip_emulated_instruction(vcpu);
+}
- kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result);
- return 1;
+static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+ return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result);
}
static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
@@ -1296,8 +1300,10 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
if (param & ~KVM_HYPERV_CONN_ID_MASK)
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- /* conn_to_evt is protected by vcpu->kvm->srcu */
+ /* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */
+ rcu_read_lock();
eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param);
+ rcu_read_unlock();
if (!eventfd)
return HV_STATUS_INVALID_PORT_ID;
@@ -1348,7 +1354,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
/* Hypercall continuation is not supported yet */
if (rep_cnt || rep_idx) {
ret = HV_STATUS_INVALID_HYPERCALL_CODE;
- goto set_result;
+ goto out;
}
switch (code) {
@@ -1379,9 +1385,8 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
break;
}
-set_result:
- kvm_hv_hypercall_set_result(vcpu, ret);
- return 1;
+out:
+ return kvm_hv_hypercall_complete(vcpu, ret);
}
void kvm_hv_init_vm(struct kvm *kvm)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 70dcb5548022..3773c4625114 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1463,23 +1463,6 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
local_irq_restore(flags);
}
-static void start_sw_period(struct kvm_lapic *apic)
-{
- if (!apic->lapic_timer.period)
- return;
-
- if (apic_lvtt_oneshot(apic) &&
- ktime_after(ktime_get(),
- apic->lapic_timer.target_expiration)) {
- apic_timer_expired(apic);
- return;
- }
-
- hrtimer_start(&apic->lapic_timer.timer,
- apic->lapic_timer.target_expiration,
- HRTIMER_MODE_ABS_PINNED);
-}
-
static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
{
ktime_t now, remaining;
@@ -1539,11 +1522,43 @@ static bool set_target_expiration(struct kvm_lapic *apic)
static void advance_periodic_target_expiration(struct kvm_lapic *apic)
{
- apic->lapic_timer.tscdeadline +=
- nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);
+ ktime_t now = ktime_get();
+ u64 tscl = rdtsc();
+ ktime_t delta;
+
+ /*
+ * Synchronize both deadlines to the same time source or
+ * differences in the periods (caused by differences in the
+ * underlying clocks or numerical approximation errors) will
+ * cause the two to drift apart over time as the errors
+ * accumulate.
+ */
apic->lapic_timer.target_expiration =
ktime_add_ns(apic->lapic_timer.target_expiration,
apic->lapic_timer.period);
+ delta = ktime_sub(apic->lapic_timer.target_expiration, now);
+ apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
+ nsec_to_cycles(apic->vcpu, delta);
+}
+
+static void start_sw_period(struct kvm_lapic *apic)
+{
+ if (!apic->lapic_timer.period)
+ return;
+
+ if (ktime_after(ktime_get(),
+ apic->lapic_timer.target_expiration)) {
+ apic_timer_expired(apic);
+
+ if (apic_lvtt_oneshot(apic))
+ return;
+
+ advance_periodic_target_expiration(apic);
+ }
+
+ hrtimer_start(&apic->lapic_timer.timer,
+ apic->lapic_timer.target_expiration,
+ HRTIMER_MODE_ABS_PINNED);
}
bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b58787daf9f8..26110c202b19 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -49,7 +49,7 @@
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
-#include <asm/nospec-branch.h>
+#include <asm/spec-ctrl.h>
#include <asm/virtext.h>
#include "trace.h"
@@ -213,6 +213,12 @@ struct vcpu_svm {
} host;
u64 spec_ctrl;
+ /*
+ * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
+ * translated into the appropriate L2_CFG bits on the host to
+ * perform speculative control.
+ */
+ u64 virt_spec_ctrl;
u32 *msrpm;
@@ -1423,12 +1429,23 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
seg->base = 0;
}
+static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_guest_mode(vcpu))
+ return svm->nested.hsave->control.tsc_offset;
+
+ return vcpu->arch.tsc_offset;
+}
+
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 g_tsc_offset = 0;
if (is_guest_mode(vcpu)) {
+ /* Write L1's TSC offset. */
g_tsc_offset = svm->vmcb->control.tsc_offset -
svm->nested.hsave->control.tsc_offset;
svm->nested.hsave->control.tsc_offset = offset;
@@ -2049,6 +2066,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.microcode_version = 0x01000065;
svm->spec_ctrl = 0;
+ svm->virt_spec_ctrl = 0;
if (!init_event) {
svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
@@ -3322,6 +3340,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
/* Restore the original control entries */
copy_vmcb_control_area(vmcb, hsave);
+ svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset;
kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);
@@ -3482,10 +3501,12 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
/* We don't want to see VMMCALLs from a nested guest */
clr_intercept(svm, INTERCEPT_VMMCALL);
+ svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
+ svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+
svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
svm->vmcb->control.int_state = nested_vmcb->control.int_state;
- svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
@@ -4035,12 +4056,6 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
struct vcpu_svm *svm = to_svm(vcpu);
switch (msr_info->index) {
- case MSR_IA32_TSC: {
- msr_info->data = svm->vmcb->control.tsc_offset +
- kvm_scale_tsc(vcpu, rdtsc());
-
- break;
- }
case MSR_STAR:
msr_info->data = svm->vmcb->save.star;
break;
@@ -4100,11 +4115,18 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
return 1;
msr_info->data = svm->spec_ctrl;
break;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+ return 1;
+
+ msr_info->data = svm->virt_spec_ctrl;
+ break;
case MSR_F15H_IC_CFG: {
int family, model;
@@ -4193,12 +4215,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->vmcb->save.g_pat = data;
mark_dirty(svm->vmcb, VMCB_NPT);
break;
- case MSR_IA32_TSC:
- kvm_write_tsc(vcpu, msr);
- break;
case MSR_IA32_SPEC_CTRL:
if (!msr->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
return 1;
/* The STIBP bit doesn't fault even if it's not advertised */
@@ -4225,7 +4244,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
case MSR_IA32_PRED_CMD:
if (!msr->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_IBPB))
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
return 1;
if (data & ~PRED_CMD_IBPB)
@@ -4239,6 +4258,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
break;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ if (!msr->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+ return 1;
+
+ if (data & ~SPEC_CTRL_SSBD)
+ return 1;
+
+ svm->virt_spec_ctrl = data;
+ break;
case MSR_STAR:
svm->vmcb->save.star = data;
break;
@@ -5265,9 +5294,8 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
}
if (!ret && svm) {
- trace_kvm_pi_irte_update(svm->vcpu.vcpu_id,
- host_irq, e->gsi,
- vcpu_info.vector,
+ trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
+ e->gsi, vcpu_info.vector,
vcpu_info.pi_desc_addr, set);
}
@@ -5553,8 +5581,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
* is no need to worry about the conditional branch over the wrmsr
* being speculatively taken.
*/
- if (svm->spec_ctrl)
- native_wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
asm volatile (
"push %%" _ASM_BP "; \n\t"
@@ -5648,6 +5675,18 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);
+ /* Eliminate branch target predictions from guest mode */
+ vmexit_fill_RSB();
+
+#ifdef CONFIG_X86_64
+ wrmsrl(MSR_GS_BASE, svm->host.gs_base);
+#else
+ loadsegment(fs, svm->host.fs);
+#ifndef CONFIG_X86_32_LAZY_GS
+ loadsegment(gs, svm->host.gs);
+#endif
+#endif
+
/*
* We do not use IBRS in the kernel. If this vCPU has used the
* SPEC_CTRL MSR it may have left it on; save the value and
@@ -5666,20 +5705,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
- if (svm->spec_ctrl)
- native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
-
- /* Eliminate branch target predictions from guest mode */
- vmexit_fill_RSB();
-
-#ifdef CONFIG_X86_64
- wrmsrl(MSR_GS_BASE, svm->host.gs_base);
-#else
- loadsegment(fs, svm->host.fs);
-#ifndef CONFIG_X86_32_LAZY_GS
- loadsegment(gs, svm->host.gs);
-#endif
-#endif
+ x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
reload_tss(vcpu);
@@ -5782,7 +5808,7 @@ static bool svm_cpu_has_accelerated_tpr(void)
return false;
}
-static bool svm_has_high_real_mode_segbase(void)
+static bool svm_has_emulated_msr(int index)
{
return true;
}
@@ -7008,7 +7034,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
- .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase,
+ .has_emulated_msr = svm_has_emulated_msr,
.vcpu_create = svm_create_vcpu,
.vcpu_free = svm_free_vcpu,
@@ -7102,6 +7128,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.has_wbinvd_exit = svm_has_wbinvd_exit,
+ .read_l1_tsc_offset = svm_read_l1_tsc_offset,
.write_tsc_offset = svm_write_tsc_offset,
.set_tdp_cr3 = set_tdp_cr3,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index aafcc9881e88..40aa29204baf 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -51,7 +51,7 @@
#include <asm/apic.h>
#include <asm/irq_remapping.h>
#include <asm/mmu_context.h>
-#include <asm/nospec-branch.h>
+#include <asm/spec-ctrl.h>
#include <asm/mshyperv.h>
#include "trace.h"
@@ -1494,6 +1494,12 @@ static inline bool cpu_has_vmx_vmfunc(void)
SECONDARY_EXEC_ENABLE_VMFUNC;
}
+static bool vmx_umip_emulated(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_DESC;
+}
+
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -2880,18 +2886,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
vmx_update_msr_bitmap(&vmx->vcpu);
}
-/*
- * reads and returns guest's timestamp counter "register"
- * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
- * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
- */
-static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
+static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
{
- u64 host_tsc, tsc_offset;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- host_tsc = rdtsc();
- tsc_offset = vmcs_read64(TSC_OFFSET);
- return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
+ if (is_guest_mode(vcpu) &&
+ (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+ return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
+
+ return vcpu->arch.tsc_offset;
}
/*
@@ -3524,12 +3527,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
#endif
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_info);
- case MSR_IA32_TSC:
- msr_info->data = guest_read_tsc(vcpu);
- break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
return 1;
@@ -3646,17 +3645,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vmcs_write64(GUEST_BNDCFGS, data);
break;
- case MSR_IA32_TSC:
- kvm_write_tsc(vcpu, msr_info);
- break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
return 1;
/* The STIBP bit doesn't fault even if it's not advertised */
- if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
+ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
return 1;
vmx->spec_ctrl = data;
@@ -3682,7 +3677,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_PRED_CMD:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) &&
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
return 1;
@@ -4553,12 +4547,6 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
__vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
}
-static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu)
-{
- if (enable_ept)
- vmx_flush_tlb(vcpu, true);
-}
-
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -4776,14 +4764,16 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
else
hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
- if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) {
- vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_DESC);
- hw_cr4 &= ~X86_CR4_UMIP;
- } else if (!is_guest_mode(vcpu) ||
- !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
- vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+ if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
+ if (cr4 & X86_CR4_UMIP) {
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
SECONDARY_EXEC_DESC);
+ hw_cr4 &= ~X86_CR4_UMIP;
+ } else if (!is_guest_mode(vcpu) ||
+ !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_DESC);
+ }
if (cr4 & X86_CR4_VMXE) {
/*
@@ -9287,7 +9277,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
} else {
sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- vmx_flush_tlb_ept_only(vcpu);
+ vmx_flush_tlb(vcpu, true);
}
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
@@ -9315,7 +9305,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
!nested_cpu_has2(get_vmcs12(&vmx->vcpu),
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
vmcs_write64(APIC_ACCESS_ADDR, hpa);
- vmx_flush_tlb_ept_only(vcpu);
+ vmx_flush_tlb(vcpu, true);
}
}
@@ -9495,9 +9485,21 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
}
STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
-static bool vmx_has_high_real_mode_segbase(void)
+static bool vmx_has_emulated_msr(int index)
{
- return enable_unrestricted_guest || emulate_invalid_guest_state;
+ switch (index) {
+ case MSR_IA32_SMBASE:
+ /*
+ * We cannot do SMM unless we can run the guest in big
+ * real mode.
+ */
+ return enable_unrestricted_guest || emulate_invalid_guest_state;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ /* This is AMD only. */
+ return false;
+ default:
+ return true;
+ }
}
static bool vmx_mpx_supported(void)
@@ -9512,12 +9514,6 @@ static bool vmx_xsaves_supported(void)
SECONDARY_EXEC_XSAVES;
}
-static bool vmx_umip_emulated(void)
-{
- return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_DESC;
-}
-
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
@@ -9735,8 +9731,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
* is no need to worry about the conditional branch over the wrmsr
* being speculatively taken.
*/
- if (vmx->spec_ctrl)
- native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
vmx->__launched = vmx->loaded_vmcs->launched;
@@ -9884,8 +9879,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
- if (vmx->spec_ctrl)
- native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+ x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
/* Eliminate branch target predictions from guest mode */
vmexit_fill_RSB();
@@ -10608,6 +10602,16 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
return true;
}
+static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+ !page_address_valid(vcpu, vmcs12->apic_access_addr))
+ return -EINVAL;
+ else
+ return 0;
+}
+
static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
@@ -11176,11 +11180,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
}
- if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
- vmcs_write64(TSC_OFFSET,
- vcpu->arch.tsc_offset + vmcs12->tsc_offset);
- else
- vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+ vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+
if (kvm_has_tsc_control)
decache_tsc_multiplier(vmx);
@@ -11222,7 +11223,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
}
} else if (nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
- vmx_flush_tlb_ept_only(vcpu);
+ vmx_flush_tlb(vcpu, true);
}
/*
@@ -11299,6 +11300,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ if (nested_vmx_check_apic_access_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
@@ -11420,6 +11424,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
u32 msr_entry_idx;
u32 exit_qual;
+ int r;
enter_guest_mode(vcpu);
@@ -11429,26 +11434,21 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
vmx_segment_cache_clear(vmx);
- if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
- leave_guest_mode(vcpu);
- vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- nested_vmx_entry_failure(vcpu, vmcs12,
- EXIT_REASON_INVALID_STATE, exit_qual);
- return 1;
- }
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ vcpu->arch.tsc_offset += vmcs12->tsc_offset;
+
+ r = EXIT_REASON_INVALID_STATE;
+ if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual))
+ goto fail;
nested_get_vmcs12_pages(vcpu, vmcs12);
+ r = EXIT_REASON_MSR_LOAD_FAIL;
msr_entry_idx = nested_vmx_load_msr(vcpu,
vmcs12->vm_entry_msr_load_addr,
vmcs12->vm_entry_msr_load_count);
- if (msr_entry_idx) {
- leave_guest_mode(vcpu);
- vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- nested_vmx_entry_failure(vcpu, vmcs12,
- EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
- return 1;
- }
+ if (msr_entry_idx)
+ goto fail;
/*
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
@@ -11457,6 +11457,14 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
* the success flag) when L2 exits (see nested_vmx_vmexit()).
*/
return 0;
+
+fail:
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+ leave_guest_mode(vcpu);
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual);
+ return 1;
}
/*
@@ -12028,6 +12036,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
leave_guest_mode(vcpu);
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+
if (likely(!vmx->fail)) {
if (exit_reason == -1)
sync_vmcs12(vcpu, vmcs12);
@@ -12065,7 +12076,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
} else if (!nested_cpu_has_ept(vmcs12) &&
nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
- vmx_flush_tlb_ept_only(vcpu);
+ vmx_flush_tlb(vcpu, true);
}
/* This is needed for same reason as it was needed in prepare_vmcs02 */
@@ -12224,10 +12235,16 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u64 tscl = rdtsc();
- u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
- u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+ struct vcpu_vmx *vmx;
+ u64 tscl, guest_tscl, delta_tsc;
+
+ if (kvm_mwait_in_guest(vcpu->kvm))
+ return -EOPNOTSUPP;
+
+ vmx = to_vmx(vcpu);
+ tscl = rdtsc();
+ guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+ delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
/* Convert to host delta tsc if tsc scaling is enabled */
if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
@@ -12533,7 +12550,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
vcpu_info.vector = irq.vector;
- trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
+ trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
vcpu_info.vector, vcpu_info.pi_desc_addr, set);
if (set)
@@ -12622,7 +12639,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
.cpu_has_accelerated_tpr = report_flexpriority,
- .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase,
+ .has_emulated_msr = vmx_has_emulated_msr,
.vm_init = vmx_vm_init,
.vm_alloc = vmx_vm_alloc,
@@ -12712,6 +12729,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+ .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
.write_tsc_offset = vmx_write_tsc_offset,
.set_tdp_cr3 = vmx_set_cr3,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b2ff74b12ec4..71e7cda6d014 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -114,7 +114,7 @@ module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
static bool __read_mostly report_ignored_msrs = true;
module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
-unsigned int min_timer_period_us = 500;
+unsigned int min_timer_period_us = 200;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
static bool __read_mostly kvmclock_periodic_sync = true;
@@ -843,7 +843,10 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
#ifdef CONFIG_X86_64
- cr3 &= ~CR3_PCID_INVD;
+ bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+
+ if (pcid_enabled)
+ cr3 &= ~CR3_PCID_INVD;
#endif
if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
@@ -1058,6 +1061,7 @@ static u32 emulated_msrs[] = {
MSR_SMI_COUNT,
MSR_PLATFORM_INFO,
MSR_MISC_FEATURES_ENABLES,
+ MSR_AMD64_VIRT_SPEC_CTRL,
};
static unsigned num_emulated_msrs;
@@ -1490,7 +1494,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
{
- u64 curr_offset = vcpu->arch.tsc_offset;
+ u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
}
@@ -1532,7 +1536,9 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
- return vcpu->arch.tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
+ u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
+
+ return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
}
EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
@@ -2362,6 +2368,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vcpu->arch.smbase = data;
break;
+ case MSR_IA32_TSC:
+ kvm_write_tsc(vcpu, msr_info);
+ break;
case MSR_SMI_COUNT:
if (!msr_info->host_initiated)
return 1;
@@ -2605,6 +2614,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_UCODE_REV:
msr_info->data = vcpu->arch.microcode_version;
break;
+ case MSR_IA32_TSC:
+ msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
+ break;
case MSR_MTRRcap:
case 0x200 ... 0x2ff:
return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
@@ -2819,7 +2831,8 @@ out:
static inline bool kvm_can_mwait_in_guest(void)
{
return boot_cpu_has(X86_FEATURE_MWAIT) &&
- !boot_cpu_has_bug(X86_BUG_MONITOR);
+ !boot_cpu_has_bug(X86_BUG_MONITOR) &&
+ boot_cpu_has(X86_FEATURE_ARAT);
}
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
@@ -2894,7 +2907,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
* fringe case that is not enabled except via specific settings
* of the module parameters.
*/
- r = kvm_x86_ops->cpu_has_high_real_mode_segbase();
+ r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
break;
case KVM_CAP_VAPIC:
r = !kvm_x86_ops->cpu_has_accelerated_tpr();
@@ -4594,14 +4607,8 @@ static void kvm_init_msr_list(void)
num_msrs_to_save = j;
for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
- switch (emulated_msrs[i]) {
- case MSR_IA32_SMBASE:
- if (!kvm_x86_ops->cpu_has_high_real_mode_segbase())
- continue;
- break;
- default:
- break;
- }
+ if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
+ continue;
if (j < i)
emulated_msrs[j] = emulated_msrs[i];
@@ -6662,9 +6669,7 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
- int op_64_bit, r;
-
- r = kvm_skip_emulated_instruction(vcpu);
+ int op_64_bit;
if (kvm_hv_hypercall_enabled(vcpu->kvm))
return kvm_hv_hypercall(vcpu);
@@ -6712,8 +6717,9 @@ out:
if (!op_64_bit)
ret = (u32)ret;
kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
+
++vcpu->stat.hypercalls;
- return r;
+ return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
@@ -7970,6 +7976,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
struct msr_data apic_base_msr;
int mmu_reset_needed = 0;
+ int cpuid_update_needed = 0;
int pending_vec, max_bits, idx;
struct desc_ptr dt;
int ret = -EINVAL;
@@ -8008,8 +8015,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
vcpu->arch.cr0 = sregs->cr0;
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
+ cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
+ (X86_CR4_OSXSAVE | X86_CR4_PKE));
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
- if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE))
+ if (cpuid_update_needed)
kvm_update_cpuid(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 7d35ce672989..c9492f764902 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -302,13 +302,6 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
__rem; \
})
-#define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0)
-#define KVM_X86_DISABLE_EXITS_HTL (1 << 1)
-#define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2)
-#define KVM_X86_DISABLE_VALID_EXITS (KVM_X86_DISABLE_EXITS_MWAIT | \
- KVM_X86_DISABLE_EXITS_HTL | \
- KVM_X86_DISABLE_EXITS_PAUSE)
-
static inline bool kvm_mwait_in_guest(struct kvm *kvm)
{
return kvm->arch.mwait_in_guest;
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 62a7e9f65dec..cc7ff5957194 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -18,6 +18,7 @@
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
+#include <linux/highmem.h>
#include <asm/pgtable.h>
@@ -334,16 +335,16 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
pgprotval_t eff_in, unsigned long P)
{
int i;
- pte_t *start;
+ pte_t *pte;
pgprotval_t prot, eff;
- start = (pte_t *)pmd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PTE; i++) {
- prot = pte_flags(*start);
- eff = effective_prot(eff_in, prot);
st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
+ pte = pte_offset_map(&addr, st->current_address);
+ prot = pte_flags(*pte);
+ eff = effective_prot(eff_in, prot);
note_page(m, st, __pgprot(prot), eff, 5);
- start++;
+ pte_unmap(pte);
}
}
#ifdef CONFIG_KASAN
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 0f3d50f4c48c..3bded76e8d5c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -93,6 +93,18 @@ void arch_report_meminfo(struct seq_file *m)
static inline void split_page_count(int level) { }
#endif
+static inline int
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+ return addr >= start && addr < end;
+}
+
+static inline int
+within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
+{
+ return addr >= start && addr <= end;
+}
+
#ifdef CONFIG_X86_64
static inline unsigned long highmap_start_pfn(void)
@@ -106,20 +118,25 @@ static inline unsigned long highmap_end_pfn(void)
return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
}
-#endif
-
-static inline int
-within(unsigned long addr, unsigned long start, unsigned long end)
+static bool __cpa_pfn_in_highmap(unsigned long pfn)
{
- return addr >= start && addr < end;
+ /*
+ * Kernel text has an alias mapping at a high address, known
+ * here as "highmap".
+ */
+ return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn());
}
-static inline int
-within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
+#else
+
+static bool __cpa_pfn_in_highmap(unsigned long pfn)
{
- return addr >= start && addr <= end;
+ /* There is no highmap on 32-bit */
+ return false;
}
+#endif
+
/*
* Flushing functions
*/
@@ -172,7 +189,7 @@ static void __cpa_flush_all(void *arg)
static void cpa_flush_all(unsigned long cache)
{
- BUG_ON(irqs_disabled());
+ BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}
@@ -236,7 +253,7 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
#endif
- BUG_ON(irqs_disabled());
+ BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
@@ -1183,6 +1200,10 @@ static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
cpa->numpages = 1;
cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
return 0;
+
+ } else if (__cpa_pfn_in_highmap(cpa->pfn)) {
+ /* Faults in the highmap are OK, so do not warn: */
+ return -EFAULT;
} else {
WARN(1, KERN_WARNING "CPA: called for zero pte. "
"vaddr = %lx cpa->vaddr = %lx\n", vaddr,
@@ -1335,8 +1356,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
* to touch the high mapped kernel as well:
*/
if (!within(vaddr, (unsigned long)_text, _brk_end) &&
- within_inclusive(cpa->pfn, highmap_start_pfn(),
- highmap_end_pfn())) {
+ __cpa_pfn_in_highmap(cpa->pfn)) {
unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
__START_KERNEL_map - phys_base;
alias_cpa = *cpa;
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index d7bc0eea20a5..6e98e0a7c923 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -94,26 +94,27 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
*/
if (pkey != -1)
return pkey;
- /*
- * Look for a protection-key-drive execute-only mapping
- * which is now being given permissions that are not
- * execute-only. Move it back to the default pkey.
- */
- if (vma_is_pkey_exec_only(vma) &&
- (prot & (PROT_READ|PROT_WRITE))) {
- return 0;
- }
+
/*
* The mapping is execute-only. Go try to get the
* execute-only protection key. If we fail to do that,
* fall through as if we do not have execute-only
- * support.
+ * support in this mm.
*/
if (prot == PROT_EXEC) {
pkey = execute_only_pkey(vma->vm_mm);
if (pkey > 0)
return pkey;
+ } else if (vma_is_pkey_exec_only(vma)) {
+ /*
+ * Protections are *not* PROT_EXEC, but the mapping
+ * is using the exec-only pkey. This mapping was
+ * PROT_EXEC and will no longer be. Move back to
+ * the default pkey.
+ */
+ return ARCH_DEFAULT_PKEY;
}
+
/*
* This is a vanilla, non-pkey mprotect (or we failed to
* setup execute-only), inherit the pkey from the VMA we
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index f1fd52f449e0..4d418e705878 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -421,6 +421,16 @@ static inline bool pti_kernel_image_global_ok(void)
if (boot_cpu_has(X86_FEATURE_K8))
return false;
+ /*
+ * RANDSTRUCT derives its hardening benefits from the
+ * attacker's lack of knowledge about the layout of kernel
+ * data structures. Keep the kernel image non-global in
+ * cases where RANDSTRUCT is in use to help keep the layout a
+ * secret.
+ */
+ if (IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT))
+ return false;
+
return true;
}
@@ -430,12 +440,24 @@ static inline bool pti_kernel_image_global_ok(void)
*/
void pti_clone_kernel_text(void)
{
+ /*
+ * rodata is part of the kernel image and is normally
+ * readable on the filesystem or on the web. But, do not
+ * clone the areas past rodata, they might contain secrets.
+ */
unsigned long start = PFN_ALIGN(_text);
- unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE);
+ unsigned long end = (unsigned long)__end_rodata_hpage_align;
if (!pti_kernel_image_global_ok())
return;
+ pr_debug("mapping partial kernel image into user address space\n");
+
+ /*
+ * Note that this will undo _some_ of the work that
+ * pti_set_kernel_image_nonglobal() did to clear the
+ * global bit.
+ */
pti_clone_pmds(start, end, _PAGE_RW);
}
@@ -458,8 +480,6 @@ void pti_set_kernel_image_nonglobal(void)
if (pti_kernel_image_global_ok())
return;
- pr_debug("set kernel image non-global\n");
-
set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT);
}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index b725154182cc..263c8453815e 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1027,7 +1027,17 @@ emit_cond_jmp: /* convert BPF opcode to x86 */
break;
case BPF_JMP | BPF_JA:
- jmp_offset = addrs[i + insn->off] - addrs[i];
+ if (insn->off == -1)
+ /* -1 jmp instructions will always jump
+ * backwards two bytes. Explicitly handling
+ * this case avoids wasting too many passes
+ * when there are long sequences of replaced
+ * dead code.
+ */
+ jmp_offset = -2;
+ else
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+
if (!jmp_offset)
/* optimize out nop jumps */
break;
@@ -1226,6 +1236,7 @@ skip_init_addrs:
for (pass = 0; pass < 20 || image; pass++) {
proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
if (proglen <= 0) {
+out_image:
image = NULL;
if (header)
bpf_jit_binary_free(header);
@@ -1236,8 +1247,7 @@ skip_init_addrs:
if (proglen != oldproglen) {
pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
proglen, oldproglen);
- prog = orig_prog;
- goto out_addrs;
+ goto out_image;
}
break;
}
@@ -1273,7 +1283,7 @@ skip_init_addrs:
prog = orig_prog;
}
- if (!prog->is_func || extra_pass) {
+ if (!image || !prog->is_func || extra_pass) {
out_addrs:
kfree(addrs);
kfree(jit_data);
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 48b14b534897..ccf4a49bb065 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -98,7 +98,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
set_pgd(pgd + pgd_index(restore_jump_address), new_pgd);
} else {
/* No p4d for 4-level paging: point the pgd to the pud page table */
- pgd_t new_pgd = __pgd(__pa(p4d) | pgprot_val(pgtable_prot));
+ pgd_t new_pgd = __pgd(__pa(pud) | pgprot_val(pgtable_prot));
set_pgd(pgd + pgd_index(restore_jump_address), new_pgd);
}
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 826898701045..19c1ff542387 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -65,6 +65,19 @@ static void __init xen_hvm_init_mem_mapping(void)
{
early_memunmap(HYPERVISOR_shared_info, PAGE_SIZE);
HYPERVISOR_shared_info = __va(PFN_PHYS(shared_info_pfn));
+
+ /*
+ * The virtual address of the shared_info page has changed, so
+ * the vcpu_info pointer for VCPU 0 is now stale.
+ *
+ * The prepare_boot_cpu callback will re-initialize it via
+ * xen_vcpu_setup, but we can't rely on that to be called for
+ * old Xen versions (xen_have_vector_callback == 0).
+ *
+ * It is, in any case, bad to have a stale vcpu_info pointer
+ * so reset it now.
+ */
+ xen_vcpu_info_reset(0);
}
static void __init init_hvm_pv_info(void)
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index c36d23aa6c35..357969a3697c 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -421,45 +421,33 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
{
unsigned long va = dtr->address;
unsigned int size = dtr->size + 1;
- unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE);
- unsigned long frames[pages];
- int f;
-
- /*
- * A GDT can be up to 64k in size, which corresponds to 8192
- * 8-byte entries, or 16 4k pages..
- */
+ unsigned long pfn, mfn;
+ int level;
+ pte_t *ptep;
+ void *virt;
- BUG_ON(size > 65536);
+ /* @size should be at most GDT_SIZE which is smaller than PAGE_SIZE. */
+ BUG_ON(size > PAGE_SIZE);
BUG_ON(va & ~PAGE_MASK);
- for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
- int level;
- pte_t *ptep;
- unsigned long pfn, mfn;
- void *virt;
-
- /*
- * The GDT is per-cpu and is in the percpu data area.
- * That can be virtually mapped, so we need to do a
- * page-walk to get the underlying MFN for the
- * hypercall. The page can also be in the kernel's
- * linear range, so we need to RO that mapping too.
- */
- ptep = lookup_address(va, &level);
- BUG_ON(ptep == NULL);
-
- pfn = pte_pfn(*ptep);
- mfn = pfn_to_mfn(pfn);
- virt = __va(PFN_PHYS(pfn));
+ /*
+ * The GDT is per-cpu and is in the percpu data area.
+ * That can be virtually mapped, so we need to do a
+ * page-walk to get the underlying MFN for the
+ * hypercall. The page can also be in the kernel's
+ * linear range, so we need to RO that mapping too.
+ */
+ ptep = lookup_address(va, &level);
+ BUG_ON(ptep == NULL);
- frames[f] = mfn;
+ pfn = pte_pfn(*ptep);
+ mfn = pfn_to_mfn(pfn);
+ virt = __va(PFN_PHYS(pfn));
- make_lowmem_page_readonly((void *)va);
- make_lowmem_page_readonly(virt);
- }
+ make_lowmem_page_readonly((void *)va);
+ make_lowmem_page_readonly(virt);
- if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
+ if (HYPERVISOR_set_gdt(&mfn, size / sizeof(struct desc_struct)))
BUG();
}
@@ -470,34 +458,22 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
{
unsigned long va = dtr->address;
unsigned int size = dtr->size + 1;
- unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE);
- unsigned long frames[pages];
- int f;
-
- /*
- * A GDT can be up to 64k in size, which corresponds to 8192
- * 8-byte entries, or 16 4k pages..
- */
+ unsigned long pfn, mfn;
+ pte_t pte;
- BUG_ON(size > 65536);
+ /* @size should be at most GDT_SIZE which is smaller than PAGE_SIZE. */
+ BUG_ON(size > PAGE_SIZE);
BUG_ON(va & ~PAGE_MASK);
- for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
- pte_t pte;
- unsigned long pfn, mfn;
+ pfn = virt_to_pfn(va);
+ mfn = pfn_to_mfn(pfn);
- pfn = virt_to_pfn(va);
- mfn = pfn_to_mfn(pfn);
+ pte = pfn_pte(pfn, PAGE_KERNEL_RO);
- pte = pfn_pte(pfn, PAGE_KERNEL_RO);
-
- if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
- BUG();
-
- frames[f] = mfn;
- }
+ if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
+ BUG();
- if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
+ if (HYPERVISOR_set_gdt(&mfn, size / sizeof(struct desc_struct)))
BUG();
}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index d33e7dbe3129..2d76106788a3 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -42,13 +42,11 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
}
EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
-static void xen_flush_tlb_all(void)
+static noinline void xen_flush_tlb_all(void)
{
struct mmuext_op *op;
struct multicall_space mcs;
- trace_xen_mmu_flush_tlb_all(0);
-
preempt_disable();
mcs = xen_mc_entry(sizeof(*op));
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 486c0a34d00b..2c30cabfda90 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1310,13 +1310,11 @@ unsigned long xen_read_cr2_direct(void)
return this_cpu_read(xen_vcpu_info.arch.cr2);
}
-static void xen_flush_tlb(void)
+static noinline void xen_flush_tlb(void)
{
struct mmuext_op *op;
struct multicall_space mcs;
- trace_xen_mmu_flush_tlb(0);
-
preempt_disable();
mcs = xen_mc_entry(sizeof(*op));
OpenPOWER on IntegriCloud