summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig41
-rw-r--r--arch/x86/Kconfig.debug12
-rw-r--r--arch/x86/boot/compressed/eboot.c4
-rw-r--r--arch/x86/configs/xen.config28
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c2
-rw-r--r--arch/x86/entry/entry_64.S299
-rw-r--r--arch/x86/include/asm/Kbuild2
-rw-r--r--arch/x86/include/asm/cacheflush.h72
-rw-r--r--arch/x86/include/asm/edac.h2
-rw-r--r--arch/x86/include/asm/espfix.h2
-rw-r--r--arch/x86/include/asm/fpu/types.h72
-rw-r--r--arch/x86/include/asm/hugetlb.h12
-rw-r--r--arch/x86/include/asm/intel_pmc_ipc.h55
-rw-r--r--arch/x86/include/asm/io.h6
-rw-r--r--arch/x86/include/asm/kasan.h8
-rw-r--r--arch/x86/include/asm/kvm_emulate.h9
-rw-r--r--arch/x86/include/asm/kvm_host.h96
-rw-r--r--arch/x86/include/asm/livepatch.h1
-rw-r--r--arch/x86/include/asm/mmu_context.h2
-rw-r--r--arch/x86/include/asm/pci.h2
-rw-r--r--arch/x86/include/asm/pgtable.h4
-rw-r--r--arch/x86/include/asm/processor.h10
-rw-r--r--arch/x86/include/asm/pvclock-abi.h1
-rw-r--r--arch/x86/include/asm/pvclock.h1
-rw-r--r--arch/x86/include/asm/serial.h2
-rw-r--r--arch/x86/include/asm/setup.h7
-rw-r--r--arch/x86/include/uapi/asm/e820.h1
-rw-r--r--arch/x86/include/uapi/asm/hyperv.h13
-rw-r--r--arch/x86/include/uapi/asm/kvm.h14
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S6
-rw-r--r--arch/x86/kernel/apic/vector.c10
-rw-r--r--arch/x86/kernel/bootflag.c2
-rw-r--r--arch/x86/kernel/check.c3
-rw-r--r--arch/x86/kernel/cpu/common.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event.c36
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_bts.c3
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_pt.c3
-rw-r--r--arch/x86/kernel/devicetree.c2
-rw-r--r--arch/x86/kernel/e820.c31
-rw-r--r--arch/x86/kernel/early-quirks.c1
-rw-r--r--arch/x86/kernel/early_printk.c4
-rw-r--r--arch/x86/kernel/espfix_64.c28
-rw-r--r--arch/x86/kernel/fpu/init.c47
-rw-r--r--arch/x86/kernel/head64.c10
-rw-r--r--arch/x86/kernel/head_64.S29
-rw-r--r--arch/x86/kernel/irq.c20
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c11
-rw-r--r--arch/x86/kernel/kvm.c4
-rw-r--r--arch/x86/kernel/kvmclock.c14
-rw-r--r--arch/x86/kernel/machine_kexec_64.c3
-rw-r--r--arch/x86/kernel/nmi.c123
-rw-r--r--arch/x86/kernel/pmem.c93
-rw-r--r--arch/x86/kernel/process.c2
-rw-r--r--arch/x86/kernel/process_32.c6
-rw-r--r--arch/x86/kernel/process_64.c8
-rw-r--r--arch/x86/kernel/setup.c12
-rw-r--r--arch/x86/kernel/smpboot.c38
-rw-r--r--arch/x86/kernel/tsc.c11
-rw-r--r--arch/x86/kernel/vsmp_64.c2
-rw-r--r--arch/x86/kvm/Kconfig9
-rw-r--r--arch/x86/kvm/Makefile6
-rw-r--r--arch/x86/kvm/cpuid.c15
-rw-r--r--arch/x86/kvm/cpuid.h8
-rw-r--r--arch/x86/kvm/emulate.c303
-rw-r--r--arch/x86/kvm/i8254.c2
-rw-r--r--arch/x86/kvm/ioapic.c9
-rw-r--r--arch/x86/kvm/iommu.c2
-rw-r--r--arch/x86/kvm/irq_comm.c14
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h5
-rw-r--r--arch/x86/kvm/lapic.c76
-rw-r--r--arch/x86/kvm/lapic.h16
-rw-r--r--arch/x86/kvm/mmu.c688
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/mmu_audit.c22
-rw-r--r--arch/x86/kvm/mmutrace.h2
-rw-r--r--arch/x86/kvm/mtrr.c719
-rw-r--r--arch/x86/kvm/paging_tmpl.h18
-rw-r--r--arch/x86/kvm/pmu.c553
-rw-r--r--arch/x86/kvm/pmu.h118
-rw-r--r--arch/x86/kvm/pmu_amd.c207
-rw-r--r--arch/x86/kvm/pmu_intel.c358
-rw-r--r--arch/x86/kvm/svm.c220
-rw-r--r--arch/x86/kvm/trace.h22
-rw-r--r--arch/x86/kvm/vmx.c367
-rw-r--r--arch/x86/kvm/x86.c920
-rw-r--r--arch/x86/kvm/x86.h13
-rw-r--r--arch/x86/lib/usercopy.c2
-rw-r--r--arch/x86/mm/init_32.c2
-rw-r--r--arch/x86/mm/kasan_init_64.c47
-rw-r--r--arch/x86/net/bpf_jit_comp.c150
-rw-r--r--arch/x86/platform/efi/efi.c24
-rw-r--r--arch/x86/platform/intel-mid/intel_mid_vrtc.c3
-rw-r--r--arch/x86/platform/uv/uv_nmi.c2
-rw-r--r--arch/x86/um/asm/checksum.h1
-rw-r--r--arch/x86/um/asm/elf.h2
-rw-r--r--arch/x86/um/asm/processor.h2
-rw-r--r--arch/x86/um/asm/segment.h8
-rw-r--r--arch/x86/um/ldt.c1
-rw-r--r--arch/x86/um/mem_32.c3
-rw-r--r--arch/x86/um/mem_64.c3
-rw-r--r--arch/x86/um/ptrace_32.c1
-rw-r--r--arch/x86/um/ptrace_64.c1
-rw-r--r--arch/x86/um/shared/sysdep/tls.h6
-rw-r--r--arch/x86/um/signal.c3
-rw-r--r--arch/x86/um/syscalls_64.c1
-rw-r--r--arch/x86/um/tls_32.c1
-rw-r--r--arch/x86/um/tls_64.c1
-rw-r--r--arch/x86/um/vdso/vma.c1
108 files changed, 4490 insertions, 1813 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7e39f9b22705..b3a1a5d77d92 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,18 +27,21 @@ config X86
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
+ select ARCH_HAS_PMEM_API
select ARCH_HAS_SG_CHAIN
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_SUPPORTS_ATOMIC_RMW
+ select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
select ARCH_SUPPORTS_INT128 if X86_64
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF if X86_64
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
+ select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_IPC_PARSE_VERSION if X86_32
select ARCH_WANT_OPTIONAL_GPIOLIB
@@ -50,6 +53,8 @@ config X86
select CLONE_BACKWARDS if X86_32
select COMPAT_OLD_SIGACTION if IA32_EMULATION
select DCACHE_WORD_ACCESS
+ select EDAC_ATOMIC_SCRUB
+ select EDAC_SUPPORT
select GENERIC_CLOCKEVENTS
select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
select GENERIC_CLOCKEVENTS_MIN_ADJUST
@@ -84,6 +89,7 @@ config X86
select HAVE_CMPXCHG_DOUBLE
select HAVE_CMPXCHG_LOCAL
select HAVE_CONTEXT_TRACKING if X86_64
+ select HAVE_COPY_THREAD_TLS
select HAVE_C_RECORDMCOUNT
select HAVE_DEBUG_KMEMLEAK
select HAVE_DEBUG_STACKOVERFLOW
@@ -249,6 +255,11 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
config ARCH_SUPPORTS_DEBUG_PAGEALLOC
def_bool y
+config KASAN_SHADOW_OFFSET
+ hex
+ depends on KASAN
+ default 0xdffffc0000000000
+
config HAVE_INTEL_TXT
def_bool y
depends on INTEL_IOMMU && ACPI
@@ -1053,24 +1064,19 @@ config TOSHIBA
Say N otherwise.
config I8K
- tristate "Dell laptop support"
+ tristate "Dell i8k legacy laptop support"
select HWMON
+ select SENSORS_DELL_SMM
---help---
- This adds a driver to safely access the System Management Mode
- of the CPU on the Dell Inspiron 8000. The System Management Mode
- is used to read cpu temperature and cooling fan status and to
- control the fans on the I8K portables.
-
- This driver has been tested only on the Inspiron 8000 but it may
- also work with other Dell laptops. You can force loading on other
- models by passing the parameter `force=1' to the module. Use at
- your own risk.
-
- For information on utilities to make use of this driver see the
- I8K Linux utilities web site at:
- <http://people.debian.org/~dz/i8k/>
+ This option enables legacy /proc/i8k userspace interface in hwmon
+ dell-smm-hwmon driver. Character file /proc/i8k reports bios version,
+ temperature and allows controlling fan speeds of Dell laptops via
+ System Management Mode. For old Dell laptops (like Dell Inspiron 8000)
+ it reports also power and hotkey status. For fan speed control is
+ needed userspace package i8kutils.
- Say Y if you intend to run this kernel on a Dell Inspiron 8000.
+ Say Y if you intend to run this kernel on old Dell laptops or want to
+ use userspace package i8kutils.
Say N otherwise.
config X86_REBOOTFIXUPS
@@ -1422,6 +1428,9 @@ source "mm/Kconfig"
config X86_PMEM_LEGACY
bool "Support non-standard NVDIMMs and ADR protected memory"
+ depends on PHYS_ADDR_T_64BIT
+ depends on BLK_DEV
+ select LIBNVDIMM
help
Treat memory marked using the non-standard e820 type of 12 as used
by the Intel Sandy Bridge-EP reference BIOS as protected memory.
@@ -2012,7 +2021,7 @@ config CMDLINE_BOOL
To compile command line arguments into the kernel,
set this option to 'Y', then fill in the
- the boot arguments in CONFIG_CMDLINE.
+ boot arguments in CONFIG_CMDLINE.
Systems with fully functional boot loaders (i.e. non-embedded)
should leave this option set to 'N'.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index a15893d17c55..d8c0d3266173 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -297,6 +297,18 @@ config OPTIMIZE_INLINING
If unsure, say N.
+config DEBUG_ENTRY
+ bool "Debug low-level entry code"
+ depends on DEBUG_KERNEL
+ ---help---
+ This option enables sanity checks in x86's low-level entry code.
+ Some of these sanity checks may slow down kernel entries and
+ exits or otherwise impact performance.
+
+ This is currently used to help test NMI code.
+
+ If unsure, say N.
+
config DEBUG_NMI_SELFTEST
bool "NMI Selftest"
depends on DEBUG_KERNEL && X86_LOCAL_APIC
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 48304b89b601..2c82bd150d43 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1224,6 +1224,10 @@ static efi_status_t setup_e820(struct boot_params *params,
e820_type = E820_NVS;
break;
+ case EFI_PERSISTENT_MEMORY:
+ e820_type = E820_PMEM;
+ break;
+
default:
continue;
}
diff --git a/arch/x86/configs/xen.config b/arch/x86/configs/xen.config
new file mode 100644
index 000000000000..d9fc7139fd46
--- /dev/null
+++ b/arch/x86/configs/xen.config
@@ -0,0 +1,28 @@
+# global x86 required specific stuff
+# On 32-bit HIGHMEM4G is not allowed
+CONFIG_HIGHMEM64G=y
+CONFIG_64BIT=y
+
+# These enable us to allow some of the
+# not so generic stuff below
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PCI=y
+CONFIG_PCI_MSI=y
+CONFIG_X86_MCE=y
+CONFIG_ACPI_PROCESSOR=y
+CONFIG_CPU_FREQ=y
+
+# x86 xen specific config options
+CONFIG_XEN_PVH=y
+CONFIG_XEN_MAX_DOMAIN_MEMORY=500
+CONFIG_XEN_SAVE_RESTORE=y
+# CONFIG_XEN_DEBUG_FS is not set
+CONFIG_XEN_MCE_LOG=y
+CONFIG_XEN_ACPI_PROCESSOR=m
+# x86 specific backend drivers
+CONFIG_XEN_PCIDEV_BACKEND=m
+# x86 specific frontend drivers
+CONFIG_XEN_PCIDEV_FRONTEND=m
+# depends on MEMORY_HOTPLUG, arm64 doesn't enable this yet,
+# move to generic config if it ever does.
+CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2bfc8a7c88c1..dccad38b59a8 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -1537,7 +1537,7 @@ static void __exit aesni_exit(void)
crypto_fpu_exit();
}
-module_init(aesni_init);
+late_initcall(aesni_init);
module_exit(aesni_exit);
MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized");
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3bb2c4302df1..8cb3e438f21e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1237,11 +1237,12 @@ ENTRY(nmi)
* If the variable is not set and the stack is not the NMI
* stack then:
* o Set the special variable on the stack
- * o Copy the interrupt frame into a "saved" location on the stack
- * o Copy the interrupt frame into a "copy" location on the stack
+ * o Copy the interrupt frame into an "outermost" location on the
+ * stack
+ * o Copy the interrupt frame into an "iret" location on the stack
* o Continue processing the NMI
* If the variable is set or the previous stack is the NMI stack:
- * o Modify the "copy" location to jump to the repeate_nmi
+ * o Modify the "iret" location to jump to the repeat_nmi
* o return back to the first NMI
*
* Now on exit of the first NMI, we first clear the stack variable
@@ -1250,31 +1251,151 @@ ENTRY(nmi)
* a nested NMI that updated the copy interrupt stack frame, a
* jump will be made to the repeat_nmi code that will handle the second
* NMI.
+ *
+ * However, espfix prevents us from directly returning to userspace
+ * with a single IRET instruction. Similarly, IRET to user mode
+ * can fault. We therefore handle NMIs from user space like
+ * other IST entries.
*/
/* Use %rdx as our temp variable throughout */
pushq %rdx
+ testb $3, CS-RIP+8(%rsp)
+ jz .Lnmi_from_kernel
+
+ /*
+ * NMI from user mode. We need to run on the thread stack, but we
+ * can't go through the normal entry paths: NMIs are masked, and
+ * we don't want to enable interrupts, because then we'll end
+ * up in an awkward situation in which IRQs are on but NMIs
+ * are off.
+ */
+
+ SWAPGS
+ cld
+ movq %rsp, %rdx
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ pushq 5*8(%rdx) /* pt_regs->ss */
+ pushq 4*8(%rdx) /* pt_regs->rsp */
+ pushq 3*8(%rdx) /* pt_regs->flags */
+ pushq 2*8(%rdx) /* pt_regs->cs */
+ pushq 1*8(%rdx) /* pt_regs->rip */
+ pushq $-1 /* pt_regs->orig_ax */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq (%rdx) /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq %rax /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ pushq %r9 /* pt_regs->r9 */
+ pushq %r10 /* pt_regs->r10 */
+ pushq %r11 /* pt_regs->r11 */
+ pushq %rbx /* pt_regs->rbx */
+ pushq %rbp /* pt_regs->rbp */
+ pushq %r12 /* pt_regs->r12 */
+ pushq %r13 /* pt_regs->r13 */
+ pushq %r14 /* pt_regs->r14 */
+ pushq %r15 /* pt_regs->r15 */
+
+ /*
+ * At this point we no longer need to worry about stack damage
+ * due to nesting -- we're on the normal thread stack and we're
+ * done with the NMI stack.
+ */
+
+ movq %rsp, %rdi
+ movq $-1, %rsi
+ call do_nmi
+
+ /*
+ * Return back to user mode. We must *not* do the normal exit
+ * work, because we don't want to enable interrupts. Fortunately,
+ * do_nmi doesn't modify pt_regs.
+ */
+ SWAPGS
+ jmp restore_c_regs_and_iret
+
+.Lnmi_from_kernel:
+ /*
+ * Here's what our stack frame will look like:
+ * +---------------------------------------------------------+
+ * | original SS |
+ * | original Return RSP |
+ * | original RFLAGS |
+ * | original CS |
+ * | original RIP |
+ * +---------------------------------------------------------+
+ * | temp storage for rdx |
+ * +---------------------------------------------------------+
+ * | "NMI executing" variable |
+ * +---------------------------------------------------------+
+ * | iret SS } Copied from "outermost" frame |
+ * | iret Return RSP } on each loop iteration; overwritten |
+ * | iret RFLAGS } by a nested NMI to force another |
+ * | iret CS } iteration if needed. |
+ * | iret RIP } |
+ * +---------------------------------------------------------+
+ * | outermost SS } initialized in first_nmi; |
+ * | outermost Return RSP } will not be changed before |
+ * | outermost RFLAGS } NMI processing is done. |
+ * | outermost CS } Copied to "iret" frame on each |
+ * | outermost RIP } iteration. |
+ * +---------------------------------------------------------+
+ * | pt_regs |
+ * +---------------------------------------------------------+
+ *
+ * The "original" frame is used by hardware. Before re-enabling
+ * NMIs, we need to be done with it, and we need to leave enough
+ * space for the asm code here.
+ *
+ * We return by executing IRET while RSP points to the "iret" frame.
+ * That will either return for real or it will loop back into NMI
+ * processing.
+ *
+ * The "outermost" frame is copied to the "iret" frame on each
+ * iteration of the loop, so each iteration starts with the "iret"
+ * frame pointing to the final return target.
+ */
+
/*
- * If %cs was not the kernel segment, then the NMI triggered in user
- * space, which means it is definitely not nested.
+ * Determine whether we're a nested NMI.
+ *
+ * If we interrupted kernel code between repeat_nmi and
+ * end_repeat_nmi, then we are a nested NMI. We must not
+ * modify the "iret" frame because it's being written by
+ * the outer NMI. That's okay; the outer NMI handler is
+ * about to about to call do_nmi anyway, so we can just
+ * resume the outer NMI.
*/
- cmpl $__KERNEL_CS, 16(%rsp)
- jne first_nmi
+
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
+1:
/*
- * Check the special variable on the stack to see if NMIs are
- * executing.
+ * Now check "NMI executing". If it's set, then we're nested.
+ * This will not detect if we interrupted an outer NMI just
+ * before IRET.
*/
cmpl $1, -8(%rsp)
je nested_nmi
/*
- * Now test if the previous stack was an NMI stack.
- * We need the double check. We check the NMI stack to satisfy the
- * race when the first NMI clears the variable before returning.
- * We check the variable because the first NMI could be in a
- * breakpoint routine using a breakpoint stack.
+ * Now test if the previous stack was an NMI stack. This covers
+ * the case where we interrupt an outer NMI after it clears
+ * "NMI executing" but before IRET. We need to be careful, though:
+ * there is one case in which RSP could point to the NMI stack
+ * despite there being no NMI active: naughty userspace controls
+ * RSP at the very beginning of the SYSCALL targets. We can
+ * pull a fast one on naughty userspace, though: we program
+ * SYSCALL to mask DF, so userspace cannot cause DF to be set
+ * if it controls the kernel's RSP. We set DF before we clear
+ * "NMI executing".
*/
lea 6*8(%rsp), %rdx
/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
@@ -1286,25 +1407,20 @@ ENTRY(nmi)
cmpq %rdx, 4*8(%rsp)
/* If it is below the NMI stack, it is a normal NMI */
jb first_nmi
- /* Ah, it is within the NMI stack, treat it as nested */
+
+ /* Ah, it is within the NMI stack. */
+
+ testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
+ jz first_nmi /* RSP was user controlled. */
+
+ /* This is a nested NMI. */
nested_nmi:
/*
- * Do nothing if we interrupted the fixup in repeat_nmi.
- * It's about to repeat the NMI handler, so we are fine
- * with ignoring this one.
+ * Modify the "iret" frame to point to repeat_nmi, forcing another
+ * iteration of NMI handling.
*/
- movq $repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja 1f
- movq $end_repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja nested_nmi_out
-
-1:
- /* Set up the interrupted NMIs stack to jump to repeat_nmi */
- leaq -1*8(%rsp), %rdx
- movq %rdx, %rsp
+ subq $8, %rsp
leaq -10*8(%rsp), %rdx
pushq $__KERNEL_DS
pushq %rdx
@@ -1318,61 +1434,42 @@ nested_nmi:
nested_nmi_out:
popq %rdx
- /* No need to check faults here */
+ /* We are returning to kernel mode, so this cannot result in a fault. */
INTERRUPT_RETURN
first_nmi:
- /*
- * Because nested NMIs will use the pushed location that we
- * stored in rdx, we must keep that space available.
- * Here's what our stack frame will look like:
- * +-------------------------+
- * | original SS |
- * | original Return RSP |
- * | original RFLAGS |
- * | original CS |
- * | original RIP |
- * +-------------------------+
- * | temp storage for rdx |
- * +-------------------------+
- * | NMI executing variable |
- * +-------------------------+
- * | copied SS |
- * | copied Return RSP |
- * | copied RFLAGS |
- * | copied CS |
- * | copied RIP |
- * +-------------------------+
- * | Saved SS |
- * | Saved Return RSP |
- * | Saved RFLAGS |
- * | Saved CS |
- * | Saved RIP |
- * +-------------------------+
- * | pt_regs |
- * +-------------------------+
- *
- * The saved stack frame is used to fix up the copied stack frame
- * that a nested NMI may change to make the interrupted NMI iret jump
- * to the repeat_nmi. The original stack frame and the temp storage
- * is also used by nested NMIs and can not be trusted on exit.
- */
- /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
+ /* Restore rdx. */
movq (%rsp), %rdx
- /* Set the NMI executing variable on the stack. */
- pushq $1
+ /* Make room for "NMI executing". */
+ pushq $0
- /* Leave room for the "copied" frame */
+ /* Leave room for the "iret" frame */
subq $(5*8), %rsp
- /* Copy the stack frame to the Saved frame */
+ /* Copy the "original" frame to the "outermost" frame */
.rept 5
pushq 11*8(%rsp)
.endr
/* Everything up to here is safe from nested NMIs */
+#ifdef CONFIG_DEBUG_ENTRY
+ /*
+ * For ease of testing, unmask NMIs right away. Disabled by
+ * default because IRET is very expensive.
+ */
+ pushq $0 /* SS */
+ pushq %rsp /* RSP (minus 8 because of the previous push) */
+ addq $8, (%rsp) /* Fix up RSP */
+ pushfq /* RFLAGS */
+ pushq $__KERNEL_CS /* CS */
+ pushq $1f /* RIP */
+ INTERRUPT_RETURN /* continues at repeat_nmi below */
+1:
+#endif
+
+repeat_nmi:
/*
* If there was a nested NMI, the first NMI's iret will return
* here. But NMIs are still enabled and we can take another
@@ -1381,16 +1478,20 @@ first_nmi:
* it will just return, as we are about to repeat an NMI anyway.
* This makes it safe to copy to the stack frame that a nested
* NMI will update.
+ *
+ * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
+ * we're repeating an NMI, gsbase has the same value that it had on
+ * the first iteration. paranoid_entry will load the kernel
+ * gsbase if needed before we call do_nmi. "NMI executing"
+ * is zero.
*/
-repeat_nmi:
+ movq $1, 10*8(%rsp) /* Set "NMI executing". */
+
/*
- * Update the stack variable to say we are still in NMI (the update
- * is benign for the non-repeat case, where 1 was pushed just above
- * to this very stack slot).
+ * Copy the "outermost" frame to the "iret" frame. NMIs that nest
+ * here must not modify the "iret" frame while we're writing to
+ * it or it will end up containing garbage.
*/
- movq $1, 10*8(%rsp)
-
- /* Make another copy, this one may be modified by nested NMIs */
addq $(10*8), %rsp
.rept 5
pushq -6*8(%rsp)
@@ -1399,9 +1500,9 @@ repeat_nmi:
end_repeat_nmi:
/*
- * Everything below this point can be preempted by a nested
- * NMI if the first NMI took an exception and reset our iret stack
- * so that we repeat another NMI.
+ * Everything below this point can be preempted by a nested NMI.
+ * If this happens, then the inner NMI will change the "iret"
+ * frame to point back to repeat_nmi.
*/
pushq $-1 /* ORIG_RAX: no syscall to restart */
ALLOC_PT_GPREGS_ON_STACK
@@ -1415,28 +1516,11 @@ end_repeat_nmi:
*/
call paranoid_entry
- /*
- * Save off the CR2 register. If we take a page fault in the NMI then
- * it could corrupt the CR2 value. If the NMI preempts a page fault
- * handler before it was able to read the CR2 register, and then the
- * NMI itself takes a page fault, the page fault that was preempted
- * will read the information from the NMI page fault and not the
- * origin fault. Save it off and restore it if it changes.
- * Use the r12 callee-saved register.
- */
- movq %cr2, %r12
-
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp, %rdi
movq $-1, %rsi
call do_nmi
- /* Did the NMI take a page fault? Restore cr2 if it did */
- movq %cr2, %rcx
- cmpq %rcx, %r12
- je 1f
- movq %r12, %cr2
-1:
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
@@ -1444,11 +1528,26 @@ nmi_swapgs:
nmi_restore:
RESTORE_EXTRA_REGS
RESTORE_C_REGS
- /* Pop the extra iret frame at once */
+
+ /* Point RSP at the "iret" frame. */
REMOVE_PT_GPREGS_FROM_STACK 6*8
- /* Clear the NMI executing stack variable */
- movq $0, 5*8(%rsp)
+ /*
+ * Clear "NMI executing". Set DF first so that we can easily
+ * distinguish the remaining code between here and IRET from
+ * the SYSCALL entry and exit paths. On a native kernel, we
+ * could just inspect RIP, but, on paravirt kernels,
+ * INTERRUPT_RETURN can translate into a jump into a
+ * hypercall page.
+ */
+ std
+ movq $0, 5*8(%rsp) /* clear "NMI executing" */
+
+ /*
+ * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
+ * stack in a single instruction. We are returning to kernel
+ * mode, so this cannot result in a fault.
+ */
INTERRUPT_RETURN
END(nmi)
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index d55a210a49bf..aeac434c9feb 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -9,4 +9,4 @@ generic-y += cputime.h
generic-y += dma-contiguous.h
generic-y += early_ioremap.h
generic-y += mcs_spinlock.h
-generic-y += scatterlist.h
+generic-y += mm-arch-hooks.h
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index b6f7457d12e4..9bf3ea14b9f0 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -4,6 +4,7 @@
/* Caches aren't brain-dead on the intel. */
#include <asm-generic/cacheflush.h>
#include <asm/special_insns.h>
+#include <asm/uaccess.h>
/*
* The set_memory_* API can be used to change various attributes of a virtual
@@ -108,4 +109,75 @@ static inline int rodata_test(void)
}
#endif
+#ifdef ARCH_HAS_NOCACHE_UACCESS
+
+/**
+ * arch_memcpy_to_pmem - copy data to persistent memory
+ * @dst: destination buffer for the copy
+ * @src: source buffer for the copy
+ * @n: length of the copy in bytes
+ *
+ * Copy data to persistent memory media via non-temporal stores so that
+ * a subsequent arch_wmb_pmem() can flush cpu and memory controller
+ * write buffers to guarantee durability.
+ */
+static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
+ size_t n)
+{
+ int unwritten;
+
+ /*
+ * We are copying between two kernel buffers, if
+ * __copy_from_user_inatomic_nocache() returns an error (page
+ * fault) we would have already reported a general protection fault
+ * before the WARN+BUG.
+ */
+ unwritten = __copy_from_user_inatomic_nocache((void __force *) dst,
+ (void __user *) src, n);
+ if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
+ __func__, dst, src, unwritten))
+ BUG();
+}
+
+/**
+ * arch_wmb_pmem - synchronize writes to persistent memory
+ *
+ * After a series of arch_memcpy_to_pmem() operations this drains data
+ * from cpu write buffers and any platform (memory controller) buffers
+ * to ensure that written data is durable on persistent memory media.
+ */
+static inline void arch_wmb_pmem(void)
+{
+ /*
+ * wmb() to 'sfence' all previous writes such that they are
+ * architecturally visible to 'pcommit'. Note, that we've
+ * already arranged for pmem writes to avoid the cache via
+ * arch_memcpy_to_pmem().
+ */
+ wmb();
+ pcommit_sfence();
+}
+
+static inline bool __arch_has_wmb_pmem(void)
+{
+#ifdef CONFIG_X86_64
+ /*
+ * We require that wmb() be an 'sfence', that is only guaranteed on
+ * 64-bit builds
+ */
+ return static_cpu_has(X86_FEATURE_PCOMMIT);
+#else
+ return false;
+#endif
+}
+#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */
+extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n);
+extern void arch_wmb_pmem(void);
+
+static inline bool __arch_has_wmb_pmem(void)
+{
+ return false;
+}
+#endif
+
#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/include/asm/edac.h b/arch/x86/include/asm/edac.h
index e9b57ecc70c5..cf8fdf83b231 100644
--- a/arch/x86/include/asm/edac.h
+++ b/arch/x86/include/asm/edac.h
@@ -3,7 +3,7 @@
/* ECC atomic, DMA, SMP and interrupt safe scrub function */
-static inline void atomic_scrub(void *va, u32 size)
+static inline void edac_atomic_scrub(void *va, u32 size)
{
u32 i, *virt_addr = va;
diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
index 99efebb2f69d..ca3ce9ab9385 100644
--- a/arch/x86/include/asm/espfix.h
+++ b/arch/x86/include/asm/espfix.h
@@ -9,7 +9,7 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
extern void init_espfix_bsp(void);
-extern void init_espfix_ap(void);
+extern void init_espfix_ap(int cpu);
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 0637826292de..c49c5173158e 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -189,6 +189,7 @@ union fpregs_state {
struct fxregs_state fxsave;
struct swregs_state soft;
struct xregs_state xsave;
+ u8 __padding[PAGE_SIZE];
};
/*
@@ -198,40 +199,6 @@ union fpregs_state {
*/
struct fpu {
/*
- * @state:
- *
- * In-memory copy of all FPU registers that we save/restore
- * over context switches. If the task is using the FPU then
- * the registers in the FPU are more recent than this state
- * copy. If the task context-switches away then they get
- * saved here and represent the FPU state.
- *
- * After context switches there may be a (short) time period
- * during which the in-FPU hardware registers are unchanged
- * and still perfectly match this state, if the tasks
- * scheduled afterwards are not using the FPU.
- *
- * This is the 'lazy restore' window of optimization, which
- * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
- *
- * We detect whether a subsequent task uses the FPU via setting
- * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
- *
- * During this window, if the task gets scheduled again, we
- * might be able to skip having to do a restore from this
- * memory buffer to the hardware registers - at the cost of
- * incurring the overhead of #NM fault traps.
- *
- * Note that on modern CPUs that support the XSAVEOPT (or other
- * optimized XSAVE instructions), we don't use #NM traps anymore,
- * as the hardware can track whether FPU registers need saving
- * or not. On such CPUs we activate the non-lazy ('eagerfpu')
- * logic, which unconditionally saves/restores all FPU state
- * across context switches. (if FPU state exists.)
- */
- union fpregs_state state;
-
- /*
* @last_cpu:
*
* Records the last CPU on which this context was loaded into
@@ -288,6 +255,43 @@ struct fpu {
* deal with bursty apps that only use the FPU for a short time:
*/
unsigned char counter;
+ /*
+ * @state:
+ *
+ * In-memory copy of all FPU registers that we save/restore
+ * over context switches. If the task is using the FPU then
+ * the registers in the FPU are more recent than this state
+ * copy. If the task context-switches away then they get
+ * saved here and represent the FPU state.
+ *
+ * After context switches there may be a (short) time period
+ * during which the in-FPU hardware registers are unchanged
+ * and still perfectly match this state, if the tasks
+ * scheduled afterwards are not using the FPU.
+ *
+ * This is the 'lazy restore' window of optimization, which
+ * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
+ *
+ * We detect whether a subsequent task uses the FPU via setting
+ * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
+ *
+ * During this window, if the task gets scheduled again, we
+ * might be able to skip having to do a restore from this
+ * memory buffer to the hardware registers - at the cost of
+ * incurring the overhead of #NM fault traps.
+ *
+ * Note that on modern CPUs that support the XSAVEOPT (or other
+ * optimized XSAVE instructions), we don't use #NM traps anymore,
+ * as the hardware can track whether FPU registers need saving
+ * or not. On such CPUs we activate the non-lazy ('eagerfpu')
+ * logic, which unconditionally saves/restores all FPU state
+ * across context switches. (if FPU state exists.)
+ */
+ union fpregs_state state;
+ /*
+ * WARNING: 'state' is dynamically-sized. Do not put
+ * anything after it here.
+ */
};
#endif /* _ASM_X86_FPU_H */
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index 68c05398bba9..f8a29d2c97b0 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file,
return 0;
}
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) {
-}
-
static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
unsigned long floor,
@@ -83,15 +80,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
return *ptep;
}
-static inline int arch_prepare_hugepage(struct page *page)
-{
- return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
}
diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h
new file mode 100644
index 000000000000..cd0310e186f4
--- /dev/null
+++ b/arch/x86/include/asm/intel_pmc_ipc.h
@@ -0,0 +1,55 @@
+#ifndef _ASM_X86_INTEL_PMC_IPC_H_
+#define _ASM_X86_INTEL_PMC_IPC_H_
+
+/* Commands */
+#define PMC_IPC_PMIC_ACCESS 0xFF
+#define PMC_IPC_PMIC_ACCESS_READ 0x0
+#define PMC_IPC_PMIC_ACCESS_WRITE 0x1
+#define PMC_IPC_USB_PWR_CTRL 0xF0
+#define PMC_IPC_PMIC_BLACKLIST_SEL 0xEF
+#define PMC_IPC_PHY_CONFIG 0xEE
+#define PMC_IPC_NORTHPEAK_CTRL 0xED
+#define PMC_IPC_PM_DEBUG 0xEC
+#define PMC_IPC_PMC_TELEMTRY 0xEB
+#define PMC_IPC_PMC_FW_MSG_CTRL 0xEA
+
+/* IPC return code */
+#define IPC_ERR_NONE 0
+#define IPC_ERR_CMD_NOT_SUPPORTED 1
+#define IPC_ERR_CMD_NOT_SERVICED 2
+#define IPC_ERR_UNABLE_TO_SERVICE 3
+#define IPC_ERR_CMD_INVALID 4
+#define IPC_ERR_CMD_FAILED 5
+#define IPC_ERR_EMSECURITY 6
+#define IPC_ERR_UNSIGNEDKERNEL 7
+
+#if IS_ENABLED(CONFIG_INTEL_PMC_IPC)
+
+int intel_pmc_ipc_simple_command(int cmd, int sub);
+int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen,
+ u32 *out, u32 outlen, u32 dptr, u32 sptr);
+int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen,
+ u32 *out, u32 outlen);
+
+#else
+
+static inline int intel_pmc_ipc_simple_command(int cmd, int sub)
+{
+ return -EINVAL;
+}
+
+static inline int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen,
+ u32 *out, u32 outlen, u32 dptr, u32 sptr)
+{
+ return -EINVAL;
+}
+
+static inline int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen,
+ u32 *out, u32 outlen)
+{
+ return -EINVAL;
+}
+
+#endif /*CONFIG_INTEL_PMC_IPC*/
+
+#endif
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 83ec9b1d77cc..cc9c61bc1abe 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -248,6 +248,12 @@ static inline void flush_write_buffers(void)
#endif
}
+static inline void __pmem *arch_memremap_pmem(resource_size_t offset,
+ unsigned long size)
+{
+ return (void __force __pmem *) ioremap_cache(offset, size);
+}
+
#endif /* __KERNEL__ */
extern void native_io_delay(void);
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
index 8b22422fbad8..74a2a8dc9908 100644
--- a/arch/x86/include/asm/kasan.h
+++ b/arch/x86/include/asm/kasan.h
@@ -14,15 +14,11 @@
#ifndef __ASSEMBLY__
-extern pte_t kasan_zero_pte[];
-extern pte_t kasan_zero_pmd[];
-extern pte_t kasan_zero_pud[];
-
#ifdef CONFIG_KASAN
-void __init kasan_map_early_shadow(pgd_t *pgd);
+void __init kasan_early_init(void);
void __init kasan_init(void);
#else
-static inline void kasan_map_early_shadow(pgd_t *pgd) { }
+static inline void kasan_early_init(void) { }
static inline void kasan_init(void) { }
#endif
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 57a9d94fe160..e16466ec473c 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -193,6 +193,8 @@ struct x86_emulate_ops {
int (*cpl)(struct x86_emulate_ctxt *ctxt);
int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
+ u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt);
+ void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase);
int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
@@ -262,6 +264,11 @@ enum x86emul_mode {
X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */
};
+/* These match some of the HF_* flags defined in kvm_host.h */
+#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
+#define X86EMUL_SMM_MASK (1 << 6)
+#define X86EMUL_SMM_INSIDE_NMI_MASK (1 << 7)
+
struct x86_emulate_ctxt {
const struct x86_emulate_ops *ops;
@@ -273,8 +280,8 @@ struct x86_emulate_ctxt {
/* interruptibility state, as a result of execution of STI or MOV SS */
int interruptibility;
+ int emul_flags;
- bool guest_mode; /* guest running a nested guest */
bool perm_ok; /* do not check permissions if true */
bool ud; /* inject an #UD if host doesn't support insn */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f8c0ec3a4a97..49ec9038ec14 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -184,23 +184,12 @@ struct kvm_mmu_memory_cache {
void *objects[KVM_NR_MEM_OBJS];
};
-/*
- * kvm_mmu_page_role, below, is defined as:
- *
- * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
- * bits 4:7 - page table level for this shadow (1-4)
- * bits 8:9 - page table quadrant for 2-level guests
- * bit 16 - direct mapping of virtual to physical mapping at gfn
- * used for real mode and two-dimensional paging
- * bits 17:19 - common access permissions for all ptes in this shadow page
- */
union kvm_mmu_page_role {
unsigned word;
struct {
unsigned level:4;
unsigned cr4_pae:1;
unsigned quadrant:2;
- unsigned pad_for_nice_hex_output:6;
unsigned direct:1;
unsigned access:3;
unsigned invalid:1;
@@ -208,6 +197,15 @@ union kvm_mmu_page_role {
unsigned cr0_wp:1;
unsigned smep_andnot_wp:1;
unsigned smap_andnot_wp:1;
+ unsigned :8;
+
+ /*
+ * This is left at the top of the word so that
+ * kvm_memslots_for_spte_role can extract it with a
+ * simple shift. While there is room, give it a whole
+ * byte so it is also faster to load it from memory.
+ */
+ unsigned smm:8;
};
};
@@ -338,12 +336,28 @@ struct kvm_pmu {
u64 reprogram_pmi;
};
+struct kvm_pmu_ops;
+
enum {
KVM_DEBUGREG_BP_ENABLED = 1,
KVM_DEBUGREG_WONT_EXIT = 2,
KVM_DEBUGREG_RELOAD = 4,
};
+struct kvm_mtrr_range {
+ u64 base;
+ u64 mask;
+ struct list_head node;
+};
+
+struct kvm_mtrr {
+ struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR];
+ mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION];
+ u64 deftype;
+
+ struct list_head head;
+};
+
struct kvm_vcpu_arch {
/*
* rip and regs accesses must go through
@@ -368,6 +382,7 @@ struct kvm_vcpu_arch {
int32_t apic_arb_prio;
int mp_state;
u64 ia32_misc_enable_msr;
+ u64 smbase;
bool tpr_access_reporting;
u64 ia32_xss;
@@ -471,8 +486,9 @@ struct kvm_vcpu_arch {
atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
unsigned nmi_pending; /* NMI queued after currently running handler */
bool nmi_injected; /* Trying to inject an NMI this entry */
+ bool smi_pending; /* SMI queued after currently running handler */
- struct mtrr_state_type mtrr_state;
+ struct kvm_mtrr mtrr_state;
u64 pat;
unsigned switch_db_regs;
@@ -588,10 +604,12 @@ struct kvm_arch {
bool iommu_noncoherent;
#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
atomic_t noncoherent_dma_count;
+#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
+ atomic_t assigned_device_count;
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
- int vapics_in_nmi_mode;
+ atomic_t vapics_in_nmi_mode;
struct mutex apic_map_lock;
struct kvm_apic_map *apic_map;
@@ -637,6 +655,8 @@ struct kvm_arch {
#endif
bool boot_vcpu_runs_old_kvmclock;
+
+ u64 disabled_quirks;
};
struct kvm_vm_stat {
@@ -689,12 +709,13 @@ struct msr_data {
struct kvm_lapic_irq {
u32 vector;
- u32 delivery_mode;
- u32 dest_mode;
- u32 level;
- u32 trig_mode;
+ u16 delivery_mode;
+ u16 dest_mode;
+ bool level;
+ u16 trig_mode;
u32 shorthand;
u32 dest_id;
+ bool msi_redir_hint;
};
struct kvm_x86_ops {
@@ -706,19 +727,20 @@ struct kvm_x86_ops {
int (*hardware_setup)(void); /* __init */
void (*hardware_unsetup)(void); /* __exit */
bool (*cpu_has_accelerated_tpr)(void);
+ bool (*cpu_has_high_real_mode_segbase)(void);
void (*cpuid_update)(struct kvm_vcpu *vcpu);
/* Create, but do not attach this VCPU */
struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
void (*vcpu_free)(struct kvm_vcpu *vcpu);
- void (*vcpu_reset)(struct kvm_vcpu *vcpu);
+ void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
- int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+ int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
void (*get_segment)(struct kvm_vcpu *vcpu,
@@ -836,6 +858,8 @@ struct kvm_x86_ops {
void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t offset, unsigned long mask);
+ /* pmu operations of sub-arch */
+ const struct kvm_pmu_ops *pmu_ops;
};
struct kvm_arch_async_pf {
@@ -871,7 +895,7 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot);
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
- struct kvm_memory_slot *memslot);
+ const struct kvm_memory_slot *memslot);
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot);
void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
@@ -882,7 +906,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask);
void kvm_mmu_zap_all(struct kvm *kvm);
-void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm);
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots);
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
@@ -890,7 +914,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
-u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
struct kvm_irq_mask_notifier {
void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
@@ -938,7 +961,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
void kvm_enable_efer_bits(u64);
bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
+int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
struct x86_emulate_ctxt;
@@ -967,7 +990,7 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
@@ -1110,6 +1133,14 @@ enum {
#define HF_NMI_MASK (1 << 3)
#define HF_IRET_MASK (1 << 4)
#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
+#define HF_SMM_MASK (1 << 6)
+#define HF_SMM_INSIDE_NMI_MASK (1 << 7)
+
+#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
+#define KVM_ADDRESS_SPACE_NUM 2
+
+#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
+#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
/*
* Hardware virtualization extension instructions may fault if a
@@ -1144,7 +1175,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
-void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
unsigned long address);
@@ -1168,16 +1199,9 @@ void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
int kvm_is_in_guest(void);
-void kvm_pmu_init(struct kvm_vcpu *vcpu);
-void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
-void kvm_pmu_reset(struct kvm_vcpu *vcpu);
-void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu);
-bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
-int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
-int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
-int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc);
-int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
-void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
-void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
+int __x86_set_memory_region(struct kvm *kvm,
+ const struct kvm_userspace_memory_region *mem);
+int x86_set_memory_region(struct kvm *kvm,
+ const struct kvm_userspace_memory_region *mem);
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
index 2d29197bd2fb..19c099afa861 100644
--- a/arch/x86/include/asm/livepatch.h
+++ b/arch/x86/include/asm/livepatch.h
@@ -21,6 +21,7 @@
#ifndef _ASM_X86_LIVEPATCH_H
#define _ASM_X86_LIVEPATCH_H
+#include <asm/setup.h>
#include <linux/module.h>
#include <linux/ftrace.h>
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5e8daee7c5c9..804a3a6030ca 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -23,7 +23,7 @@ extern struct static_key rdpmc_always_available;
static inline void load_mm_cr4(struct mm_struct *mm)
{
- if (static_key_true(&rdpmc_always_available) ||
+ if (static_key_false(&rdpmc_always_available) ||
atomic_read(&mm->context.perf_rdpmc_allowed))
cr4_set_bits(X86_CR4_PCE);
else
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index b962e0fe5658..462594320d39 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -5,7 +5,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/string.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
#include <asm/io.h>
#include <asm/x86_init.h>
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 2562e303405b..867da5bbb4a3 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -805,8 +805,8 @@ static inline int pmd_write(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_RW;
}
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
{
pmd_t pmd = native_pmdp_get_and_clear(pmdp);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 43e6519df0d5..944f1785ed0d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -390,9 +390,6 @@ struct thread_struct {
#endif
unsigned long gs;
- /* Floating point and extended processor state */
- struct fpu fpu;
-
/* Save middle states of ptrace breakpoints */
struct perf_event *ptrace_bps[HBP_NUM];
/* Debug status used for traps, single steps, etc... */
@@ -418,6 +415,13 @@ struct thread_struct {
unsigned long iopl;
/* Max allowed port in the bitmap, in bytes: */
unsigned io_bitmap_max;
+
+ /* Floating point and extended processor state */
+ struct fpu fpu;
+ /*
+ * WARNING: 'fpu' is dynamically-sized. It *MUST* be at
+ * the end.
+ */
};
/*
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 6167fd798188..655e07a48f6c 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -41,5 +41,6 @@ struct pvclock_wall_clock {
#define PVCLOCK_TSC_STABLE_BIT (1 << 0)
#define PVCLOCK_GUEST_STOPPED (1 << 1)
+#define PVCLOCK_COUNTS_FROM_ZERO (1 << 2)
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index d6b078e9fa28..628954ceede1 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -86,7 +86,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
offset = pvclock_get_nsec_offset(src);
ret = src->system_time + offset;
ret_flags = src->flags;
- rdtsc_barrier();
*cycles = ret;
*flags = ret_flags;
diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h
index 8378b8c9109c..bb658211edad 100644
--- a/arch/x86/include/asm/serial.h
+++ b/arch/x86/include/asm/serial.h
@@ -11,7 +11,7 @@
#define BASE_BAUD (1843200/16)
/* Standard COM flags (except for COM4, because of the 8514 problem) */
-#ifdef CONFIG_SERIAL_DETECT_IRQ
+#ifdef CONFIG_SERIAL_8250_DETECT_IRQ
# define STD_COMX_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | UPF_AUTO_IRQ)
# define STD_COM4_FLAGS (UPF_BOOT_AUTOCONF | 0 | UPF_AUTO_IRQ)
#else
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index f69e06b283fb..11af24e09c8a 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -60,17 +60,24 @@ static inline void x86_ce4100_early_setup(void) { }
#ifndef _SETUP
#include <asm/espfix.h>
+#include <linux/kernel.h>
/*
* This is set up by the setup-routine at boot-time
*/
extern struct boot_params boot_params;
+extern char _text[];
static inline bool kaslr_enabled(void)
{
return !!(boot_params.hdr.loadflags & KASLR_FLAG);
}
+static inline unsigned long kaslr_offset(void)
+{
+ return (unsigned long)&_text - __START_KERNEL;
+}
+
/*
* Do NOT EVER look at the BIOS memory size location.
* It does not work on many machines.
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index 960a8a9dc4ab..0f457e6eab18 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -32,6 +32,7 @@
#define E820_ACPI 3
#define E820_NVS 4
#define E820_UNUSABLE 5
+#define E820_PMEM 7
/*
* This is a non-standardized way to represent ADR or NVDIMM regions that
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index ce6068dbcfbc..f36d56bd7632 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -108,6 +108,8 @@
#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4)
/* Support for a virtual guest idle state is available */
#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5)
+/* Guest crash data handler available */
+#define HV_X64_GUEST_CRASH_MSR_AVAILABLE (1 << 10)
/*
* Implementation recommendations. Indicates which behaviors the hypervisor
@@ -199,6 +201,17 @@
#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6
#define HV_X64_MSR_STIMER3_COUNT 0x400000B7
+/* Hyper-V guest crash notification MSR's */
+#define HV_X64_MSR_CRASH_P0 0x40000100
+#define HV_X64_MSR_CRASH_P1 0x40000101
+#define HV_X64_MSR_CRASH_P2 0x40000102
+#define HV_X64_MSR_CRASH_P3 0x40000103
+#define HV_X64_MSR_CRASH_P4 0x40000104
+#define HV_X64_MSR_CRASH_CTL 0x40000105
+#define HV_X64_MSR_CRASH_CTL_NOTIFY (1ULL << 63)
+#define HV_X64_MSR_CRASH_PARAMS \
+ (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
+
#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index d7dcef58aefa..cd54147cb365 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -106,6 +106,8 @@ struct kvm_ioapic_state {
#define KVM_IRQCHIP_IOAPIC 2
#define KVM_NR_IRQCHIPS 3
+#define KVM_RUN_X86_SMM (1 << 0)
+
/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs {
/* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
@@ -281,6 +283,7 @@ struct kvm_reinject_control {
#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004
+#define KVM_VCPUEVENT_VALID_SMM 0x00000008
/* Interrupt shadow states */
#define KVM_X86_SHADOW_INT_MOV_SS 0x01
@@ -309,7 +312,13 @@ struct kvm_vcpu_events {
} nmi;
__u32 sipi_vector;
__u32 flags;
- __u32 reserved[10];
+ struct {
+ __u8 smm;
+ __u8 pending;
+ __u8 smm_inside_nmi;
+ __u8 latched_init;
+ } smi;
+ __u32 reserved[9];
};
/* for KVM_GET/SET_DEBUGREGS */
@@ -345,4 +354,7 @@ struct kvm_xcrs {
struct kvm_sync_regs {
};
+#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
+#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 665c6b7d2ea9..0c26b1b44e51 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -12,11 +12,13 @@ ENTRY(wakeup_pmode_return)
wakeup_pmode_return:
movw $__KERNEL_DS, %ax
movw %ax, %ss
- movw %ax, %ds
- movw %ax, %es
movw %ax, %fs
movw %ax, %gs
+ movw $__USER_DS, %ax
+ movw %ax, %ds
+ movw %ax, %es
+
# reload the gdt, as we need the full 32 bit address
lidt saved_idt
lldt saved_ldt
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 28eba2d38b15..f813261d9740 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -409,12 +409,6 @@ static void __setup_vector_irq(int cpu)
int irq, vector;
struct apic_chip_data *data;
- /*
- * vector_lock will make sure that we don't run into irq vector
- * assignments that might be happening on another cpu in parallel,
- * while we setup our initial vector to irq mappings.
- */
- raw_spin_lock(&vector_lock);
/* Mark the inuse vectors */
for_each_active_irq(irq) {
data = apic_chip_data(irq_get_irq_data(irq));
@@ -436,16 +430,16 @@ static void __setup_vector_irq(int cpu)
if (!cpumask_test_cpu(cpu, data->domain))
per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
}
- raw_spin_unlock(&vector_lock);
}
/*
- * Setup the vector to irq mappings.
+ * Setup the vector to irq mappings. Must be called with vector_lock held.
*/
void setup_vector_irq(int cpu)
{
int irq;
+ lockdep_assert_held(&vector_lock);
/*
* On most of the platforms, legacy PIC delivers the interrupts on the
* boot cpu. But there are certain platforms where PIC interrupts are
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index 5de7f4c56971..52c8e3c7789d 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -98,4 +98,4 @@ static int __init sbf_init(void)
return 0;
}
-module_init(sbf_init);
+arch_initcall(sbf_init);
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 83a7995625a6..58118e207a69 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -91,7 +91,8 @@ void __init setup_bios_corruption_check(void)
corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
- for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+ NULL) {
start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
PAGE_SIZE, corruption_check_size);
end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9fc5e3d9d9c8..922c5e0cea4c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -742,7 +742,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
cpu_detect(c);
get_cpu_vendor(c);
get_cpu_cap(c);
- fpu__init_system(c);
if (this_cpu->c_early_init)
this_cpu->c_early_init(c);
@@ -754,6 +753,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
this_cpu->c_bsp_init(c);
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
+ fpu__init_system(c);
}
void __init early_cpu_init(void)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5801a14f7524..3658de47900f 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -357,34 +357,24 @@ void x86_release_hardware(void)
*/
int x86_add_exclusive(unsigned int what)
{
- int ret = -EBUSY, i;
-
- if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what]))
- return 0;
+ int i;
- mutex_lock(&pmc_reserve_mutex);
- for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
- if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
- goto out;
+ if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
+ mutex_lock(&pmc_reserve_mutex);
+ for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
+ if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
+ goto fail_unlock;
+ }
+ atomic_inc(&x86_pmu.lbr_exclusive[what]);
+ mutex_unlock(&pmc_reserve_mutex);
}
- atomic_inc(&x86_pmu.lbr_exclusive[what]);
- ret = 0;
+ atomic_inc(&active_events);
+ return 0;
-out:
+fail_unlock:
mutex_unlock(&pmc_reserve_mutex);
-
- /*
- * Assuming that all exclusive events will share the PMI handler
- * (which checks active_events for whether there is work to do),
- * we can bump active_events counter right here, except for
- * x86_lbr_exclusive_lbr events that go through x86_pmu_event_init()
- * path, which already bumps active_events for them.
- */
- if (!ret && what != x86_lbr_exclusive_lbr)
- atomic_inc(&active_events);
-
- return ret;
+ return -EBUSY;
}
void x86_del_exclusive(unsigned int what)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
index 7795f3f8b1d5..43dd672d788b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_bts.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -530,5 +530,4 @@ static __init int bts_init(void)
return perf_pmu_register(&bts_pmu, "intel_bts", -1);
}
-
-module_init(bts_init);
+arch_initcall(bts_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index 159887c3a89d..183de719628d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -1106,5 +1106,4 @@ static __init int pt_init(void)
return ret;
}
-
-module_init(pt_init);
+arch_initcall(pt_init);
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 5ee771859b6f..1f4acd68b98b 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -65,7 +65,7 @@ static int __init add_bus_probe(void)
return of_platform_bus_probe(NULL, ce4100_ids, NULL);
}
-module_init(add_bus_probe);
+device_initcall(add_bus_probe);
#ifdef CONFIG_PCI
struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e2ce85db2283..a102564d08eb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -149,6 +149,7 @@ static void __init e820_print_type(u32 type)
case E820_UNUSABLE:
printk(KERN_CONT "unusable");
break;
+ case E820_PMEM:
case E820_PRAM:
printk(KERN_CONT "persistent (type %u)", type);
break;
@@ -918,11 +919,32 @@ static inline const char *e820_type_to_string(int e820_type)
case E820_ACPI: return "ACPI Tables";
case E820_NVS: return "ACPI Non-volatile Storage";
case E820_UNUSABLE: return "Unusable memory";
- case E820_PRAM: return "Persistent RAM";
+ case E820_PRAM: return "Persistent Memory (legacy)";
+ case E820_PMEM: return "Persistent Memory";
default: return "reserved";
}
}
+static bool do_mark_busy(u32 type, struct resource *res)
+{
+ /* this is the legacy bios/dos rom-shadow + mmio region */
+ if (res->start < (1ULL<<20))
+ return true;
+
+ /*
+ * Treat persistent memory like device memory, i.e. reserve it
+ * for exclusive use of a driver
+ */
+ switch (type) {
+ case E820_RESERVED:
+ case E820_PRAM:
+ case E820_PMEM:
+ return false;
+ default:
+ return true;
+ }
+}
+
/*
* Mark e820 reserved areas as busy for the resource manager.
*/
@@ -952,9 +974,7 @@ void __init e820_reserve_resources(void)
* pci device BAR resource and insert them later in
* pcibios_resource_survey()
*/
- if (((e820.map[i].type != E820_RESERVED) &&
- (e820.map[i].type != E820_PRAM)) ||
- res->start < (1ULL<<20)) {
+ if (do_mark_busy(e820.map[i].type, res)) {
res->flags |= IORESOURCE_BUSY;
insert_resource(&iomem_resource, res);
}
@@ -1123,7 +1143,8 @@ void __init memblock_find_dma_reserve(void)
nr_pages += end_pfn - start_pfn;
}
- for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
+ for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+ NULL) {
start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
if (start_pfn < end_pfn)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 5cb9a4d6f623..9f9cc682e561 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -546,6 +546,7 @@ static const struct pci_device_id intel_stolen_ids[] __initconst = {
INTEL_BDW_D_IDS(&gen8_stolen_funcs),
INTEL_CHV_IDS(&chv_stolen_funcs),
INTEL_SKL_IDS(&gen9_stolen_funcs),
+ INTEL_BXT_IDS(&gen9_stolen_funcs),
};
static void __init intel_graphics_stolen(int num, int slot, int func)
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 89427d8d4fc5..eec40f595ab9 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -175,7 +175,9 @@ static __init void early_serial_init(char *s)
}
if (*s) {
- if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
+ baud = simple_strtoull(s, &e, 0);
+
+ if (baud == 0 || s == e)
baud = DEFAULT_BAUD;
}
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index f5d0730e7b08..ce95676abd60 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -131,25 +131,24 @@ void __init init_espfix_bsp(void)
init_espfix_random();
/* The rest is the same as for any other processor */
- init_espfix_ap();
+ init_espfix_ap(0);
}
-void init_espfix_ap(void)
+void init_espfix_ap(int cpu)
{
- unsigned int cpu, page;
+ unsigned int page;
unsigned long addr;
pud_t pud, *pud_p;
pmd_t pmd, *pmd_p;
pte_t pte, *pte_p;
- int n;
+ int n, node;
void *stack_page;
pteval_t ptemask;
/* We only have to do this once... */
- if (likely(this_cpu_read(espfix_stack)))
+ if (likely(per_cpu(espfix_stack, cpu)))
return; /* Already initialized */
- cpu = smp_processor_id();
addr = espfix_base_addr(cpu);
page = cpu/ESPFIX_STACKS_PER_PAGE;
@@ -165,12 +164,15 @@ void init_espfix_ap(void)
if (stack_page)
goto unlock_done;
+ node = cpu_to_node(cpu);
ptemask = __supported_pte_mask;
pud_p = &espfix_pud_page[pud_index(addr)];
pud = *pud_p;
if (!pud_present(pud)) {
- pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
+ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
+
+ pmd_p = (pmd_t *)page_address(page);
pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PUD_CLONES; n++)
@@ -180,7 +182,9 @@ void init_espfix_ap(void)
pmd_p = pmd_offset(&pud, addr);
pmd = *pmd_p;
if (!pmd_present(pmd)) {
- pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
+ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
+
+ pte_p = (pte_t *)page_address(page);
pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PMD_CLONES; n++)
@@ -188,7 +192,7 @@ void init_espfix_ap(void)
}
pte_p = pte_offset_kernel(&pmd, addr);
- stack_page = (void *)__get_free_page(GFP_KERNEL);
+ stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
for (n = 0; n < ESPFIX_PTE_CLONES; n++)
set_pte(&pte_p[n*PTE_STRIDE], pte);
@@ -199,7 +203,7 @@ void init_espfix_ap(void)
unlock_done:
mutex_unlock(&espfix_init_mutex);
done:
- this_cpu_write(espfix_stack, addr);
- this_cpu_write(espfix_waddr, (unsigned long)stack_page
- + (addr & ~PAGE_MASK));
+ per_cpu(espfix_stack, cpu) = addr;
+ per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page
+ + (addr & ~PAGE_MASK);
}
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index fc878fee6a51..0b39173dd971 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -4,6 +4,8 @@
#include <asm/fpu/internal.h>
#include <asm/tlbflush.h>
+#include <linux/sched.h>
+
/*
* Initialize the TS bit in CR0 according to the style of context-switches
* we are using:
@@ -95,11 +97,12 @@ static void __init fpu__init_system_mxcsr(void)
unsigned int mask = 0;
if (cpu_has_fxsr) {
- struct fxregs_state fx_tmp __aligned(32) = { };
+ /* Static because GCC does not get 16-byte stack alignment right: */
+ static struct fxregs_state fxregs __initdata;
- asm volatile("fxsave %0" : "+m" (fx_tmp));
+ asm volatile("fxsave %0" : "+m" (fxregs));
- mask = fx_tmp.mxcsr_mask;
+ mask = fxregs.mxcsr_mask;
/*
* If zero then use the default features mask,
@@ -135,6 +138,43 @@ static void __init fpu__init_system_generic(void)
unsigned int xstate_size;
EXPORT_SYMBOL_GPL(xstate_size);
+/* Enforce that 'MEMBER' is the last field of 'TYPE': */
+#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
+ BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER))
+
+/*
+ * We append the 'struct fpu' to the task_struct:
+ */
+static void __init fpu__init_task_struct_size(void)
+{
+ int task_size = sizeof(struct task_struct);
+
+ /*
+ * Subtract off the static size of the register state.
+ * It potentially has a bunch of padding.
+ */
+ task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
+
+ /*
+ * Add back the dynamically-calculated register state
+ * size.
+ */
+ task_size += xstate_size;
+
+ /*
+ * We dynamically size 'struct fpu', so we require that
+ * it be at the end of 'thread_struct' and that
+ * 'thread_struct' be at the end of 'task_struct'. If
+ * you hit a compile error here, check the structure to
+ * see if something got added to the end.
+ */
+ CHECK_MEMBER_AT_END_OF(struct fpu, state);
+ CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
+ CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
+
+ arch_task_struct_size = task_size;
+}
+
/*
* Set up the xstate_size based on the legacy FPU context size.
*
@@ -286,6 +326,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
fpu__init_system_generic();
fpu__init_system_xstate_size_legacy();
fpu__init_system_xstate();
+ fpu__init_task_struct_size();
fpu__init_system_ctx_switch();
}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 5a4668136e98..f129a9af6357 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -161,11 +161,12 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
/* Kill off the identity-map trampoline */
reset_early_page_tables();
- kasan_map_early_shadow(early_level4_pgt);
-
- /* clear bss before set_intr_gate with early_idt_handler */
clear_bss();
+ clear_page(init_level4_pgt);
+
+ kasan_early_init();
+
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
set_intr_gate(i, early_idt_handler_array[i]);
load_idt((const struct desc_ptr *)&idt_descr);
@@ -177,12 +178,9 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
*/
load_ucode_bsp();
- clear_page(init_level4_pgt);
/* set init_level4_pgt kernel high mapping*/
init_level4_pgt[511] = early_level4_pgt[511];
- kasan_map_early_shadow(init_level4_pgt);
-
x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index e5c27f729a38..1d40ca8a73f2 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -516,38 +516,9 @@ ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
.quad 0x0000000000000000
-#ifdef CONFIG_KASAN
-#define FILL(VAL, COUNT) \
- .rept (COUNT) ; \
- .quad (VAL) ; \
- .endr
-
-NEXT_PAGE(kasan_zero_pte)
- FILL(kasan_zero_page - __START_KERNEL_map + _KERNPG_TABLE, 512)
-NEXT_PAGE(kasan_zero_pmd)
- FILL(kasan_zero_pte - __START_KERNEL_map + _KERNPG_TABLE, 512)
-NEXT_PAGE(kasan_zero_pud)
- FILL(kasan_zero_pmd - __START_KERNEL_map + _KERNPG_TABLE, 512)
-
-#undef FILL
-#endif
-
-
#include "../../x86/xen/xen-head.S"
__PAGE_ALIGNED_BSS
NEXT_PAGE(empty_zero_page)
.skip PAGE_SIZE
-#ifdef CONFIG_KASAN
-/*
- * This page used as early shadow. We don't use empty_zero_page
- * at early stages, stack instrumentation could write some garbage
- * to this page.
- * Latter we reuse it as zero shadow for large ranges of memory
- * that allowed to access, but not instrumented by kasan
- * (vmalloc/vmemmap ...).
- */
-NEXT_PAGE(kasan_zero_page)
- .skip PAGE_SIZE
-#endif
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 88b366487b0e..c7dfe1be784e 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -347,14 +347,22 @@ int check_irq_vectors_for_cpu_disable(void)
if (!desc)
continue;
+ /*
+ * Protect against concurrent action removal,
+ * affinity changes etc.
+ */
+ raw_spin_lock(&desc->lock);
data = irq_desc_get_irq_data(desc);
cpumask_copy(&affinity_new, data->affinity);
cpumask_clear_cpu(this_cpu, &affinity_new);
/* Do not count inactive or per-cpu irqs. */
- if (!irq_has_action(irq) || irqd_is_per_cpu(data))
+ if (!irq_has_action(irq) || irqd_is_per_cpu(data)) {
+ raw_spin_unlock(&desc->lock);
continue;
+ }
+ raw_spin_unlock(&desc->lock);
/*
* A single irq may be mapped to multiple
* cpu's vector_irq[] (for example IOAPIC cluster
@@ -385,6 +393,9 @@ int check_irq_vectors_for_cpu_disable(void)
* vector. If the vector is marked in the used vectors
* bitmap or an irq is assigned to it, we don't count
* it as available.
+ *
+ * As this is an inaccurate snapshot anyway, we can do
+ * this w/o holding vector_lock.
*/
for (vector = FIRST_EXTERNAL_VECTOR;
vector < first_system_vector; vector++) {
@@ -486,6 +497,11 @@ void fixup_irqs(void)
*/
mdelay(1);
+ /*
+ * We can walk the vector array of this cpu without holding
+ * vector_lock because the cpu is already marked !online, so
+ * nothing else will touch it.
+ */
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
unsigned int irr;
@@ -497,9 +513,9 @@ void fixup_irqs(void)
irq = __this_cpu_read(vector_irq[vector]);
desc = irq_to_desc(irq);
+ raw_spin_lock(&desc->lock);
data = irq_desc_get_irq_data(desc);
chip = irq_data_get_irq_chip(data);
- raw_spin_lock(&desc->lock);
if (chip->irq_retrigger) {
chip->irq_retrigger(data);
__this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index ca05f86481aa..ca83f7ac388b 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -72,15 +72,16 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params,
unsigned long cmdline_len)
{
char *cmdline_ptr = ((char *)params) + cmdline_offset;
- unsigned long cmdline_ptr_phys, len;
+ unsigned long cmdline_ptr_phys, len = 0;
uint32_t cmdline_low_32, cmdline_ext_32;
- memcpy(cmdline_ptr, cmdline, cmdline_len);
if (image->type == KEXEC_TYPE_CRASH) {
- len = sprintf(cmdline_ptr + cmdline_len - 1,
- " elfcorehdr=0x%lx", image->arch.elf_load_addr);
- cmdline_len += len;
+ len = sprintf(cmdline_ptr,
+ "elfcorehdr=0x%lx ", image->arch.elf_load_addr);
}
+ memcpy(cmdline_ptr + len, cmdline, cmdline_len);
+ cmdline_len += len;
+
cmdline_ptr[cmdline_len - 1] = '\0';
pr_debug("Final command line is: %s\n", cmdline_ptr);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 1681504e44a4..47190bd399e7 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -331,7 +331,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
apic_write(APIC_EOI, APIC_EOI_ACK);
}
-void kvm_guest_cpu_init(void)
+static void kvm_guest_cpu_init(void)
{
if (!kvm_para_available())
return;
@@ -688,7 +688,7 @@ static inline void spin_time_accum_blocked(u64 start)
static struct dentry *d_spin_debug;
static struct dentry *d_kvm_debug;
-struct dentry *kvm_init_debugfs(void)
+static struct dentry *kvm_init_debugfs(void)
{
d_kvm_debug = debugfs_create_dir("kvm-guest", NULL);
if (!d_kvm_debug)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 42caaef897c8..49487b488061 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -24,6 +24,7 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/memblock.h>
+#include <linux/sched.h>
#include <asm/x86_init.h>
#include <asm/reboot.h>
@@ -217,8 +218,10 @@ static void kvm_shutdown(void)
void __init kvmclock_init(void)
{
+ struct pvclock_vcpu_time_info *vcpu_time;
unsigned long mem;
- int size;
+ int size, cpu;
+ u8 flags;
size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
@@ -264,7 +267,14 @@ void __init kvmclock_init(void)
pv_info.name = "KVM";
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
- pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+ pvclock_set_flags(~0);
+
+ cpu = get_cpu();
+ vcpu_time = &hv_clock[cpu].pvti;
+ flags = pvclock_read_flags(vcpu_time);
+ if (flags & PVCLOCK_COUNTS_FROM_ZERO)
+ set_sched_clock_stable();
+ put_cpu();
}
int __init kvm_setup_vsyscall_timeinfo(void)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 11546b462fa6..819ab3f9c9c7 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -26,6 +26,7 @@
#include <asm/io_apic.h>
#include <asm/debugreg.h>
#include <asm/kexec-bzimage64.h>
+#include <asm/setup.h>
#ifdef CONFIG_KEXEC_FILE
static struct kexec_file_ops *kexec_file_loaders[] = {
@@ -335,7 +336,7 @@ void arch_crash_save_vmcoreinfo(void)
VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
#endif
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
- (unsigned long)&_text - __START_KERNEL);
+ kaslr_offset());
}
/* arch-dependent functionality related to kexec file-based syscall */
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index c3e985d1751c..d05bd2e2ee91 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -408,15 +408,15 @@ static void default_do_nmi(struct pt_regs *regs)
NOKPROBE_SYMBOL(default_do_nmi);
/*
- * NMIs can hit breakpoints which will cause it to lose its
- * NMI context with the CPU when the breakpoint does an iret.
- */
-#ifdef CONFIG_X86_32
-/*
- * For i386, NMIs use the same stack as the kernel, and we can
- * add a workaround to the iret problem in C (preventing nested
- * NMIs if an NMI takes a trap). Simply have 3 states the NMI
- * can be in:
+ * NMIs can page fault or hit breakpoints which will cause it to lose
+ * its NMI context with the CPU when the breakpoint or page fault does an IRET.
+ *
+ * As a result, NMIs can nest if NMIs get unmasked due an IRET during
+ * NMI processing. On x86_64, the asm glue protects us from nested NMIs
+ * if the outer NMI came from kernel mode, but we can still nest if the
+ * outer NMI came from user mode.
+ *
+ * To handle these nested NMIs, we have three states:
*
* 1) not running
* 2) executing
@@ -430,15 +430,14 @@ NOKPROBE_SYMBOL(default_do_nmi);
* (Note, the latch is binary, thus multiple NMIs triggering,
* when one is running, are ignored. Only one NMI is restarted.)
*
- * If an NMI hits a breakpoint that executes an iret, another
- * NMI can preempt it. We do not want to allow this new NMI
- * to run, but we want to execute it when the first one finishes.
- * We set the state to "latched", and the exit of the first NMI will
- * perform a dec_return, if the result is zero (NOT_RUNNING), then
- * it will simply exit the NMI handler. If not, the dec_return
- * would have set the state to NMI_EXECUTING (what we want it to
- * be when we are running). In this case, we simply jump back
- * to rerun the NMI handler again, and restart the 'latched' NMI.
+ * If an NMI executes an iret, another NMI can preempt it. We do not
+ * want to allow this new NMI to run, but we want to execute it when the
+ * first one finishes. We set the state to "latched", and the exit of
+ * the first NMI will perform a dec_return, if the result is zero
+ * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
+ * dec_return would have set the state to NMI_EXECUTING (what we want it
+ * to be when we are running). In this case, we simply jump back to
+ * rerun the NMI handler again, and restart the 'latched' NMI.
*
* No trap (breakpoint or page fault) should be hit before nmi_restart,
* thus there is no race between the first check of state for NOT_RUNNING
@@ -461,49 +460,36 @@ enum nmi_states {
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
static DEFINE_PER_CPU(unsigned long, nmi_cr2);
-#define nmi_nesting_preprocess(regs) \
- do { \
- if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
- this_cpu_write(nmi_state, NMI_LATCHED); \
- return; \
- } \
- this_cpu_write(nmi_state, NMI_EXECUTING); \
- this_cpu_write(nmi_cr2, read_cr2()); \
- } while (0); \
- nmi_restart:
-
-#define nmi_nesting_postprocess() \
- do { \
- if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
- write_cr2(this_cpu_read(nmi_cr2)); \
- if (this_cpu_dec_return(nmi_state)) \
- goto nmi_restart; \
- } while (0)
-#else /* x86_64 */
+#ifdef CONFIG_X86_64
/*
- * In x86_64 things are a bit more difficult. This has the same problem
- * where an NMI hitting a breakpoint that calls iret will remove the
- * NMI context, allowing a nested NMI to enter. What makes this more
- * difficult is that both NMIs and breakpoints have their own stack.
- * When a new NMI or breakpoint is executed, the stack is set to a fixed
- * point. If an NMI is nested, it will have its stack set at that same
- * fixed address that the first NMI had, and will start corrupting the
- * stack. This is handled in entry_64.S, but the same problem exists with
- * the breakpoint stack.
+ * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without
+ * some care, the inner breakpoint will clobber the outer breakpoint's
+ * stack.
*
- * If a breakpoint is being processed, and the debug stack is being used,
- * if an NMI comes in and also hits a breakpoint, the stack pointer
- * will be set to the same fixed address as the breakpoint that was
- * interrupted, causing that stack to be corrupted. To handle this case,
- * check if the stack that was interrupted is the debug stack, and if
- * so, change the IDT so that new breakpoints will use the current stack
- * and not switch to the fixed address. On return of the NMI, switch back
- * to the original IDT.
+ * If a breakpoint is being processed, and the debug stack is being
+ * used, if an NMI comes in and also hits a breakpoint, the stack
+ * pointer will be set to the same fixed address as the breakpoint that
+ * was interrupted, causing that stack to be corrupted. To handle this
+ * case, check if the stack that was interrupted is the debug stack, and
+ * if so, change the IDT so that new breakpoints will use the current
+ * stack and not switch to the fixed address. On return of the NMI,
+ * switch back to the original IDT.
*/
static DEFINE_PER_CPU(int, update_debug_stack);
+#endif
-static inline void nmi_nesting_preprocess(struct pt_regs *regs)
+dotraplinkage notrace void
+do_nmi(struct pt_regs *regs, long error_code)
{
+ if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
+ this_cpu_write(nmi_state, NMI_LATCHED);
+ return;
+ }
+ this_cpu_write(nmi_state, NMI_EXECUTING);
+ this_cpu_write(nmi_cr2, read_cr2());
+nmi_restart:
+
+#ifdef CONFIG_X86_64
/*
* If we interrupted a breakpoint, it is possible that
* the nmi handler will have breakpoints too. We need to
@@ -514,22 +500,8 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs)
debug_stack_set_zero();
this_cpu_write(update_debug_stack, 1);
}
-}
-
-static inline void nmi_nesting_postprocess(void)
-{
- if (unlikely(this_cpu_read(update_debug_stack))) {
- debug_stack_reset();
- this_cpu_write(update_debug_stack, 0);
- }
-}
#endif
-dotraplinkage notrace void
-do_nmi(struct pt_regs *regs, long error_code)
-{
- nmi_nesting_preprocess(regs);
-
nmi_enter();
inc_irq_stat(__nmi_count);
@@ -539,8 +511,17 @@ do_nmi(struct pt_regs *regs, long error_code)
nmi_exit();
- /* On i386, may loop back to preprocess */
- nmi_nesting_postprocess();
+#ifdef CONFIG_X86_64
+ if (unlikely(this_cpu_read(update_debug_stack))) {
+ debug_stack_reset();
+ this_cpu_write(update_debug_stack, 0);
+ }
+#endif
+
+ if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
+ write_cr2(this_cpu_read(nmi_cr2));
+ if (this_cpu_dec_return(nmi_state))
+ goto nmi_restart;
}
NOKPROBE_SYMBOL(do_nmi);
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 3420c874ddc5..64f90f53bb85 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -1,53 +1,82 @@
/*
* Copyright (c) 2015, Christoph Hellwig.
+ * Copyright (c) 2015, Intel Corporation.
*/
-#include <linux/memblock.h>
#include <linux/platform_device.h>
-#include <linux/slab.h>
+#include <linux/libnvdimm.h>
+#include <linux/module.h>
#include <asm/e820.h>
-#include <asm/page_types.h>
-#include <asm/setup.h>
-static __init void register_pmem_device(struct resource *res)
+static void e820_pmem_release(struct device *dev)
{
- struct platform_device *pdev;
- int error;
+ struct nvdimm_bus *nvdimm_bus = dev->platform_data;
- pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO);
- if (!pdev)
- return;
+ if (nvdimm_bus)
+ nvdimm_bus_unregister(nvdimm_bus);
+}
- error = platform_device_add_resources(pdev, res, 1);
- if (error)
- goto out_put_pdev;
+static struct platform_device e820_pmem = {
+ .name = "e820_pmem",
+ .id = -1,
+ .dev = {
+ .release = e820_pmem_release,
+ },
+};
- error = platform_device_add(pdev);
- if (error)
- goto out_put_pdev;
- return;
+static const struct attribute_group *e820_pmem_attribute_groups[] = {
+ &nvdimm_bus_attribute_group,
+ NULL,
+};
-out_put_pdev:
- dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n");
- platform_device_put(pdev);
-}
+static const struct attribute_group *e820_pmem_region_attribute_groups[] = {
+ &nd_region_attribute_group,
+ &nd_device_attribute_group,
+ NULL,
+};
-static __init int register_pmem_devices(void)
+static __init int register_e820_pmem(void)
{
- int i;
+ static struct nvdimm_bus_descriptor nd_desc;
+ struct device *dev = &e820_pmem.dev;
+ struct nvdimm_bus *nvdimm_bus;
+ int rc, i;
+
+ rc = platform_device_register(&e820_pmem);
+ if (rc)
+ return rc;
+
+ nd_desc.attr_groups = e820_pmem_attribute_groups;
+ nd_desc.provider_name = "e820";
+ nvdimm_bus = nvdimm_bus_register(dev, &nd_desc);
+ if (!nvdimm_bus)
+ goto err;
+ dev->platform_data = nvdimm_bus;
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
+ struct resource res = {
+ .flags = IORESOURCE_MEM,
+ .start = ei->addr,
+ .end = ei->addr + ei->size - 1,
+ };
+ struct nd_region_desc ndr_desc;
+
+ if (ei->type != E820_PRAM)
+ continue;
- if (ei->type == E820_PRAM) {
- struct resource res = {
- .flags = IORESOURCE_MEM,
- .start = ei->addr,
- .end = ei->addr + ei->size - 1,
- };
- register_pmem_device(&res);
- }
+ memset(&ndr_desc, 0, sizeof(ndr_desc));
+ ndr_desc.res = &res;
+ ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
+ ndr_desc.numa_node = NUMA_NO_NODE;
+ if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
+ goto err;
}
return 0;
+
+ err:
+ dev_err(dev, "failed to register legacy persistent memory ranges\n");
+ platform_device_unregister(&e820_pmem);
+ return -ENXIO;
}
-device_initcall(register_pmem_devices);
+device_initcall(register_e820_pmem);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9cad694ed7c4..397688beed4b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
*/
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
- *dst = *src;
+ memcpy(dst, src, arch_task_struct_size);
return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index c09c99ccf3e3..f73c962fe636 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -128,8 +128,8 @@ void release_thread(struct task_struct *dead_task)
release_vm86_irqs(dead_task);
}
-int copy_thread(unsigned long clone_flags, unsigned long sp,
- unsigned long arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+ unsigned long arg, struct task_struct *p, unsigned long tls)
{
struct pt_regs *childregs = task_pt_regs(p);
struct task_struct *tsk;
@@ -184,7 +184,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
*/
if (clone_flags & CLONE_SETTLS)
err = do_set_thread_area(p, -1,
- (struct user_desc __user *)childregs->si, 0);
+ (struct user_desc __user *)tls, 0);
if (err && p->thread.io_bitmap_ptr) {
kfree(p->thread.io_bitmap_ptr);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 843f92e4c711..71d7849a07f7 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -150,8 +150,8 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls)
return get_desc_base(&t->thread.tls_array[tls]);
}
-int copy_thread(unsigned long clone_flags, unsigned long sp,
- unsigned long arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+ unsigned long arg, struct task_struct *p, unsigned long tls)
{
int err;
struct pt_regs *childregs;
@@ -207,10 +207,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
#ifdef CONFIG_IA32_EMULATION
if (is_ia32_task())
err = do_set_thread_area(p, -1,
- (struct user_desc __user *)childregs->si, 0);
+ (struct user_desc __user *)tls, 0);
else
#endif
- err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
+ err = do_arch_prctl(p, ARCH_SET_FS, tls);
if (err)
goto out;
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 265a6fdea8b7..80f874bf999e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -461,19 +461,18 @@ static void __init e820_reserve_setup_data(void)
{
struct setup_data *data;
u64 pa_data;
- int found = 0;
pa_data = boot_params.hdr.setup_data;
+ if (!pa_data)
+ return;
+
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
e820_update_range(pa_data, sizeof(*data)+data->len,
E820_RAM, E820_RESERVED_KERN);
- found = 1;
pa_data = data->next;
early_memunmap(data, sizeof(*data));
}
- if (!found)
- return;
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
memcpy(&e820_saved, &e820, sizeof(struct e820map));
@@ -836,7 +835,7 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
{
if (kaslr_enabled()) {
pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
- (unsigned long)&_text - __START_KERNEL,
+ kaslr_offset(),
__START_KERNEL,
__START_KERNEL_map,
MODULES_VADDR-1);
@@ -1105,6 +1104,9 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
+ if (efi_enabled(EFI_BOOT))
+ efi_find_mirror();
+
/*
* The EFI specification says that boot service code won't be called
* after ExitBootServices(). This is, in fact, a lie.
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8add66b22f33..b1f3ed9c7a9e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -171,11 +171,6 @@ static void smp_callin(void)
apic_ap_setup();
/*
- * Need to setup vector mappings before we enable interrupts.
- */
- setup_vector_irq(smp_processor_id());
-
- /*
* Save our processor parameters. Note: this information
* is needed for clock calibration.
*/
@@ -239,18 +234,13 @@ static void notrace start_secondary(void *unused)
check_tsc_sync_target();
/*
- * Enable the espfix hack for this CPU
- */
-#ifdef CONFIG_X86_ESPFIX64
- init_espfix_ap();
-#endif
-
- /*
- * We need to hold vector_lock so there the set of online cpus
- * does not change while we are assigning vectors to cpus. Holding
- * this lock ensures we don't half assign or remove an irq from a cpu.
+ * Lock vector_lock and initialize the vectors on this cpu
+ * before setting the cpu online. We must set it online with
+ * vector_lock held to prevent a concurrent setup/teardown
+ * from seeing a half valid vector space.
*/
lock_vector_lock();
+ setup_vector_irq(smp_processor_id());
set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
cpu_set_state_online(smp_processor_id());
@@ -854,6 +844,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
initial_code = (unsigned long)start_secondary;
stack_start = idle->thread.sp;
+ /*
+ * Enable the espfix hack for this CPU
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ init_espfix_ap(cpu);
+#endif
+
/* So we see what's up */
announce_cpu(cpu, apicid);
@@ -995,8 +992,17 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
common_cpu_up(cpu, tidle);
+ /*
+ * We have to walk the irq descriptors to setup the vector
+ * space for the cpu which comes online. Prevent irq
+ * alloc/free across the bringup.
+ */
+ irq_lock_sparse();
+
err = do_boot_cpu(apicid, cpu, tidle);
+
if (err) {
+ irq_unlock_sparse();
pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
return -EIO;
}
@@ -1014,6 +1020,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
touch_nmi_watchdog();
}
+ irq_unlock_sparse();
+
return 0;
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 505449700e0c..7437b41f6a47 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -598,10 +598,19 @@ static unsigned long quick_pit_calibrate(void)
if (!pit_expect_msb(0xff-i, &delta, &d2))
break;
+ delta -= tsc;
+
+ /*
+ * Extrapolate the error and fail fast if the error will
+ * never be below 500 ppm.
+ */
+ if (i == 1 &&
+ d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
+ return 0;
+
/*
* Iterate until the error is less than 500 ppm
*/
- delta -= tsc;
if (d1+d2 >= delta >> 11)
continue;
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index ee22c1d93ae5..b034b1b14b9c 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -72,7 +72,7 @@ asmlinkage __visible void vsmp_irq_enable(void)
}
PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
-static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
+static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
{
switch (type) {
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 413a7bf9efbb..d8a1d56276e1 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -86,15 +86,16 @@ config KVM_MMU_AUDIT
auditing of KVM MMU events at runtime.
config KVM_DEVICE_ASSIGNMENT
- bool "KVM legacy PCI device assignment support"
+ bool "KVM legacy PCI device assignment support (DEPRECATED)"
depends on KVM && PCI && IOMMU_API
- default y
+ default n
---help---
Provide support for legacy PCI device assignment through KVM. The
kernel now also supports a full featured userspace device driver
- framework through VFIO, which supersedes much of this support.
+ framework through VFIO, which supersedes this support and provides
+ better security.
- If unsure, say Y.
+ If unsure, say N.
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 16e8f962eaad..67d215cb8953 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,10 +12,10 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
- i8254.o ioapic.o irq_comm.o cpuid.o pmu.o
+ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o
kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o
-kvm-intel-y += vmx.o
-kvm-amd-y += svm.o
+kvm-intel-y += vmx.o pmu_intel.o
+kvm-amd-y += svm.o pmu_amd.o
obj-$(CONFIG_KVM) += kvm.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 9f705e618af5..2fbea2544f24 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -16,12 +16,14 @@
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
+#include <asm/fpu/internal.h> /* For use_eager_fpu. Ugh! */
#include <asm/user.h>
#include <asm/fpu/xstate.h>
#include "cpuid.h"
#include "lapic.h"
#include "mmu.h"
#include "trace.h"
+#include "pmu.h"
static u32 xstate_required_size(u64 xstate_bv, bool compacted)
{
@@ -95,7 +97,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
- vcpu->arch.eager_fpu = guest_cpuid_has_mpx(vcpu);
+ vcpu->arch.eager_fpu = use_eager_fpu() || guest_cpuid_has_mpx(vcpu);
+ if (vcpu->arch.eager_fpu)
+ kvm_x86_ops->fpu_activate(vcpu);
/*
* The existing code assumes virtual address is 48-bit in the canonical
@@ -109,7 +113,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
/* Update physical-address width */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
- kvm_pmu_cpuid_update(vcpu);
+ kvm_pmu_refresh(vcpu);
return 0;
}
@@ -413,6 +417,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
break;
}
+ case 6: /* Thermal management */
+ entry->eax = 0x4; /* allow ARAT */
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
case 7: {
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
/* Mask ebx against host capability word 9 */
@@ -589,7 +599,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
break;
case 3: /* Processor serial number */
case 5: /* MONITOR/MWAIT */
- case 6: /* Thermal management */
case 0xC0000002:
case 0xC0000003:
case 0xC0000004:
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 496b3695d3d3..dd05b9cef6ae 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -70,6 +70,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
}
+static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ return best && (best->edx & bit(X86_FEATURE_LM));
+}
+
static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 630bcb0d7a04..e7a4fde5d631 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -25,6 +25,7 @@
#include <linux/module.h>
#include <asm/kvm_emulate.h>
#include <linux/stringify.h>
+#include <asm/debugreg.h>
#include "x86.h"
#include "tss.h"
@@ -523,13 +524,9 @@ static void masked_increment(ulong *reg, ulong mask, int inc)
static inline void
register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc)
{
- ulong mask;
+ ulong *preg = reg_rmw(ctxt, reg);
- if (ctxt->ad_bytes == sizeof(unsigned long))
- mask = ~0UL;
- else
- mask = ad_mask(ctxt);
- masked_increment(reg_rmw(ctxt, reg), mask, inc);
+ assign_register(preg, *preg + inc, ctxt->ad_bytes);
}
static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
@@ -2262,6 +2259,260 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
return rc;
}
+static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ebx, ecx, edx;
+
+ eax = 0x80000001;
+ ecx = 0;
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ return edx & bit(X86_FEATURE_LM);
+}
+
+#define GET_SMSTATE(type, smbase, offset) \
+ ({ \
+ type __val; \
+ int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \
+ sizeof(__val), NULL); \
+ if (r != X86EMUL_CONTINUE) \
+ return X86EMUL_UNHANDLEABLE; \
+ __val; \
+ })
+
+static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
+{
+ desc->g = (flags >> 23) & 1;
+ desc->d = (flags >> 22) & 1;
+ desc->l = (flags >> 21) & 1;
+ desc->avl = (flags >> 20) & 1;
+ desc->p = (flags >> 15) & 1;
+ desc->dpl = (flags >> 13) & 3;
+ desc->s = (flags >> 12) & 1;
+ desc->type = (flags >> 8) & 15;
+}
+
+static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+{
+ struct desc_struct desc;
+ int offset;
+ u16 selector;
+
+ selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4);
+
+ if (n < 3)
+ offset = 0x7f84 + n * 12;
+ else
+ offset = 0x7f2c + (n - 3) * 12;
+
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8));
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4));
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset));
+ ctxt->ops->set_segment(ctxt, selector, &desc, 0, n);
+ return X86EMUL_CONTINUE;
+}
+
+static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+{
+ struct desc_struct desc;
+ int offset;
+ u16 selector;
+ u32 base3;
+
+ offset = 0x7e00 + n * 16;
+
+ selector = GET_SMSTATE(u16, smbase, offset);
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8);
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4));
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8));
+ base3 = GET_SMSTATE(u32, smbase, offset + 12);
+
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
+ return X86EMUL_CONTINUE;
+}
+
+static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
+ u64 cr0, u64 cr4)
+{
+ int bad;
+
+ /*
+ * First enable PAE, long mode needs it before CR0.PG = 1 is set.
+ * Then enable protected mode. However, PCID cannot be enabled
+ * if EFER.LMA=0, so set it separately.
+ */
+ bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ bad = ctxt->ops->set_cr(ctxt, 0, cr0);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (cr4 & X86_CR4_PCIDE) {
+ bad = ctxt->ops->set_cr(ctxt, 4, cr4);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ struct desc_struct desc;
+ struct desc_ptr dt;
+ u16 selector;
+ u32 val, cr0, cr4;
+ int i;
+
+ cr0 = GET_SMSTATE(u32, smbase, 0x7ffc);
+ ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8));
+ ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;
+ ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0);
+
+ for (i = 0; i < 8; i++)
+ *reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4);
+
+ val = GET_SMSTATE(u32, smbase, 0x7fcc);
+ ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+ val = GET_SMSTATE(u32, smbase, 0x7fc8);
+ ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+ selector = GET_SMSTATE(u32, smbase, 0x7fc4);
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f64));
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f60));
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f5c));
+ ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR);
+
+ selector = GET_SMSTATE(u32, smbase, 0x7fc0);
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f80));
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f7c));
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f78));
+ ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR);
+
+ dt.address = GET_SMSTATE(u32, smbase, 0x7f74);
+ dt.size = GET_SMSTATE(u32, smbase, 0x7f70);
+ ctxt->ops->set_gdt(ctxt, &dt);
+
+ dt.address = GET_SMSTATE(u32, smbase, 0x7f58);
+ dt.size = GET_SMSTATE(u32, smbase, 0x7f54);
+ ctxt->ops->set_idt(ctxt, &dt);
+
+ for (i = 0; i < 6; i++) {
+ int r = rsm_load_seg_32(ctxt, smbase, i);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+ }
+
+ cr4 = GET_SMSTATE(u32, smbase, 0x7f14);
+
+ ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8));
+
+ return rsm_enter_protected_mode(ctxt, cr0, cr4);
+}
+
+static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ struct desc_struct desc;
+ struct desc_ptr dt;
+ u64 val, cr0, cr4;
+ u32 base3;
+ u16 selector;
+ int i;
+
+ for (i = 0; i < 16; i++)
+ *reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8);
+
+ ctxt->_eip = GET_SMSTATE(u64, smbase, 0x7f78);
+ ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED;
+
+ val = GET_SMSTATE(u32, smbase, 0x7f68);
+ ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+ val = GET_SMSTATE(u32, smbase, 0x7f60);
+ ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+ cr0 = GET_SMSTATE(u64, smbase, 0x7f58);
+ ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50));
+ cr4 = GET_SMSTATE(u64, smbase, 0x7f48);
+ ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));
+ val = GET_SMSTATE(u64, smbase, 0x7ed0);
+ ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA);
+
+ selector = GET_SMSTATE(u32, smbase, 0x7e90);
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e92) << 8);
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e94));
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e98));
+ base3 = GET_SMSTATE(u32, smbase, 0x7e9c);
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR);
+
+ dt.size = GET_SMSTATE(u32, smbase, 0x7e84);
+ dt.address = GET_SMSTATE(u64, smbase, 0x7e88);
+ ctxt->ops->set_idt(ctxt, &dt);
+
+ selector = GET_SMSTATE(u32, smbase, 0x7e70);
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e72) << 8);
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e74));
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e78));
+ base3 = GET_SMSTATE(u32, smbase, 0x7e7c);
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR);
+
+ dt.size = GET_SMSTATE(u32, smbase, 0x7e64);
+ dt.address = GET_SMSTATE(u64, smbase, 0x7e68);
+ ctxt->ops->set_gdt(ctxt, &dt);
+
+ for (i = 0; i < 6; i++) {
+ int r = rsm_load_seg_64(ctxt, smbase, i);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+ }
+
+ return rsm_enter_protected_mode(ctxt, cr0, cr4);
+}
+
+static int em_rsm(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long cr0, cr4, efer;
+ u64 smbase;
+ int ret;
+
+ if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0)
+ return emulate_ud(ctxt);
+
+ /*
+ * Get back to real mode, to prepare a safe state in which to load
+ * CR0/CR3/CR4/EFER. Also this will ensure that addresses passed
+ * to read_std/write_std are not virtual.
+ *
+ * CR4.PCIDE must be zero, because it is a 64-bit mode only feature.
+ */
+ cr0 = ctxt->ops->get_cr(ctxt, 0);
+ if (cr0 & X86_CR0_PE)
+ ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if (cr4 & X86_CR4_PAE)
+ ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
+ efer = 0;
+ ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
+
+ smbase = ctxt->ops->get_smbase(ctxt);
+ if (emulator_has_longmode(ctxt))
+ ret = rsm_load_state_64(ctxt, smbase + 0x8000);
+ else
+ ret = rsm_load_state_32(ctxt, smbase + 0x8000);
+
+ if (ret != X86EMUL_CONTINUE) {
+ /* FIXME: should triple fault */
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
+ ctxt->ops->set_nmi_mask(ctxt, false);
+
+ ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK;
+ ctxt->emul_flags &= ~X86EMUL_SMM_MASK;
+ return X86EMUL_CONTINUE;
+}
+
static void
setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
struct desc_struct *cs, struct desc_struct *ss)
@@ -2573,6 +2824,30 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
return true;
}
+static void string_registers_quirk(struct x86_emulate_ctxt *ctxt)
+{
+ /*
+ * Intel CPUs mask the counter and pointers in quite strange
+ * manner when ECX is zero due to REP-string optimizations.
+ */
+#ifdef CONFIG_X86_64
+ if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt))
+ return;
+
+ *reg_write(ctxt, VCPU_REGS_RCX) = 0;
+
+ switch (ctxt->b) {
+ case 0xa4: /* movsb */
+ case 0xa5: /* movsd/w */
+ *reg_rmw(ctxt, VCPU_REGS_RSI) &= (u32)-1;
+ /* fall through */
+ case 0xaa: /* stosb */
+ case 0xab: /* stosd/w */
+ *reg_rmw(ctxt, VCPU_REGS_RDI) &= (u32)-1;
+ }
+#endif
+}
+
static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
struct tss_segment_16 *tss)
{
@@ -2849,7 +3124,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
ulong old_tss_base =
ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
u32 desc_limit;
- ulong desc_addr;
+ ulong desc_addr, dr7;
/* FIXME: old_tss_base == ~0 ? */
@@ -2934,6 +3209,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
ret = em_push(ctxt);
}
+ ops->get_dr(ctxt, 7, &dr7);
+ ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN));
+
return ret;
}
@@ -3840,7 +4118,7 @@ static const struct opcode group5[] = {
F(DstMem | SrcNone | Lock, em_inc),
F(DstMem | SrcNone | Lock, em_dec),
I(SrcMem | NearBranch, em_call_near_abs),
- I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
+ I(SrcMemFAddr | ImplicitOps, em_call_far),
I(SrcMem | NearBranch, em_jmp_abs),
I(SrcMemFAddr | ImplicitOps, em_jmp_far),
I(SrcMem | Stack, em_push), D(Undefined),
@@ -4173,7 +4451,7 @@ static const struct opcode twobyte_table[256] = {
F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
/* 0xA8 - 0xAF */
I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
- DI(ImplicitOps, rsm),
+ II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm),
F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
@@ -4871,7 +5149,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
fetch_possible_mmx_operand(ctxt, &ctxt->dst);
}
- if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+ if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_PRE_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -4900,7 +5178,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
- if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+ if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -4910,6 +5188,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
if (ctxt->rep_prefix && (ctxt->d & String)) {
/* All REP prefixes have the same first termination condition */
if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
+ string_registers_quirk(ctxt);
ctxt->eip = ctxt->_eip;
ctxt->eflags &= ~X86_EFLAGS_RF;
goto done;
@@ -4953,7 +5232,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
special_insn:
- if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+ if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_MEMACCESS);
if (rc != X86EMUL_CONTINUE)
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 4dce6f8b6129..f90952f64e79 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -305,7 +305,7 @@ static void pit_do_work(struct kthread_work *work)
* LVT0 to NMI delivery. Other PIC interrupts are just sent to
* VCPU0, and only if its LVT0 is in EXTINT mode.
*/
- if (kvm->arch.vapics_in_nmi_mode > 0)
+ if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_apic_nmi_wd_deliver(vcpu);
}
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 28146f03c514..856f79105bb5 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -349,6 +349,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
irqe.delivery_mode = entry->fields.delivery_mode << 8;
irqe.level = 1;
irqe.shorthand = 0;
+ irqe.msi_redir_hint = false;
if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
ioapic->irr_delivered |= 1 << irq;
@@ -637,11 +638,9 @@ void kvm_ioapic_destroy(struct kvm *kvm)
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
cancel_delayed_work_sync(&ioapic->eoi_inject);
- if (ioapic) {
- kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
- kvm->arch.vioapic = NULL;
- kfree(ioapic);
- }
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
}
int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 7dbced309ddb..5c520ebf6343 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -200,6 +200,7 @@ int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev)
goto out_unmap;
}
+ kvm_arch_start_assignment(kvm);
pci_set_dev_assigned(pdev);
dev_info(&pdev->dev, "kvm assign device\n");
@@ -224,6 +225,7 @@ int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev)
iommu_detach_device(domain, &pdev->dev);
pci_clear_dev_assigned(pdev);
+ kvm_arch_end_assignment(kvm);
dev_info(&pdev->dev, "kvm deassign device\n");
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 72298b3ac025..9efff9e5b58c 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -31,6 +31,8 @@
#include "ioapic.h"
+#include "lapic.h"
+
static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
bool line_status)
@@ -48,11 +50,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
line_status);
}
-inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
-{
- return irq->delivery_mode == APIC_DM_LOWEST;
-}
-
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, unsigned long *dest_map)
{
@@ -60,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_vcpu *vcpu, *lowest = NULL;
if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
- kvm_is_dm_lowest_prio(irq)) {
+ kvm_lowest_prio_delivery(irq)) {
printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
irq->delivery_mode = APIC_DM_FIXED;
}
@@ -76,7 +73,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
irq->dest_id, irq->dest_mode))
continue;
- if (!kvm_is_dm_lowest_prio(irq)) {
+ if (!kvm_lowest_prio_delivery(irq)) {
if (r < 0)
r = 0;
r += kvm_apic_set_irq(vcpu, irq, dest_map);
@@ -106,9 +103,10 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
irq->delivery_mode = e->msi.data & 0x700;
+ irq->msi_redir_hint = ((e->msi.address_lo
+ & MSI_ADDR_REDIRECTION_LOWPRI) > 0);
irq->level = 1;
irq->shorthand = 0;
- /* TODO Deal with RH bit of MSI message address */
}
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 544076c4f44b..e1e89ee4af75 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -99,4 +99,9 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
return vcpu->arch.hflags & HF_GUEST_MASK;
}
+static inline bool is_smm(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_SMM_MASK;
+}
+
#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4c7deb4f78a1..2a5ca97c263b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -240,6 +240,15 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
recalculate_apic_map(apic->vcpu->kvm);
}
+static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id)
+{
+ u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
+
+ apic_set_reg(apic, APIC_ID, id << 24);
+ apic_set_reg(apic, APIC_LDR, ldr);
+ recalculate_apic_map(apic->vcpu->kvm);
+}
+
static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
{
return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
@@ -728,7 +737,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
dst = map->logical_map[cid];
- if (irq->delivery_mode == APIC_DM_LOWEST) {
+ if (kvm_lowest_prio_delivery(irq)) {
int l = -1;
for_each_set_bit(i, &bitmap, 16) {
if (!dst[i])
@@ -799,7 +808,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break;
case APIC_DM_SMI:
- apic_debug("Ignoring guest SMI\n");
+ result = 1;
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+ kvm_vcpu_kick(vcpu);
break;
case APIC_DM_NMI:
@@ -914,9 +925,10 @@ static void apic_send_ipi(struct kvm_lapic *apic)
irq.vector = icr_low & APIC_VECTOR_MASK;
irq.delivery_mode = icr_low & APIC_MODE_MASK;
irq.dest_mode = icr_low & APIC_DEST_MASK;
- irq.level = icr_low & APIC_INT_ASSERT;
+ irq.level = (icr_low & APIC_INT_ASSERT) != 0;
irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
irq.shorthand = icr_low & APIC_SHORT_MASK;
+ irq.msi_redir_hint = false;
if (apic_x2apic_mode(apic))
irq.dest_id = icr_high;
else
@@ -926,10 +938,11 @@ static void apic_send_ipi(struct kvm_lapic *apic)
apic_debug("icr_high 0x%x, icr_low 0x%x, "
"short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
- "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
+ "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x, "
+ "msi_redir_hint 0x%x\n",
icr_high, icr_low, irq.shorthand, irq.dest_id,
irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
- irq.vector);
+ irq.vector, irq.msi_redir_hint);
kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
}
@@ -1244,16 +1257,17 @@ static void start_apic_timer(struct kvm_lapic *apic)
static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
{
- int nmi_wd_enabled = apic_lvt_nmi_mode(kvm_apic_get_reg(apic, APIC_LVT0));
+ bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val);
- if (apic_lvt_nmi_mode(lvt0_val)) {
- if (!nmi_wd_enabled) {
+ if (apic->lvt0_in_nmi_mode != lvt0_in_nmi_mode) {
+ apic->lvt0_in_nmi_mode = lvt0_in_nmi_mode;
+ if (lvt0_in_nmi_mode) {
apic_debug("Receive NMI setting on APIC_LVT0 "
"for cpu %d\n", apic->vcpu->vcpu_id);
- apic->vcpu->kvm->arch.vapics_in_nmi_mode++;
- }
- } else if (nmi_wd_enabled)
- apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
+ atomic_inc(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
+ } else
+ atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
+ }
}
static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
@@ -1541,9 +1555,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
if ((old_value ^ value) & X2APIC_ENABLE) {
if (value & X2APIC_ENABLE) {
- u32 id = kvm_apic_id(apic);
- u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
- kvm_apic_set_ldr(apic, ldr);
+ kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
} else
kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
@@ -1562,7 +1574,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
}
-void kvm_lapic_reset(struct kvm_vcpu *vcpu)
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct kvm_lapic *apic;
int i;
@@ -1576,19 +1588,23 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);
- kvm_apic_set_id(apic, vcpu->vcpu_id);
+ if (!init_event)
+ kvm_apic_set_id(apic, vcpu->vcpu_id);
kvm_apic_set_version(apic->vcpu);
for (i = 0; i < APIC_LVT_NUM; i++)
apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
apic_update_lvtt(apic);
- apic_set_reg(apic, APIC_LVT0,
- SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
+ apic_set_reg(apic, APIC_LVT0,
+ SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
+ apic_manage_nmi_watchdog(apic, kvm_apic_get_reg(apic, APIC_LVT0));
apic_set_reg(apic, APIC_DFR, 0xffffffffU);
apic_set_spiv(apic, 0xff);
apic_set_reg(apic, APIC_TASKPRI, 0);
- kvm_apic_set_ldr(apic, 0);
+ if (!apic_x2apic_mode(apic))
+ kvm_apic_set_ldr(apic, 0);
apic_set_reg(apic, APIC_ESR, 0);
apic_set_reg(apic, APIC_ICR, 0);
apic_set_reg(apic, APIC_ICR2, 0);
@@ -1717,7 +1733,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
- kvm_lapic_reset(vcpu);
+ kvm_lapic_reset(vcpu, false);
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
return 0;
@@ -1808,6 +1824,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
apic_update_ppr(apic);
hrtimer_cancel(&apic->lapic_timer.timer);
apic_update_lvtt(apic);
+ apic_manage_nmi_watchdog(apic, kvm_apic_get_reg(apic, APIC_LVT0));
update_divide_count(apic);
start_apic_timer(apic);
apic->irr_pending = true;
@@ -2049,11 +2066,22 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events)
return;
- pe = xchg(&apic->pending_events, 0);
+ /*
+ * INITs are latched while in SMM. Because an SMM CPU cannot
+ * be in KVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs
+ * and delay processing of INIT until the next RSM.
+ */
+ if (is_smm(vcpu)) {
+ WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
+ if (test_bit(KVM_APIC_SIPI, &apic->pending_events))
+ clear_bit(KVM_APIC_SIPI, &apic->pending_events);
+ return;
+ }
+ pe = xchg(&apic->pending_events, 0);
if (test_bit(KVM_APIC_INIT, &pe)) {
- kvm_lapic_reset(vcpu);
- kvm_vcpu_reset(vcpu);
+ kvm_lapic_reset(vcpu, true);
+ kvm_vcpu_reset(vcpu, true);
if (kvm_vcpu_is_bsp(apic->vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 9d28383fc1e7..71952748222a 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -26,6 +26,7 @@ struct kvm_lapic {
struct kvm_vcpu *vcpu;
bool sw_enabled;
bool irr_pending;
+ bool lvt0_in_nmi_mode;
/* Number of bits set in ISR. */
s16 isr_count;
/* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
@@ -48,7 +49,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
-void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
@@ -150,7 +151,18 @@ static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.apic->pending_events;
+ return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events;
+}
+
+static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
+{
+ return (irq->delivery_mode == APIC_DM_LOWEST ||
+ irq->msi_redir_hint);
+}
+
+static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b73337634214..44171462bd2a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -223,15 +223,15 @@ static unsigned int get_mmio_spte_generation(u64 spte)
return gen;
}
-static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
{
- return kvm_memslots(kvm)->generation & MMIO_GEN_MASK;
+ return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
}
-static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
+static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
unsigned access)
{
- unsigned int gen = kvm_current_mmio_generation(kvm);
+ unsigned int gen = kvm_current_mmio_generation(vcpu);
u64 mask = generation_mmio_spte_mask(gen);
access &= ACC_WRITE_MASK | ACC_USER_MASK;
@@ -258,22 +258,22 @@ static unsigned get_mmio_spte_access(u64 spte)
return (spte & ~mask) & ~PAGE_MASK;
}
-static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
pfn_t pfn, unsigned access)
{
if (unlikely(is_noslot_pfn(pfn))) {
- mark_mmio_spte(kvm, sptep, gfn, access);
+ mark_mmio_spte(vcpu, sptep, gfn, access);
return true;
}
return false;
}
-static bool check_mmio_spte(struct kvm *kvm, u64 spte)
+static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
unsigned int kvm_gen, spte_gen;
- kvm_gen = kvm_current_mmio_generation(kvm);
+ kvm_gen = kvm_current_mmio_generation(vcpu);
spte_gen = get_mmio_spte_generation(spte);
trace_check_mmio_spte(spte, kvm_gen, spte_gen);
@@ -804,30 +804,36 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
return &slot->arch.lpage_info[level - 2][idx];
}
-static void account_shadowed(struct kvm *kvm, gfn_t gfn)
+static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
+ struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
struct kvm_lpage_info *linfo;
+ gfn_t gfn;
int i;
- slot = gfn_to_memslot(kvm, gfn);
- for (i = PT_DIRECTORY_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ gfn = sp->gfn;
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+ for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
linfo = lpage_info_slot(gfn, slot, i);
linfo->write_count += 1;
}
kvm->arch.indirect_shadow_pages++;
}
-static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
+static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
+ struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
struct kvm_lpage_info *linfo;
+ gfn_t gfn;
int i;
- slot = gfn_to_memslot(kvm, gfn);
- for (i = PT_DIRECTORY_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ gfn = sp->gfn;
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+ for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
linfo = lpage_info_slot(gfn, slot, i);
linfo->write_count -= 1;
WARN_ON(linfo->write_count < 0);
@@ -835,14 +841,14 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
kvm->arch.indirect_shadow_pages--;
}
-static int has_wrprotected_page(struct kvm *kvm,
+static int has_wrprotected_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
int level)
{
struct kvm_memory_slot *slot;
struct kvm_lpage_info *linfo;
- slot = gfn_to_memslot(kvm, gfn);
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
if (slot) {
linfo = lpage_info_slot(gfn, slot, level);
return linfo->write_count;
@@ -858,8 +864,7 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
page_size = kvm_host_page_size(kvm, gfn);
- for (i = PT_PAGE_TABLE_LEVEL;
- i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
+ for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
if (page_size >= KVM_HPAGE_SIZE(i))
ret = i;
else
@@ -875,7 +880,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
{
struct kvm_memory_slot *slot;
- slot = gfn_to_memslot(vcpu->kvm, gfn);
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
(no_dirty_log && slot->dirty_bitmap))
slot = NULL;
@@ -900,7 +905,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
- if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
+ if (has_wrprotected_page(vcpu, large_gfn, level))
break;
return level - 1;
@@ -1042,12 +1047,14 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
/*
* Take gfn and return the reverse mapping to it.
*/
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp)
{
+ struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
- slot = gfn_to_memslot(kvm, gfn);
- return __gfn_to_rmap(gfn, level, slot);
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+ return __gfn_to_rmap(gfn, sp->role.level, slot);
}
static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -1065,7 +1072,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
sp = page_header(__pa(spte));
kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
return pte_list_add(vcpu, spte, rmapp);
}
@@ -1077,7 +1084,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
sp = page_header(__pa(spte));
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
- rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
+ rmapp = gfn_to_rmap(kvm, gfn, sp);
pte_list_remove(spte, rmapp);
}
@@ -1142,6 +1149,11 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
return NULL;
}
+#define for_each_rmap_spte(_rmap_, _iter_, _spte_) \
+ for (_spte_ = rmap_get_first(*_rmap_, _iter_); \
+ _spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;}); \
+ _spte_ = rmap_get_next(_iter_))
+
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
if (mmu_spte_clear_track_bits(sptep))
@@ -1205,12 +1217,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
struct rmap_iterator iter;
bool flush = false;
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
- BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+ for_each_rmap_spte(rmapp, &iter, sptep)
flush |= spte_write_protect(kvm, sptep, pt_protect);
- sptep = rmap_get_next(&iter);
- }
return flush;
}
@@ -1232,12 +1240,8 @@ static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
struct rmap_iterator iter;
bool flush = false;
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
- BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+ for_each_rmap_spte(rmapp, &iter, sptep)
flush |= spte_clear_dirty(kvm, sptep);
- sptep = rmap_get_next(&iter);
- }
return flush;
}
@@ -1259,12 +1263,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp)
struct rmap_iterator iter;
bool flush = false;
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
- BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+ for_each_rmap_spte(rmapp, &iter, sptep)
flush |= spte_set_dirty(kvm, sptep);
- sptep = rmap_get_next(&iter);
- }
return flush;
}
@@ -1342,42 +1342,45 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
-static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
+static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
{
struct kvm_memory_slot *slot;
unsigned long *rmapp;
int i;
bool write_protected = false;
- slot = gfn_to_memslot(kvm, gfn);
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- for (i = PT_PAGE_TABLE_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
rmapp = __gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(kvm, rmapp, true);
+ write_protected |= __rmap_write_protect(vcpu->kvm, rmapp, true);
}
return write_protected;
}
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- unsigned long data)
+static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp)
{
u64 *sptep;
struct rmap_iterator iter;
- int need_tlb_flush = 0;
+ bool flush = false;
while ((sptep = rmap_get_first(*rmapp, &iter))) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
- rmap_printk("kvm_rmap_unmap_hva: spte %p %llx gfn %llx (%d)\n",
- sptep, *sptep, gfn, level);
+ rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
drop_spte(kvm, sptep);
- need_tlb_flush = 1;
+ flush = true;
}
- return need_tlb_flush;
+ return flush;
+}
+
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
+{
+ return kvm_zap_rmapp(kvm, rmapp);
}
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
@@ -1394,8 +1397,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
WARN_ON(pte_huge(*ptep));
new_pfn = pte_pfn(*ptep);
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
- BUG_ON(!is_shadow_present_pte(*sptep));
+restart:
+ for_each_rmap_spte(rmapp, &iter, sptep) {
rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
sptep, *sptep, gfn, level);
@@ -1403,7 +1406,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
if (pte_write(*ptep)) {
drop_spte(kvm, sptep);
- sptep = rmap_get_first(*rmapp, &iter);
+ goto restart;
} else {
new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
new_spte |= (u64)new_pfn << PAGE_SHIFT;
@@ -1414,7 +1417,6 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
mmu_spte_clear_track_bits(sptep);
mmu_spte_set(sptep, new_spte);
- sptep = rmap_get_next(&iter);
}
}
@@ -1424,6 +1426,74 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
return 0;
}
+struct slot_rmap_walk_iterator {
+ /* input fields. */
+ struct kvm_memory_slot *slot;
+ gfn_t start_gfn;
+ gfn_t end_gfn;
+ int start_level;
+ int end_level;
+
+ /* output fields. */
+ gfn_t gfn;
+ unsigned long *rmap;
+ int level;
+
+ /* private field. */
+ unsigned long *end_rmap;
+};
+
+static void
+rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
+{
+ iterator->level = level;
+ iterator->gfn = iterator->start_gfn;
+ iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
+ iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
+ iterator->slot);
+}
+
+static void
+slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
+ struct kvm_memory_slot *slot, int start_level,
+ int end_level, gfn_t start_gfn, gfn_t end_gfn)
+{
+ iterator->slot = slot;
+ iterator->start_level = start_level;
+ iterator->end_level = end_level;
+ iterator->start_gfn = start_gfn;
+ iterator->end_gfn = end_gfn;
+
+ rmap_walk_init_level(iterator, iterator->start_level);
+}
+
+static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
+{
+ return !!iterator->rmap;
+}
+
+static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
+{
+ if (++iterator->rmap <= iterator->end_rmap) {
+ iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
+ return;
+ }
+
+ if (++iterator->level > iterator->end_level) {
+ iterator->rmap = NULL;
+ return;
+ }
+
+ rmap_walk_init_level(iterator, iterator->level);
+}
+
+#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
+ _start_gfn, _end_gfn, _iter_) \
+ for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
+ _end_level_, _start_gfn, _end_gfn); \
+ slot_rmap_walk_okay(_iter_); \
+ slot_rmap_walk_next(_iter_))
+
static int kvm_handle_hva_range(struct kvm *kvm,
unsigned long start,
unsigned long end,
@@ -1435,48 +1505,36 @@ static int kvm_handle_hva_range(struct kvm *kvm,
int level,
unsigned long data))
{
- int j;
- int ret = 0;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
+ struct slot_rmap_walk_iterator iterator;
+ int ret = 0;
+ int i;
- slots = kvm_memslots(kvm);
-
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn_start, gfn_end;
-
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn_start, gfn_start+1, ..., gfn_end-1}.
- */
- gfn_start = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
- for (j = PT_PAGE_TABLE_LEVEL;
- j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
- unsigned long idx, idx_end;
- unsigned long *rmapp;
- gfn_t gfn = gfn_start;
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(memslot, slots) {
+ unsigned long hva_start, hva_end;
+ gfn_t gfn_start, gfn_end;
+ hva_start = max(start, memslot->userspace_addr);
+ hva_end = min(end, memslot->userspace_addr +
+ (memslot->npages << PAGE_SHIFT));
+ if (hva_start >= hva_end)
+ continue;
/*
- * {idx(page_j) | page_j intersects with
- * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
+ * {gfn(page) | page intersects with [hva_start, hva_end)} =
+ * {gfn_start, gfn_start+1, ..., gfn_end-1}.
*/
- idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
- idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
-
- rmapp = __gfn_to_rmap(gfn_start, j, memslot);
-
- for (; idx <= idx_end;
- ++idx, gfn += (1UL << KVM_HPAGE_GFN_SHIFT(j)))
- ret |= handler(kvm, rmapp++, memslot,
- gfn, j, data);
+ gfn_start = hva_to_gfn_memslot(hva_start, memslot);
+ gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
+
+ for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
+ PT_MAX_HUGEPAGE_LEVEL,
+ gfn_start, gfn_end - 1,
+ &iterator)
+ ret |= handler(kvm, iterator.rmap, memslot,
+ iterator.gfn, iterator.level, data);
}
}
@@ -1518,16 +1576,13 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
BUG_ON(!shadow_accessed_mask);
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;
- sptep = rmap_get_next(&iter)) {
- BUG_ON(!is_shadow_present_pte(*sptep));
-
+ for_each_rmap_spte(rmapp, &iter, sptep)
if (*sptep & shadow_accessed_mask) {
young = 1;
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsigned long *)sptep);
}
- }
+
trace_kvm_age_page(gfn, level, slot, young);
return young;
}
@@ -1548,15 +1603,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
if (!shadow_accessed_mask)
goto out;
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;
- sptep = rmap_get_next(&iter)) {
- BUG_ON(!is_shadow_present_pte(*sptep));
-
+ for_each_rmap_spte(rmapp, &iter, sptep)
if (*sptep & shadow_accessed_mask) {
young = 1;
break;
}
- }
out:
return young;
}
@@ -1570,7 +1621,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
sp = page_header(__pa(spte));
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
@@ -1990,7 +2041,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
bool protected = false;
for_each_sp(pages, sp, parents, i)
- protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
+ protected |= rmap_write_protect(vcpu, sp->gfn);
if (protected)
kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2088,12 +2139,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
hlist_add_head(&sp->hash_link,
&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
if (!direct) {
- if (rmap_write_protect(vcpu->kvm, gfn))
+ if (rmap_write_protect(vcpu, gfn))
kvm_flush_remote_tlbs(vcpu->kvm);
if (level > PT_PAGE_TABLE_LEVEL && need_sync)
kvm_sync_pages(vcpu, gfn);
- account_shadowed(vcpu->kvm, gfn);
+ account_shadowed(vcpu->kvm, sp);
}
sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
init_shadow_page_table(sp);
@@ -2274,7 +2325,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
kvm_mmu_unlink_parents(kvm, sp);
if (!sp->role.invalid && !sp->role.direct)
- unaccount_shadowed(kvm, sp->gfn);
+ unaccount_shadowed(kvm, sp);
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
@@ -2386,111 +2437,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
-/*
- * The function is based on mtrr_type_lookup() in
- * arch/x86/kernel/cpu/mtrr/generic.c
- */
-static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
- u64 start, u64 end)
-{
- int i;
- u64 base, mask;
- u8 prev_match, curr_match;
- int num_var_ranges = KVM_NR_VAR_MTRR;
-
- if (!mtrr_state->enabled)
- return 0xFF;
-
- /* Make end inclusive end, instead of exclusive */
- end--;
-
- /* Look in fixed ranges. Just return the type as per start */
- if (mtrr_state->have_fixed && (start < 0x100000)) {
- int idx;
-
- if (start < 0x80000) {
- idx = 0;
- idx += (start >> 16);
- return mtrr_state->fixed_ranges[idx];
- } else if (start < 0xC0000) {
- idx = 1 * 8;
- idx += ((start - 0x80000) >> 14);
- return mtrr_state->fixed_ranges[idx];
- } else if (start < 0x1000000) {
- idx = 3 * 8;
- idx += ((start - 0xC0000) >> 12);
- return mtrr_state->fixed_ranges[idx];
- }
- }
-
- /*
- * Look in variable ranges
- * Look of multiple ranges matching this address and pick type
- * as per MTRR precedence
- */
- if (!(mtrr_state->enabled & 2))
- return mtrr_state->def_type;
-
- prev_match = 0xFF;
- for (i = 0; i < num_var_ranges; ++i) {
- unsigned short start_state, end_state;
-
- if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
- continue;
-
- base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
- (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
- mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
- (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
-
- start_state = ((start & mask) == (base & mask));
- end_state = ((end & mask) == (base & mask));
- if (start_state != end_state)
- return 0xFE;
-
- if ((start & mask) != (base & mask))
- continue;
-
- curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
- if (prev_match == 0xFF) {
- prev_match = curr_match;
- continue;
- }
-
- if (prev_match == MTRR_TYPE_UNCACHABLE ||
- curr_match == MTRR_TYPE_UNCACHABLE)
- return MTRR_TYPE_UNCACHABLE;
-
- if ((prev_match == MTRR_TYPE_WRBACK &&
- curr_match == MTRR_TYPE_WRTHROUGH) ||
- (prev_match == MTRR_TYPE_WRTHROUGH &&
- curr_match == MTRR_TYPE_WRBACK)) {
- prev_match = MTRR_TYPE_WRTHROUGH;
- curr_match = MTRR_TYPE_WRTHROUGH;
- }
-
- if (prev_match != curr_match)
- return MTRR_TYPE_UNCACHABLE;
- }
-
- if (prev_match != 0xFF)
- return prev_match;
-
- return mtrr_state->def_type;
-}
-
-u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- u8 mtrr;
-
- mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
- (gfn << PAGE_SHIFT) + PAGE_SIZE);
- if (mtrr == 0xfe || mtrr == 0xff)
- mtrr = MTRR_TYPE_WRBACK;
- return mtrr;
-}
-EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
-
static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
trace_kvm_mmu_unsync_page(sp);
@@ -2533,6 +2479,14 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
return 0;
}
+static bool kvm_is_mmio_pfn(pfn_t pfn)
+{
+ if (pfn_valid(pfn))
+ return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
+
+ return true;
+}
+
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
@@ -2541,7 +2495,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
u64 spte;
int ret = 0;
- if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access))
+ if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
return 0;
spte = PT_PRESENT_MASK;
@@ -2560,7 +2514,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
- kvm_is_reserved_pfn(pfn));
+ kvm_is_mmio_pfn(pfn));
if (host_writable)
spte |= SPTE_HOST_WRITEABLE;
@@ -2578,7 +2532,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* be fixed if guest refault.
*/
if (level > PT_PAGE_TABLE_LEVEL &&
- has_wrprotected_page(vcpu->kvm, gfn, level))
+ has_wrprotected_page(vcpu, gfn, level))
goto done;
spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
@@ -2602,7 +2556,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
}
if (pte_access & ACC_WRITE_MASK) {
- mark_page_dirty(vcpu->kvm, gfn);
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
spte |= shadow_dirty_mask;
}
@@ -2692,15 +2646,17 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
u64 *start, u64 *end)
{
struct page *pages[PTE_PREFETCH_NUM];
+ struct kvm_memory_slot *slot;
unsigned access = sp->role.access;
int i, ret;
gfn_t gfn;
gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
- if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
+ if (!slot)
return -1;
- ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
+ ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
if (ret <= 0)
return -1;
@@ -2818,7 +2774,7 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
return 1;
if (pfn == KVM_PFN_ERR_HWPOISON) {
- kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
+ kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
return 0;
}
@@ -2841,7 +2797,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
level == PT_PAGE_TABLE_LEVEL &&
PageTransCompound(pfn_to_page(pfn)) &&
- !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
+ !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
unsigned long mask;
/*
* mmu_notifier_retry was successful and we hold the
@@ -2933,7 +2889,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* Compare with set_spte where instead shadow_dirty_mask is set.
*/
if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
- mark_page_dirty(vcpu->kvm, gfn);
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
return true;
}
@@ -3388,7 +3344,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned access = get_mmio_spte_access(spte);
- if (!check_mmio_spte(vcpu->kvm, spte))
+ if (!check_mmio_spte(vcpu, spte))
return RET_MMIO_PF_INVALID;
if (direct)
@@ -3460,7 +3416,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
arch.direct_map = vcpu->arch.mmu.direct_map;
arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
- return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch);
+ return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
}
static bool can_do_async_pf(struct kvm_vcpu *vcpu)
@@ -3475,10 +3431,12 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable)
{
+ struct kvm_memory_slot *slot;
bool async;
- *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
-
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ async = false;
+ *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
if (!async)
return false; /* *pfn has correct page already */
@@ -3492,11 +3450,20 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
return true;
}
- *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
-
+ *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
return false;
}
+static bool
+check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+{
+ int page_num = KVM_PAGES_PER_HPAGE(level);
+
+ gfn &= ~(page_num - 1);
+
+ return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
+}
+
static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
bool prefault)
{
@@ -3522,9 +3489,17 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (r)
return r;
- force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+ if (mapping_level_dirty_bitmap(vcpu, gfn) ||
+ !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL))
+ force_pt_level = 1;
+ else
+ force_pt_level = 0;
+
if (likely(!force_pt_level)) {
level = mapping_level(vcpu, gfn);
+ if (level > PT_DIRECTORY_LEVEL &&
+ !check_hugepage_cache_consistency(vcpu, gfn, level))
+ level = PT_DIRECTORY_LEVEL;
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
} else
level = PT_PAGE_TABLE_LEVEL;
@@ -3590,7 +3565,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
vcpu->arch.mmu.inject_page_fault(vcpu, fault);
}
-static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
unsigned access, int *nr_present)
{
if (unlikely(is_mmio_spte(*sptep))) {
@@ -3600,7 +3575,7 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
}
(*nr_present)++;
- mark_mmio_spte(kvm, sptep, gfn, access);
+ mark_mmio_spte(vcpu, sptep, gfn, access);
return true;
}
@@ -3878,6 +3853,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
struct kvm_mmu *context = &vcpu->arch.mmu;
context->base_role.word = 0;
+ context->base_role.smm = is_smm(vcpu);
context->page_fault = tdp_page_fault;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
@@ -3939,6 +3915,7 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
= smep && !is_write_protection(vcpu);
context->base_role.smap_andnot_wp
= smap && !is_write_protection(vcpu);
+ context->base_role.smm = is_smm(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
@@ -4110,7 +4087,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
*gpa &= ~(gpa_t)7;
*bytes = 8;
- r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8);
+ r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8);
if (r)
gentry = 0;
new = (const u8 *)&gentry;
@@ -4222,6 +4199,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
mask.nxe = 1;
mask.smep_andnot_wp = 1;
mask.smap_andnot_wp = 1;
+ mask.smm = 1;
/*
* If we don't have indirect shadow pages, it means no page is
@@ -4420,36 +4398,115 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
init_kvm_mmu(vcpu);
}
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+/* The return value indicates if tlb flush on all vcpus is needed. */
+typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap);
+
+/* The caller should hold mmu-lock before calling this function. */
+static bool
+slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
{
- gfn_t last_gfn;
- int i;
+ struct slot_rmap_walk_iterator iterator;
bool flush = false;
- last_gfn = memslot->base_gfn + memslot->npages - 1;
+ for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
+ end_gfn, &iterator) {
+ if (iterator.rmap)
+ flush |= fn(kvm, iterator.rmap);
- spin_lock(&kvm->mmu_lock);
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ if (flush && lock_flush_tlb) {
+ kvm_flush_remote_tlbs(kvm);
+ flush = false;
+ }
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+
+ if (flush && lock_flush_tlb) {
+ kvm_flush_remote_tlbs(kvm);
+ flush = false;
+ }
+
+ return flush;
+}
+
+static bool
+slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, int start_level, int end_level,
+ bool lock_flush_tlb)
+{
+ return slot_handle_level_range(kvm, memslot, fn, start_level,
+ end_level, memslot->base_gfn,
+ memslot->base_gfn + memslot->npages - 1,
+ lock_flush_tlb);
+}
+
+static bool
+slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static bool
+slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
- for (i = PT_PAGE_TABLE_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- unsigned long *rmapp;
- unsigned long last_index, index;
+static bool
+slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+ PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+}
- rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
- last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ int i;
- for (index = 0; index <= last_index; ++index, ++rmapp) {
- if (*rmapp)
- flush |= __rmap_write_protect(kvm, rmapp,
- false);
+ spin_lock(&kvm->mmu_lock);
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(memslot, slots) {
+ gfn_t start, end;
+
+ start = max(gfn_start, memslot->base_gfn);
+ end = min(gfn_end, memslot->base_gfn + memslot->npages);
+ if (start >= end)
+ continue;
- if (need_resched() || spin_needbreak(&kvm->mmu_lock))
- cond_resched_lock(&kvm->mmu_lock);
+ slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+ PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, true);
}
}
spin_unlock(&kvm->mmu_lock);
+}
+
+static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp)
+{
+ return __rmap_write_protect(kvm, rmapp, false);
+}
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ bool flush;
+
+ spin_lock(&kvm->mmu_lock);
+ flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
+ false);
+ spin_unlock(&kvm->mmu_lock);
/*
* kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
@@ -4482,9 +4539,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
pfn_t pfn;
struct kvm_mmu_page *sp;
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
- BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+restart:
+ for_each_rmap_spte(rmapp, &iter, sptep) {
sp = page_header(__pa(sptep));
pfn = spte_to_pfn(*sptep);
@@ -4499,71 +4555,31 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
!kvm_is_reserved_pfn(pfn) &&
PageTransCompound(pfn_to_page(pfn))) {
drop_spte(kvm, sptep);
- sptep = rmap_get_first(*rmapp, &iter);
need_tlb_flush = 1;
- } else
- sptep = rmap_get_next(&iter);
+ goto restart;
+ }
}
return need_tlb_flush;
}
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *memslot)
{
- bool flush = false;
- unsigned long *rmapp;
- unsigned long last_index, index;
-
+ /* FIXME: const-ify all uses of struct kvm_memory_slot. */
spin_lock(&kvm->mmu_lock);
-
- rmapp = memslot->arch.rmap[0];
- last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1,
- memslot->base_gfn, PT_PAGE_TABLE_LEVEL);
-
- for (index = 0; index <= last_index; ++index, ++rmapp) {
- if (*rmapp)
- flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp);
-
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
- if (flush) {
- kvm_flush_remote_tlbs(kvm);
- flush = false;
- }
- cond_resched_lock(&kvm->mmu_lock);
- }
- }
-
- if (flush)
- kvm_flush_remote_tlbs(kvm);
-
+ slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
+ kvm_mmu_zap_collapsible_spte, true);
spin_unlock(&kvm->mmu_lock);
}
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- gfn_t last_gfn;
- unsigned long *rmapp;
- unsigned long last_index, index;
- bool flush = false;
-
- last_gfn = memslot->base_gfn + memslot->npages - 1;
+ bool flush;
spin_lock(&kvm->mmu_lock);
-
- rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1];
- last_index = gfn_to_index(last_gfn, memslot->base_gfn,
- PT_PAGE_TABLE_LEVEL);
-
- for (index = 0; index <= last_index; ++index, ++rmapp) {
- if (*rmapp)
- flush |= __rmap_clear_dirty(kvm, rmapp);
-
- if (need_resched() || spin_needbreak(&kvm->mmu_lock))
- cond_resched_lock(&kvm->mmu_lock);
- }
-
+ flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
spin_unlock(&kvm->mmu_lock);
lockdep_assert_held(&kvm->slots_lock);
@@ -4582,31 +4598,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- gfn_t last_gfn;
- int i;
- bool flush = false;
-
- last_gfn = memslot->base_gfn + memslot->npages - 1;
+ bool flush;
spin_lock(&kvm->mmu_lock);
-
- for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- unsigned long *rmapp;
- unsigned long last_index, index;
-
- rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
- last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
-
- for (index = 0; index <= last_index; ++index, ++rmapp) {
- if (*rmapp)
- flush |= __rmap_write_protect(kvm, rmapp,
- false);
-
- if (need_resched() || spin_needbreak(&kvm->mmu_lock))
- cond_resched_lock(&kvm->mmu_lock);
- }
- }
+ flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
+ false);
spin_unlock(&kvm->mmu_lock);
/* see kvm_mmu_slot_remove_write_access */
@@ -4620,31 +4616,10 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
void kvm_mmu_slot_set_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- gfn_t last_gfn;
- int i;
- bool flush = false;
-
- last_gfn = memslot->base_gfn + memslot->npages - 1;
+ bool flush;
spin_lock(&kvm->mmu_lock);
-
- for (i = PT_PAGE_TABLE_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- unsigned long *rmapp;
- unsigned long last_index, index;
-
- rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
- last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
-
- for (index = 0; index <= last_index; ++index, ++rmapp) {
- if (*rmapp)
- flush |= __rmap_set_dirty(kvm, rmapp);
-
- if (need_resched() || spin_needbreak(&kvm->mmu_lock))
- cond_resched_lock(&kvm->mmu_lock);
- }
- }
-
+ flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
spin_unlock(&kvm->mmu_lock);
lockdep_assert_held(&kvm->slots_lock);
@@ -4741,13 +4716,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
}
-void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
{
/*
* The very rare case: if the generation-number is round,
* zap all shadow pages.
*/
- if (unlikely(kvm_current_mmio_generation(kvm) == 0)) {
+ if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n");
kvm_mmu_invalidate_zap_all_pages(kvm);
}
@@ -4869,15 +4844,18 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
unsigned int nr_pages = 0;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
+ int i;
- slots = kvm_memslots(kvm);
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
- kvm_for_each_memslot(memslot, slots)
- nr_pages += memslot->npages;
+ kvm_for_each_memslot(memslot, slots)
+ nr_pages += memslot->npages;
+ }
nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
nr_mmu_pages = max(nr_mmu_pages,
- (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
+ (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
return nr_mmu_pages;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 0ada65ecddcf..398d21c0f6dd 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -43,6 +43,7 @@
#define PT_PDPE_LEVEL 3
#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1
+#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1)
static inline u64 rsvd_bits(int s, int e)
{
@@ -170,4 +171,5 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
#endif
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 9ade5cfb5a4c..03d518e499a6 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -114,7 +114,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
return;
gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
- pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+ pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn);
if (is_error_pfn(pfn))
return;
@@ -131,12 +131,16 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
unsigned long *rmapp;
struct kvm_mmu_page *rev_sp;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
gfn_t gfn;
rev_sp = page_header(__pa(sptep));
gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
- if (!gfn_to_memslot(kvm, gfn)) {
+ slots = kvm_memslots_for_spte_role(kvm, rev_sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+ if (!slot) {
if (!__ratelimit(&ratelimit_state))
return;
audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
@@ -146,7 +150,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
return;
}
- rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
+ rmapp = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
if (!*rmapp) {
if (!__ratelimit(&ratelimit_state))
return;
@@ -191,19 +195,21 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
unsigned long *rmapp;
u64 *sptep;
struct rmap_iterator iter;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
if (sp->role.direct || sp->unsync || sp->role.invalid)
return;
- rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL);
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, sp->gfn);
+ rmapp = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot);
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;
- sptep = rmap_get_next(&iter)) {
+ for_each_rmap_spte(rmapp, &iter, sptep)
if (is_writable_pte(*sptep))
audit_printk(kvm, "shadow page has writable "
"mappings: gfn %llx role %x\n",
sp->gfn, sp->role.word);
- }
}
static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -291,7 +297,7 @@ static int mmu_audit_set(const char *val, const struct kernel_param *kp)
return 0;
}
-static struct kernel_param_ops audit_param_ops = {
+static const struct kernel_param_ops audit_param_ops = {
.set = mmu_audit_set,
.get = param_get_bool,
};
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index ce463a9cc8fb..5a24b846a1cb 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -2,7 +2,7 @@
#define _TRACE_KVMMMU_H
#include <linux/tracepoint.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvmmmu
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
new file mode 100644
index 000000000000..dc0a84a6f309
--- /dev/null
+++ b/arch/x86/kvm/mtrr.c
@@ -0,0 +1,719 @@
+/*
+ * vMTRR implementation
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ * Marcelo Tosatti <mtosatti@redhat.com>
+ * Paolo Bonzini <pbonzini@redhat.com>
+ * Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/mtrr.h>
+
+#include "cpuid.h"
+#include "mmu.h"
+
+#define IA32_MTRR_DEF_TYPE_E (1ULL << 11)
+#define IA32_MTRR_DEF_TYPE_FE (1ULL << 10)
+#define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff)
+
+static bool msr_mtrr_valid(unsigned msr)
+{
+ switch (msr) {
+ case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
+ case MSR_MTRRfix64K_00000:
+ case MSR_MTRRfix16K_80000:
+ case MSR_MTRRfix16K_A0000:
+ case MSR_MTRRfix4K_C0000:
+ case MSR_MTRRfix4K_C8000:
+ case MSR_MTRRfix4K_D0000:
+ case MSR_MTRRfix4K_D8000:
+ case MSR_MTRRfix4K_E0000:
+ case MSR_MTRRfix4K_E8000:
+ case MSR_MTRRfix4K_F0000:
+ case MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
+ case MSR_IA32_CR_PAT:
+ return true;
+ case 0x2f8:
+ return true;
+ }
+ return false;
+}
+
+static bool valid_pat_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
+}
+
+static bool valid_mtrr_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
+}
+
+bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ int i;
+ u64 mask;
+
+ if (!msr_mtrr_valid(msr))
+ return false;
+
+ if (msr == MSR_IA32_CR_PAT) {
+ for (i = 0; i < 8; i++)
+ if (!valid_pat_type((data >> (i * 8)) & 0xff))
+ return false;
+ return true;
+ } else if (msr == MSR_MTRRdefType) {
+ if (data & ~0xcff)
+ return false;
+ return valid_mtrr_type(data & 0xff);
+ } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
+ for (i = 0; i < 8 ; i++)
+ if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
+ return false;
+ return true;
+ }
+
+ /* variable MTRRs */
+ WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
+
+ mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
+ if ((msr & 1) == 0) {
+ /* MTRR base */
+ if (!valid_mtrr_type(data & 0xff))
+ return false;
+ mask |= 0xf00;
+ } else
+ /* MTRR mask */
+ mask |= 0x7ff;
+ if (data & mask) {
+ kvm_inject_gp(vcpu, 0);
+ return false;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
+
+static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
+{
+ return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E);
+}
+
+static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
+{
+ return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE);
+}
+
+static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
+{
+ return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
+}
+
+static u8 mtrr_disabled_type(void)
+{
+ /*
+ * Intel SDM 11.11.2.2: all MTRRs are disabled when
+ * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC
+ * memory type is applied to all of physical memory.
+ */
+ return MTRR_TYPE_UNCACHABLE;
+}
+
+/*
+* Three terms are used in the following code:
+* - segment, it indicates the address segments covered by fixed MTRRs.
+* - unit, it corresponds to the MSR entry in the segment.
+* - range, a range is covered in one memory cache type.
+*/
+struct fixed_mtrr_segment {
+ u64 start;
+ u64 end;
+
+ int range_shift;
+
+ /* the start position in kvm_mtrr.fixed_ranges[]. */
+ int range_start;
+};
+
+static struct fixed_mtrr_segment fixed_seg_table[] = {
+ /* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */
+ {
+ .start = 0x0,
+ .end = 0x80000,
+ .range_shift = 16, /* 64K */
+ .range_start = 0,
+ },
+
+ /*
+ * MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units,
+ * 16K fixed mtrr.
+ */
+ {
+ .start = 0x80000,
+ .end = 0xc0000,
+ .range_shift = 14, /* 16K */
+ .range_start = 8,
+ },
+
+ /*
+ * MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units,
+ * 4K fixed mtrr.
+ */
+ {
+ .start = 0xc0000,
+ .end = 0x100000,
+ .range_shift = 12, /* 12K */
+ .range_start = 24,
+ }
+};
+
+/*
+ * The size of unit is covered in one MSR, one MSR entry contains
+ * 8 ranges so that unit size is always 8 * 2^range_shift.
+ */
+static u64 fixed_mtrr_seg_unit_size(int seg)
+{
+ return 8 << fixed_seg_table[seg].range_shift;
+}
+
+static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
+{
+ switch (msr) {
+ case MSR_MTRRfix64K_00000:
+ *seg = 0;
+ *unit = 0;
+ break;
+ case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
+ *seg = 1;
+ *unit = msr - MSR_MTRRfix16K_80000;
+ break;
+ case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
+ *seg = 2;
+ *unit = msr - MSR_MTRRfix4K_C0000;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+ u64 unit_size = fixed_mtrr_seg_unit_size(seg);
+
+ *start = mtrr_seg->start + unit * unit_size;
+ *end = *start + unit_size;
+ WARN_ON(*end > mtrr_seg->end);
+}
+
+static int fixed_mtrr_seg_unit_range_index(int seg, int unit)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+
+ WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg)
+ > mtrr_seg->end);
+
+ /* each unit has 8 ranges. */
+ return mtrr_seg->range_start + 8 * unit;
+}
+
+static int fixed_mtrr_seg_end_range_index(int seg)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+ int n;
+
+ n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift;
+ return mtrr_seg->range_start + n - 1;
+}
+
+static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end)
+{
+ int seg, unit;
+
+ if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
+ return false;
+
+ fixed_mtrr_seg_unit_range(seg, unit, start, end);
+ return true;
+}
+
+static int fixed_msr_to_range_index(u32 msr)
+{
+ int seg, unit;
+
+ if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
+ return -1;
+
+ return fixed_mtrr_seg_unit_range_index(seg, unit);
+}
+
+static int fixed_mtrr_addr_to_seg(u64 addr)
+{
+ struct fixed_mtrr_segment *mtrr_seg;
+ int seg, seg_num = ARRAY_SIZE(fixed_seg_table);
+
+ for (seg = 0; seg < seg_num; seg++) {
+ mtrr_seg = &fixed_seg_table[seg];
+ if (mtrr_seg->start >= addr && addr < mtrr_seg->end)
+ return seg;
+ }
+
+ return -1;
+}
+
+static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg)
+{
+ struct fixed_mtrr_segment *mtrr_seg;
+ int index;
+
+ mtrr_seg = &fixed_seg_table[seg];
+ index = mtrr_seg->range_start;
+ index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift;
+ return index;
+}
+
+static u64 fixed_mtrr_range_end_addr(int seg, int index)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+ int pos = index - mtrr_seg->range_start;
+
+ return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift);
+}
+
+static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end)
+{
+ u64 mask;
+
+ *start = range->base & PAGE_MASK;
+
+ mask = range->mask & PAGE_MASK;
+ mask |= ~0ULL << boot_cpu_data.x86_phys_bits;
+
+ /* This cannot overflow because writing to the reserved bits of
+ * variable MTRRs causes a #GP.
+ */
+ *end = (*start | ~mask) + 1;
+}
+
+static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ gfn_t start, end;
+ int index;
+
+ if (msr == MSR_IA32_CR_PAT || !tdp_enabled ||
+ !kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ return;
+
+ if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType)
+ return;
+
+ /* fixed MTRRs. */
+ if (fixed_msr_to_range(msr, &start, &end)) {
+ if (!fixed_mtrr_is_enabled(mtrr_state))
+ return;
+ } else if (msr == MSR_MTRRdefType) {
+ start = 0x0;
+ end = ~0ULL;
+ } else {
+ /* variable range MTRRs. */
+ index = (msr - 0x200) / 2;
+ var_mtrr_range(&mtrr_state->var_ranges[index], &start, &end);
+ }
+
+ kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
+}
+
+static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range)
+{
+ return (range->mask & (1 << 11)) != 0;
+}
+
+static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ struct kvm_mtrr_range *tmp, *cur;
+ int index, is_mtrr_mask;
+
+ index = (msr - 0x200) / 2;
+ is_mtrr_mask = msr - 0x200 - 2 * index;
+ cur = &mtrr_state->var_ranges[index];
+
+ /* remove the entry if it's in the list. */
+ if (var_mtrr_range_is_valid(cur))
+ list_del(&mtrr_state->var_ranges[index].node);
+
+ if (!is_mtrr_mask)
+ cur->base = data;
+ else
+ cur->mask = data;
+
+ /* add it to the list if it's enabled. */
+ if (var_mtrr_range_is_valid(cur)) {
+ list_for_each_entry(tmp, &mtrr_state->head, node)
+ if (cur->base >= tmp->base)
+ break;
+ list_add_tail(&cur->node, &tmp->node);
+ }
+}
+
+int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ int index;
+
+ if (!kvm_mtrr_valid(vcpu, msr, data))
+ return 1;
+
+ index = fixed_msr_to_range_index(msr);
+ if (index >= 0)
+ *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data;
+ else if (msr == MSR_MTRRdefType)
+ vcpu->arch.mtrr_state.deftype = data;
+ else if (msr == MSR_IA32_CR_PAT)
+ vcpu->arch.pat = data;
+ else
+ set_var_mtrr_msr(vcpu, msr, data);
+
+ update_mtrr(vcpu, msr);
+ return 0;
+}
+
+int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ int index;
+
+ /* MSR_MTRRcap is a readonly MSR. */
+ if (msr == MSR_MTRRcap) {
+ /*
+ * SMRR = 0
+ * WC = 1
+ * FIX = 1
+ * VCNT = KVM_NR_VAR_MTRR
+ */
+ *pdata = 0x500 | KVM_NR_VAR_MTRR;
+ return 0;
+ }
+
+ if (!msr_mtrr_valid(msr))
+ return 1;
+
+ index = fixed_msr_to_range_index(msr);
+ if (index >= 0)
+ *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index];
+ else if (msr == MSR_MTRRdefType)
+ *pdata = vcpu->arch.mtrr_state.deftype;
+ else if (msr == MSR_IA32_CR_PAT)
+ *pdata = vcpu->arch.pat;
+ else { /* Variable MTRRs */
+ int is_mtrr_mask;
+
+ index = (msr - 0x200) / 2;
+ is_mtrr_mask = msr - 0x200 - 2 * index;
+ if (!is_mtrr_mask)
+ *pdata = vcpu->arch.mtrr_state.var_ranges[index].base;
+ else
+ *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask;
+ }
+
+ return 0;
+}
+
+void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu)
+{
+ INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head);
+}
+
+struct mtrr_iter {
+ /* input fields. */
+ struct kvm_mtrr *mtrr_state;
+ u64 start;
+ u64 end;
+
+ /* output fields. */
+ int mem_type;
+ /* mtrr is completely disabled? */
+ bool mtrr_disabled;
+ /* [start, end) is not fully covered in MTRRs? */
+ bool partial_map;
+
+ /* private fields. */
+ union {
+ /* used for fixed MTRRs. */
+ struct {
+ int index;
+ int seg;
+ };
+
+ /* used for var MTRRs. */
+ struct {
+ struct kvm_mtrr_range *range;
+ /* max address has been covered in var MTRRs. */
+ u64 start_max;
+ };
+ };
+
+ bool fixed;
+};
+
+static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter)
+{
+ int seg, index;
+
+ if (!fixed_mtrr_is_enabled(iter->mtrr_state))
+ return false;
+
+ seg = fixed_mtrr_addr_to_seg(iter->start);
+ if (seg < 0)
+ return false;
+
+ iter->fixed = true;
+ index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg);
+ iter->index = index;
+ iter->seg = seg;
+ return true;
+}
+
+static bool match_var_range(struct mtrr_iter *iter,
+ struct kvm_mtrr_range *range)
+{
+ u64 start, end;
+
+ var_mtrr_range(range, &start, &end);
+ if (!(start >= iter->end || end <= iter->start)) {
+ iter->range = range;
+
+ /*
+ * the function is called when we do kvm_mtrr.head walking.
+ * Range has the minimum base address which interleaves
+ * [looker->start_max, looker->end).
+ */
+ iter->partial_map |= iter->start_max < start;
+
+ /* update the max address has been covered. */
+ iter->start_max = max(iter->start_max, end);
+ return true;
+ }
+
+ return false;
+}
+
+static void __mtrr_lookup_var_next(struct mtrr_iter *iter)
+{
+ struct kvm_mtrr *mtrr_state = iter->mtrr_state;
+
+ list_for_each_entry_continue(iter->range, &mtrr_state->head, node)
+ if (match_var_range(iter, iter->range))
+ return;
+
+ iter->range = NULL;
+ iter->partial_map |= iter->start_max < iter->end;
+}
+
+static void mtrr_lookup_var_start(struct mtrr_iter *iter)
+{
+ struct kvm_mtrr *mtrr_state = iter->mtrr_state;
+
+ iter->fixed = false;
+ iter->start_max = iter->start;
+ iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node);
+
+ __mtrr_lookup_var_next(iter);
+}
+
+static void mtrr_lookup_fixed_next(struct mtrr_iter *iter)
+{
+ /* terminate the lookup. */
+ if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) {
+ iter->fixed = false;
+ iter->range = NULL;
+ return;
+ }
+
+ iter->index++;
+
+ /* have looked up for all fixed MTRRs. */
+ if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges))
+ return mtrr_lookup_var_start(iter);
+
+ /* switch to next segment. */
+ if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg))
+ iter->seg++;
+}
+
+static void mtrr_lookup_var_next(struct mtrr_iter *iter)
+{
+ __mtrr_lookup_var_next(iter);
+}
+
+static void mtrr_lookup_start(struct mtrr_iter *iter)
+{
+ if (!mtrr_is_enabled(iter->mtrr_state)) {
+ iter->mtrr_disabled = true;
+ return;
+ }
+
+ if (!mtrr_lookup_fixed_start(iter))
+ mtrr_lookup_var_start(iter);
+}
+
+static void mtrr_lookup_init(struct mtrr_iter *iter,
+ struct kvm_mtrr *mtrr_state, u64 start, u64 end)
+{
+ iter->mtrr_state = mtrr_state;
+ iter->start = start;
+ iter->end = end;
+ iter->mtrr_disabled = false;
+ iter->partial_map = false;
+ iter->fixed = false;
+ iter->range = NULL;
+
+ mtrr_lookup_start(iter);
+}
+
+static bool mtrr_lookup_okay(struct mtrr_iter *iter)
+{
+ if (iter->fixed) {
+ iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index];
+ return true;
+ }
+
+ if (iter->range) {
+ iter->mem_type = iter->range->base & 0xff;
+ return true;
+ }
+
+ return false;
+}
+
+static void mtrr_lookup_next(struct mtrr_iter *iter)
+{
+ if (iter->fixed)
+ mtrr_lookup_fixed_next(iter);
+ else
+ mtrr_lookup_var_next(iter);
+}
+
+#define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \
+ for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \
+ mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_))
+
+u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ struct mtrr_iter iter;
+ u64 start, end;
+ int type = -1;
+ const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK)
+ | (1 << MTRR_TYPE_WRTHROUGH);
+
+ start = gfn_to_gpa(gfn);
+ end = start + PAGE_SIZE;
+
+ mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
+ int curr_type = iter.mem_type;
+
+ /*
+ * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR
+ * Precedences.
+ */
+
+ if (type == -1) {
+ type = curr_type;
+ continue;
+ }
+
+ /*
+ * If two or more variable memory ranges match and the
+ * memory types are identical, then that memory type is
+ * used.
+ */
+ if (type == curr_type)
+ continue;
+
+ /*
+ * If two or more variable memory ranges match and one of
+ * the memory types is UC, the UC memory type used.
+ */
+ if (curr_type == MTRR_TYPE_UNCACHABLE)
+ return MTRR_TYPE_UNCACHABLE;
+
+ /*
+ * If two or more variable memory ranges match and the
+ * memory types are WT and WB, the WT memory type is used.
+ */
+ if (((1 << type) & wt_wb_mask) &&
+ ((1 << curr_type) & wt_wb_mask)) {
+ type = MTRR_TYPE_WRTHROUGH;
+ continue;
+ }
+
+ /*
+ * For overlaps not defined by the above rules, processor
+ * behavior is undefined.
+ */
+
+ /* We use WB for this undefined behavior. :( */
+ return MTRR_TYPE_WRBACK;
+ }
+
+ if (iter.mtrr_disabled)
+ return mtrr_disabled_type();
+
+ /*
+ * We just check one page, partially covered by MTRRs is
+ * impossible.
+ */
+ WARN_ON(iter.partial_map);
+
+ /* not contained in any MTRRs. */
+ if (type == -1)
+ return mtrr_default_type(mtrr_state);
+
+ return type;
+}
+EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type);
+
+bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int page_num)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ struct mtrr_iter iter;
+ u64 start, end;
+ int type = -1;
+
+ start = gfn_to_gpa(gfn);
+ end = gfn_to_gpa(gfn + page_num);
+ mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
+ if (type == -1) {
+ type = iter.mem_type;
+ continue;
+ }
+
+ if (type != iter.mem_type)
+ return false;
+ }
+
+ if (iter.mtrr_disabled)
+ return true;
+
+ if (!iter.partial_map)
+ return true;
+
+ if (type == -1)
+ return true;
+
+ return type == mtrr_default_type(mtrr_state);
+}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6e6d115fe9b5..0f67d7e24800 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -256,7 +256,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
if (ret)
return ret;
- mark_page_dirty(vcpu->kvm, table_gfn);
+ kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
walker->ptes[level] = pte;
}
return 0;
@@ -338,7 +338,7 @@ retry_walk:
real_gfn = gpa_to_gfn(real_gfn);
- host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn,
+ host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, real_gfn,
&walker->pte_writable[walker->level - 1]);
if (unlikely(kvm_is_error_hva(host_addr)))
goto error;
@@ -511,11 +511,11 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
base_gpa = pte_gpa & ~mask;
index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
- r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
+ r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
curr_pte = gw->prefetch_ptes[index];
} else
- r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
+ r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
&curr_pte, sizeof(curr_pte));
return r || curr_pte != gw->ptes[level - 1];
@@ -869,8 +869,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
if (!rmap_can_add(vcpu))
break;
- if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
- sizeof(pt_element_t)))
+ if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
break;
FNAME(update_pte)(vcpu, sp, sptep, &gpte);
@@ -956,8 +956,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
- if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
- sizeof(pt_element_t)))
+ if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
return -EINVAL;
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
@@ -970,7 +970,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
pte_access &= FNAME(gpte_access)(vcpu, gpte);
FNAME(protect_clean_gpte)(&pte_access, gpte);
- if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
+ if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
&nr_present))
continue;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 29fbf9dfdc54..31aa2c85dc97 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -1,11 +1,12 @@
/*
* Kernel-based Virtual Machine -- Performance Monitoring Unit support
*
- * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ * Copyright 2015 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Avi Kivity <avi@redhat.com>
* Gleb Natapov <gleb@redhat.com>
+ * Wei Huang <wei@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
@@ -19,88 +20,39 @@
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
+#include "pmu.h"
+
+/* NOTE:
+ * - Each perf counter is defined as "struct kvm_pmc";
+ * - There are two types of perf counters: general purpose (gp) and fixed.
+ * gp counters are stored in gp_counters[] and fixed counters are stored
+ * in fixed_counters[] respectively. Both of them are part of "struct
+ * kvm_pmu";
+ * - pmu.c understands the difference between gp counters and fixed counters.
+ * However AMD doesn't support fixed-counters;
+ * - There are three types of index to access perf counters (PMC):
+ * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
+ * has MSR_K7_PERFCTRn.
+ * 2. MSR Index (named idx): This normally is used by RDPMC instruction.
+ * For instance AMD RDPMC instruction uses 0000_0003h in ECX to access
+ * C001_0007h (MSR_K7_PERCTR3). Intel has a similar mechanism, except
+ * that it also supports fixed counters. idx can be used to as index to
+ * gp and fixed counters.
+ * 3. Global PMC Index (named pmc): pmc is an index specific to PMU
+ * code. Each pmc, stored in kvm_pmc.idx field, is unique across
+ * all perf counters (both gp and fixed). The mapping relationship
+ * between pmc and perf counters is as the following:
+ * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
+ * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
+ * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
+ */
-static struct kvm_arch_event_perf_mapping {
- u8 eventsel;
- u8 unit_mask;
- unsigned event_type;
- bool inexact;
-} arch_events[] = {
- /* Index must match CPUID 0x0A.EBX bit vector */
- [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
- [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
- [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
- [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
- [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
- [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
- [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
- [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
-};
-
-/* mapping between fixed pmc index and arch_events array */
-static int fixed_pmc_events[] = {1, 0, 7};
-
-static bool pmc_is_gp(struct kvm_pmc *pmc)
-{
- return pmc->type == KVM_PMC_GP;
-}
-
-static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
-{
- struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
-
- return pmu->counter_bitmask[pmc->type];
-}
-
-static inline bool pmc_enabled(struct kvm_pmc *pmc)
-{
- struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
- return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
-}
-
-static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
- u32 base)
-{
- if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
- return &pmu->gp_counters[msr - base];
- return NULL;
-}
-
-static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
-{
- int base = MSR_CORE_PERF_FIXED_CTR0;
- if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
- return &pmu->fixed_counters[msr - base];
- return NULL;
-}
-
-static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx)
-{
- return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx);
-}
-
-static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)
-{
- if (idx < INTEL_PMC_IDX_FIXED)
- return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0);
- else
- return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED);
-}
-
-void kvm_deliver_pmi(struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.apic)
- kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
-}
-
-static void trigger_pmi(struct irq_work *irq_work)
+static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
- struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu,
- irq_work);
- struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu,
- arch.pmu);
+ struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
- kvm_deliver_pmi(vcpu);
+ kvm_pmu_deliver_pmi(vcpu);
}
static void kvm_perf_overflow(struct perf_event *perf_event,
@@ -108,63 +60,46 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
struct pt_regs *regs)
{
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
- struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
- if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (!test_and_set_bit(pmc->idx,
+ (unsigned long *)&pmu->reprogram_pmi)) {
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}
}
static void kvm_perf_overflow_intr(struct perf_event *perf_event,
- struct perf_sample_data *data, struct pt_regs *regs)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
- struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
- if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (!test_and_set_bit(pmc->idx,
+ (unsigned long *)&pmu->reprogram_pmi)) {
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+
/*
* Inject PMI. If vcpu was in a guest mode during NMI PMI
* can be ejected on a guest mode re-entry. Otherwise we can't
* be sure that vcpu wasn't executing hlt instruction at the
- * time of vmexit and is not going to re-enter guest mode until,
+ * time of vmexit and is not going to re-enter guest mode until
* woken up. So we should wake it, but this is impossible from
* NMI context. Do it from irq work instead.
*/
if (!kvm_is_in_guest())
- irq_work_queue(&pmc->vcpu->arch.pmu.irq_work);
+ irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
else
kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}
}
-static u64 read_pmc(struct kvm_pmc *pmc)
-{
- u64 counter, enabled, running;
-
- counter = pmc->counter;
-
- if (pmc->perf_event)
- counter += perf_event_read_value(pmc->perf_event,
- &enabled, &running);
-
- /* FIXME: Scaling needed? */
-
- return counter & pmc_bitmask(pmc);
-}
-
-static void stop_counter(struct kvm_pmc *pmc)
-{
- if (pmc->perf_event) {
- pmc->counter = read_pmc(pmc);
- perf_event_release_kernel(pmc->perf_event);
- pmc->perf_event = NULL;
- }
-}
-
-static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
- unsigned config, bool exclude_user, bool exclude_kernel,
- bool intr, bool in_tx, bool in_tx_cp)
+static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
+ unsigned config, bool exclude_user,
+ bool exclude_kernel, bool intr,
+ bool in_tx, bool in_tx_cp)
{
struct perf_event *event;
struct perf_event_attr attr = {
@@ -177,6 +112,7 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
.exclude_kernel = exclude_kernel,
.config = config,
};
+
if (in_tx)
attr.config |= HSW_IN_TX;
if (in_tx_cp)
@@ -188,33 +124,16 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
intr ? kvm_perf_overflow_intr :
kvm_perf_overflow, pmc);
if (IS_ERR(event)) {
- printk_once("kvm: pmu event creation failed %ld\n",
- PTR_ERR(event));
+ printk_once("kvm_pmu: event creation failed %ld\n",
+ PTR_ERR(event));
return;
}
pmc->perf_event = event;
- clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi);
-}
-
-static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select,
- u8 unit_mask)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(arch_events); i++)
- if (arch_events[i].eventsel == event_select
- && arch_events[i].unit_mask == unit_mask
- && (pmu->available_event_types & (1 << i)))
- break;
-
- if (i == ARRAY_SIZE(arch_events))
- return PERF_COUNT_HW_MAX;
-
- return arch_events[i].event_type;
+ clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi);
}
-static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
+void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
unsigned config, type = PERF_TYPE_RAW;
u8 event_select, unit_mask;
@@ -224,21 +143,22 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
pmc->eventsel = eventsel;
- stop_counter(pmc);
+ pmc_stop_counter(pmc);
- if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc))
+ if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
return;
event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
- ARCH_PERFMON_EVENTSEL_INV |
- ARCH_PERFMON_EVENTSEL_CMASK |
- HSW_IN_TX |
- HSW_IN_TX_CHECKPOINTED))) {
- config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
- unit_mask);
+ ARCH_PERFMON_EVENTSEL_INV |
+ ARCH_PERFMON_EVENTSEL_CMASK |
+ HSW_IN_TX |
+ HSW_IN_TX_CHECKPOINTED))) {
+ config = kvm_x86_ops->pmu_ops->find_arch_event(pmc_to_pmu(pmc),
+ event_select,
+ unit_mask);
if (config != PERF_COUNT_HW_MAX)
type = PERF_TYPE_HARDWARE;
}
@@ -246,56 +166,36 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
if (type == PERF_TYPE_RAW)
config = eventsel & X86_RAW_EVENT_MASK;
- reprogram_counter(pmc, type, config,
- !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
- !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
- eventsel & ARCH_PERFMON_EVENTSEL_INT,
- (eventsel & HSW_IN_TX),
- (eventsel & HSW_IN_TX_CHECKPOINTED));
+ pmc_reprogram_counter(pmc, type, config,
+ !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+ eventsel & ARCH_PERFMON_EVENTSEL_INT,
+ (eventsel & HSW_IN_TX),
+ (eventsel & HSW_IN_TX_CHECKPOINTED));
}
+EXPORT_SYMBOL_GPL(reprogram_gp_counter);
-static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
+void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
{
- unsigned en = en_pmi & 0x3;
- bool pmi = en_pmi & 0x8;
+ unsigned en_field = ctrl & 0x3;
+ bool pmi = ctrl & 0x8;
- stop_counter(pmc);
+ pmc_stop_counter(pmc);
- if (!en || !pmc_enabled(pmc))
+ if (!en_field || !pmc_is_enabled(pmc))
return;
- reprogram_counter(pmc, PERF_TYPE_HARDWARE,
- arch_events[fixed_pmc_events[idx]].event_type,
- !(en & 0x2), /* exclude user */
- !(en & 0x1), /* exclude kernel */
- pmi, false, false);
+ pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
+ kvm_x86_ops->pmu_ops->find_fixed_event(idx),
+ !(en_field & 0x2), /* exclude user */
+ !(en_field & 0x1), /* exclude kernel */
+ pmi, false, false);
}
+EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
-static inline u8 fixed_en_pmi(u64 ctrl, int idx)
+void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
{
- return (ctrl >> (idx * 4)) & 0xf;
-}
-
-static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
-{
- int i;
-
- for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
- u8 en_pmi = fixed_en_pmi(data, i);
- struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i);
-
- if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi)
- continue;
-
- reprogram_fixed_counter(pmc, en_pmi, i);
- }
-
- pmu->fixed_ctr_ctrl = data;
-}
-
-static void reprogram_idx(struct kvm_pmu *pmu, int idx)
-{
- struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx);
+ struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);
if (!pmc)
return;
@@ -303,274 +203,107 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx)
if (pmc_is_gp(pmc))
reprogram_gp_counter(pmc, pmc->eventsel);
else {
- int fidx = idx - INTEL_PMC_IDX_FIXED;
- reprogram_fixed_counter(pmc,
- fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx);
+ int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
+ u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);
+
+ reprogram_fixed_counter(pmc, ctrl, idx);
}
}
+EXPORT_SYMBOL_GPL(reprogram_counter);
-static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
+void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ u64 bitmask;
int bit;
- u64 diff = pmu->global_ctrl ^ data;
-
- pmu->global_ctrl = data;
-
- for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
- reprogram_idx(pmu, bit);
-}
-bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr)
-{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- int ret;
-
- switch (msr) {
- case MSR_CORE_PERF_FIXED_CTR_CTRL:
- case MSR_CORE_PERF_GLOBAL_STATUS:
- case MSR_CORE_PERF_GLOBAL_CTRL:
- case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- ret = pmu->version > 1;
- break;
- default:
- ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)
- || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0)
- || get_fixed_pmc(pmu, msr);
- break;
- }
- return ret;
-}
+ bitmask = pmu->reprogram_pmi;
-int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
-{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- struct kvm_pmc *pmc;
+ for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
+ struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit);
- switch (index) {
- case MSR_CORE_PERF_FIXED_CTR_CTRL:
- *data = pmu->fixed_ctr_ctrl;
- return 0;
- case MSR_CORE_PERF_GLOBAL_STATUS:
- *data = pmu->global_status;
- return 0;
- case MSR_CORE_PERF_GLOBAL_CTRL:
- *data = pmu->global_ctrl;
- return 0;
- case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- *data = pmu->global_ovf_ctrl;
- return 0;
- default:
- if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
- (pmc = get_fixed_pmc(pmu, index))) {
- *data = read_pmc(pmc);
- return 0;
- } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
- *data = pmc->eventsel;
- return 0;
+ if (unlikely(!pmc || !pmc->perf_event)) {
+ clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
+ continue;
}
- }
- return 1;
-}
-int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
-{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- struct kvm_pmc *pmc;
- u32 index = msr_info->index;
- u64 data = msr_info->data;
-
- switch (index) {
- case MSR_CORE_PERF_FIXED_CTR_CTRL:
- if (pmu->fixed_ctr_ctrl == data)
- return 0;
- if (!(data & 0xfffffffffffff444ull)) {
- reprogram_fixed_counters(pmu, data);
- return 0;
- }
- break;
- case MSR_CORE_PERF_GLOBAL_STATUS:
- if (msr_info->host_initiated) {
- pmu->global_status = data;
- return 0;
- }
- break; /* RO MSR */
- case MSR_CORE_PERF_GLOBAL_CTRL:
- if (pmu->global_ctrl == data)
- return 0;
- if (!(data & pmu->global_ctrl_mask)) {
- global_ctrl_changed(pmu, data);
- return 0;
- }
- break;
- case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
- if (!msr_info->host_initiated)
- pmu->global_status &= ~data;
- pmu->global_ovf_ctrl = data;
- return 0;
- }
- break;
- default:
- if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
- (pmc = get_fixed_pmc(pmu, index))) {
- if (!msr_info->host_initiated)
- data = (s64)(s32)data;
- pmc->counter += data - read_pmc(pmc);
- return 0;
- } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
- if (data == pmc->eventsel)
- return 0;
- if (!(data & pmu->reserved_bits)) {
- reprogram_gp_counter(pmc, data);
- return 0;
- }
- }
+ reprogram_counter(pmu, bit);
}
- return 1;
}
-int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc)
+/* check if idx is a valid index to access PMU */
+int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- bool fixed = pmc & (1u << 30);
- pmc &= ~(3u << 30);
- return (!fixed && pmc >= pmu->nr_arch_gp_counters) ||
- (fixed && pmc >= pmu->nr_arch_fixed_counters);
+ return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx);
}
-int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
+int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- bool fast_mode = pmc & (1u << 31);
- bool fixed = pmc & (1u << 30);
- struct kvm_pmc *counters;
- u64 ctr;
-
- pmc &= ~(3u << 30);
- if (!fixed && pmc >= pmu->nr_arch_gp_counters)
- return 1;
- if (fixed && pmc >= pmu->nr_arch_fixed_counters)
+ bool fast_mode = idx & (1u << 31);
+ struct kvm_pmc *pmc;
+ u64 ctr_val;
+
+ pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx);
+ if (!pmc)
return 1;
- counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
- ctr = read_pmc(&counters[pmc]);
+
+ ctr_val = pmc_read_counter(pmc);
if (fast_mode)
- ctr = (u32)ctr;
- *data = ctr;
+ ctr_val = (u32)ctr_val;
+ *data = ctr_val;
return 0;
}
-void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
+void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- struct kvm_cpuid_entry2 *entry;
- union cpuid10_eax eax;
- union cpuid10_edx edx;
-
- pmu->nr_arch_gp_counters = 0;
- pmu->nr_arch_fixed_counters = 0;
- pmu->counter_bitmask[KVM_PMC_GP] = 0;
- pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
- pmu->version = 0;
- pmu->reserved_bits = 0xffffffff00200000ull;
-
- entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
- if (!entry)
- return;
- eax.full = entry->eax;
- edx.full = entry->edx;
-
- pmu->version = eax.split.version_id;
- if (!pmu->version)
- return;
-
- pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
- INTEL_PMC_MAX_GENERIC);
- pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
- pmu->available_event_types = ~entry->ebx &
- ((1ull << eax.split.mask_length) - 1);
-
- if (pmu->version == 1) {
- pmu->nr_arch_fixed_counters = 0;
- } else {
- pmu->nr_arch_fixed_counters =
- min_t(int, edx.split.num_counters_fixed,
- INTEL_PMC_MAX_FIXED);
- pmu->counter_bitmask[KVM_PMC_FIXED] =
- ((u64)1 << edx.split.bit_width_fixed) - 1;
- }
+ if (vcpu->arch.apic)
+ kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
+}
- pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
- (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
- pmu->global_ctrl_mask = ~pmu->global_ctrl;
+bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr);
+}
- entry = kvm_find_cpuid_entry(vcpu, 7, 0);
- if (entry &&
- (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
- (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
- pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ return kvm_x86_ops->pmu_ops->get_msr(vcpu, msr, data);
}
-void kvm_pmu_init(struct kvm_vcpu *vcpu)
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
- int i;
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info);
+}
- memset(pmu, 0, sizeof(*pmu));
- for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
- pmu->gp_counters[i].type = KVM_PMC_GP;
- pmu->gp_counters[i].vcpu = vcpu;
- pmu->gp_counters[i].idx = i;
- }
- for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
- pmu->fixed_counters[i].type = KVM_PMC_FIXED;
- pmu->fixed_counters[i].vcpu = vcpu;
- pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
- }
- init_irq_work(&pmu->irq_work, trigger_pmi);
- kvm_pmu_cpuid_update(vcpu);
+/* refresh PMU settings. This function generally is called when underlying
+ * settings are changed (such as changes of PMU CPUID by guest VMs), which
+ * should rarely happen.
+ */
+void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->pmu_ops->refresh(vcpu);
}
void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- int i;
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
irq_work_sync(&pmu->irq_work);
- for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
- struct kvm_pmc *pmc = &pmu->gp_counters[i];
- stop_counter(pmc);
- pmc->counter = pmc->eventsel = 0;
- }
+ kvm_x86_ops->pmu_ops->reset(vcpu);
+}
- for (i = 0; i < INTEL_PMC_MAX_FIXED; i++)
- stop_counter(&pmu->fixed_counters[i]);
+void kvm_pmu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
- pmu->global_ovf_ctrl = 0;
+ memset(pmu, 0, sizeof(*pmu));
+ kvm_x86_ops->pmu_ops->init(vcpu);
+ init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
+ kvm_pmu_refresh(vcpu);
}
void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
kvm_pmu_reset(vcpu);
}
-
-void kvm_handle_pmu_event(struct kvm_vcpu *vcpu)
-{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- u64 bitmask;
- int bit;
-
- bitmask = pmu->reprogram_pmi;
-
- for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
- struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit);
-
- if (unlikely(!pmc || !pmc->perf_event)) {
- clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
- continue;
- }
-
- reprogram_idx(pmu, bit);
- }
-}
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
new file mode 100644
index 000000000000..f96e1f962587
--- /dev/null
+++ b/arch/x86/kvm/pmu.h
@@ -0,0 +1,118 @@
+#ifndef __KVM_X86_PMU_H
+#define __KVM_X86_PMU_H
+
+#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
+#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
+#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
+
+/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
+#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
+
+struct kvm_event_hw_type_mapping {
+ u8 eventsel;
+ u8 unit_mask;
+ unsigned event_type;
+};
+
+struct kvm_pmu_ops {
+ unsigned (*find_arch_event)(struct kvm_pmu *pmu, u8 event_select,
+ u8 unit_mask);
+ unsigned (*find_fixed_event)(int idx);
+ bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
+ struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
+ struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx);
+ int (*is_valid_msr_idx)(struct kvm_vcpu *vcpu, unsigned idx);
+ bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
+ int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+ int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+ void (*refresh)(struct kvm_vcpu *vcpu);
+ void (*init)(struct kvm_vcpu *vcpu);
+ void (*reset)(struct kvm_vcpu *vcpu);
+};
+
+static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ return pmu->counter_bitmask[pmc->type];
+}
+
+static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
+{
+ u64 counter, enabled, running;
+
+ counter = pmc->counter;
+ if (pmc->perf_event)
+ counter += perf_event_read_value(pmc->perf_event,
+ &enabled, &running);
+ /* FIXME: Scaling needed? */
+ return counter & pmc_bitmask(pmc);
+}
+
+static inline void pmc_stop_counter(struct kvm_pmc *pmc)
+{
+ if (pmc->perf_event) {
+ pmc->counter = pmc_read_counter(pmc);
+ perf_event_release_kernel(pmc->perf_event);
+ pmc->perf_event = NULL;
+ }
+}
+
+static inline bool pmc_is_gp(struct kvm_pmc *pmc)
+{
+ return pmc->type == KVM_PMC_GP;
+}
+
+static inline bool pmc_is_fixed(struct kvm_pmc *pmc)
+{
+ return pmc->type == KVM_PMC_FIXED;
+}
+
+static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
+{
+ return kvm_x86_ops->pmu_ops->pmc_is_enabled(pmc);
+}
+
+/* returns general purpose PMC with the specified MSR. Note that it can be
+ * used for both PERFCTRn and EVNTSELn; that is why it accepts base as a
+ * paramenter to tell them apart.
+ */
+static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
+ u32 base)
+{
+ if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
+ return &pmu->gp_counters[msr - base];
+
+ return NULL;
+}
+
+/* returns fixed PMC with the specified MSR */
+static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
+{
+ int base = MSR_CORE_PERF_FIXED_CTR0;
+
+ if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
+ return &pmu->fixed_counters[msr - base];
+
+ return NULL;
+}
+
+void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel);
+void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx);
+void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
+
+void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
+void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
+int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
+int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx);
+bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+void kvm_pmu_refresh(struct kvm_vcpu *vcpu);
+void kvm_pmu_reset(struct kvm_vcpu *vcpu);
+void kvm_pmu_init(struct kvm_vcpu *vcpu);
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
+
+extern struct kvm_pmu_ops intel_pmu_ops;
+extern struct kvm_pmu_ops amd_pmu_ops;
+#endif /* __KVM_X86_PMU_H */
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
new file mode 100644
index 000000000000..886aa25a7131
--- /dev/null
+++ b/arch/x86/kvm/pmu_amd.c
@@ -0,0 +1,207 @@
+/*
+ * KVM PMU support for AMD
+ *
+ * Copyright 2015, Red Hat, Inc. and/or its affiliates.
+ *
+ * Author:
+ * Wei Huang <wei@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Implementation is based on pmu_intel.c file
+ */
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+
+/* duplicated from amd_perfmon_event_map, K7 and above should work. */
+static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
+ [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+ [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+ [2] = { 0x80, 0x00, PERF_COUNT_HW_CACHE_REFERENCES },
+ [3] = { 0x81, 0x00, PERF_COUNT_HW_CACHE_MISSES },
+ [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+ [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+ [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
+ [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
+};
+
+static unsigned amd_find_arch_event(struct kvm_pmu *pmu,
+ u8 event_select,
+ u8 unit_mask)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
+ if (amd_event_mapping[i].eventsel == event_select
+ && amd_event_mapping[i].unit_mask == unit_mask)
+ break;
+
+ if (i == ARRAY_SIZE(amd_event_mapping))
+ return PERF_COUNT_HW_MAX;
+
+ return amd_event_mapping[i].event_type;
+}
+
+/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */
+static unsigned amd_find_fixed_event(int idx)
+{
+ return PERF_COUNT_HW_MAX;
+}
+
+/* check if a PMC is enabled by comparing it against global_ctrl bits. Because
+ * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE).
+ */
+static bool amd_pmc_is_enabled(struct kvm_pmc *pmc)
+{
+ return true;
+}
+
+static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+ return get_gp_pmc(pmu, MSR_K7_EVNTSEL0 + pmc_idx, MSR_K7_EVNTSEL0);
+}
+
+/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
+static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ idx &= ~(3u << 30);
+
+ return (idx >= pmu->nr_arch_gp_counters);
+}
+
+/* idx is the ECX register of RDPMC instruction */
+static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *counters;
+
+ idx &= ~(3u << 30);
+ if (idx >= pmu->nr_arch_gp_counters)
+ return NULL;
+ counters = pmu->gp_counters;
+
+ return &counters[idx];
+}
+
+static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int ret = false;
+
+ ret = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0) ||
+ get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
+
+ return ret;
+}
+
+static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ /* MSR_K7_PERFCTRn */
+ pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0);
+ if (pmc) {
+ *data = pmc_read_counter(pmc);
+ return 0;
+ }
+ /* MSR_K7_EVNTSELn */
+ pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
+ if (pmc) {
+ *data = pmc->eventsel;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ /* MSR_K7_PERFCTRn */
+ pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0);
+ if (pmc) {
+ if (!msr_info->host_initiated)
+ data = (s64)data;
+ pmc->counter += data - pmc_read_counter(pmc);
+ return 0;
+ }
+ /* MSR_K7_EVNTSELn */
+ pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
+ if (pmc) {
+ if (data == pmc->eventsel)
+ return 0;
+ if (!(data & pmu->reserved_bits)) {
+ reprogram_gp_counter(pmc, data);
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
+ pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
+ pmu->reserved_bits = 0xffffffff00200000ull;
+ /* not applicable to AMD; but clean them to prevent any fall out */
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->nr_arch_fixed_counters = 0;
+ pmu->version = 0;
+ pmu->global_status = 0;
+}
+
+static void amd_pmu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int i;
+
+ for (i = 0; i < AMD64_NUM_COUNTERS ; i++) {
+ pmu->gp_counters[i].type = KVM_PMC_GP;
+ pmu->gp_counters[i].vcpu = vcpu;
+ pmu->gp_counters[i].idx = i;
+ }
+}
+
+static void amd_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int i;
+
+ for (i = 0; i < AMD64_NUM_COUNTERS; i++) {
+ struct kvm_pmc *pmc = &pmu->gp_counters[i];
+
+ pmc_stop_counter(pmc);
+ pmc->counter = pmc->eventsel = 0;
+ }
+}
+
+struct kvm_pmu_ops amd_pmu_ops = {
+ .find_arch_event = amd_find_arch_event,
+ .find_fixed_event = amd_find_fixed_event,
+ .pmc_is_enabled = amd_pmc_is_enabled,
+ .pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
+ .msr_idx_to_pmc = amd_msr_idx_to_pmc,
+ .is_valid_msr_idx = amd_is_valid_msr_idx,
+ .is_valid_msr = amd_is_valid_msr,
+ .get_msr = amd_pmu_get_msr,
+ .set_msr = amd_pmu_set_msr,
+ .refresh = amd_pmu_refresh,
+ .init = amd_pmu_init,
+ .reset = amd_pmu_reset,
+};
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
new file mode 100644
index 000000000000..ab38af4f4947
--- /dev/null
+++ b/arch/x86/kvm/pmu_intel.c
@@ -0,0 +1,358 @@
+/*
+ * KVM PMU support for Intel CPUs
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@redhat.com>
+ * Gleb Natapov <gleb@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include <asm/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+
+static struct kvm_event_hw_type_mapping intel_arch_events[] = {
+ /* Index must match CPUID 0x0A.EBX bit vector */
+ [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+ [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+ [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
+ [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
+ [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
+ [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+ [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+ [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
+};
+
+/* mapping between fixed pmc index and intel_arch_events array */
+static int fixed_pmc_events[] = {1, 0, 7};
+
+static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
+{
+ int i;
+
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+ u8 new_ctrl = fixed_ctrl_field(data, i);
+ u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i);
+ struct kvm_pmc *pmc;
+
+ pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
+
+ if (old_ctrl == new_ctrl)
+ continue;
+
+ reprogram_fixed_counter(pmc, new_ctrl, i);
+ }
+
+ pmu->fixed_ctr_ctrl = data;
+}
+
+/* function is called when global control register has been updated. */
+static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
+{
+ int bit;
+ u64 diff = pmu->global_ctrl ^ data;
+
+ pmu->global_ctrl = data;
+
+ for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
+ reprogram_counter(pmu, bit);
+}
+
+static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
+ u8 event_select,
+ u8 unit_mask)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++)
+ if (intel_arch_events[i].eventsel == event_select
+ && intel_arch_events[i].unit_mask == unit_mask
+ && (pmu->available_event_types & (1 << i)))
+ break;
+
+ if (i == ARRAY_SIZE(intel_arch_events))
+ return PERF_COUNT_HW_MAX;
+
+ return intel_arch_events[i].event_type;
+}
+
+static unsigned intel_find_fixed_event(int idx)
+{
+ if (idx >= ARRAY_SIZE(fixed_pmc_events))
+ return PERF_COUNT_HW_MAX;
+
+ return intel_arch_events[fixed_pmc_events[idx]].event_type;
+}
+
+/* check if a PMC is enabled by comparising it with globl_ctrl bits. */
+static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
+}
+
+static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+ if (pmc_idx < INTEL_PMC_IDX_FIXED)
+ return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx,
+ MSR_P6_EVNTSEL0);
+ else {
+ u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
+
+ return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
+ }
+}
+
+/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
+static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ bool fixed = idx & (1u << 30);
+
+ idx &= ~(3u << 30);
+
+ return (!fixed && idx >= pmu->nr_arch_gp_counters) ||
+ (fixed && idx >= pmu->nr_arch_fixed_counters);
+}
+
+static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu,
+ unsigned idx)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ bool fixed = idx & (1u << 30);
+ struct kvm_pmc *counters;
+
+ idx &= ~(3u << 30);
+ if (!fixed && idx >= pmu->nr_arch_gp_counters)
+ return NULL;
+ if (fixed && idx >= pmu->nr_arch_fixed_counters)
+ return NULL;
+ counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
+
+ return &counters[idx];
+}
+
+static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int ret;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ ret = pmu->version > 1;
+ break;
+ default:
+ ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
+ get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
+ get_fixed_pmc(pmu, msr);
+ break;
+ }
+
+ return ret;
+}
+
+static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ *data = pmu->fixed_ctr_ctrl;
+ return 0;
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ *data = pmu->global_status;
+ return 0;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ *data = pmu->global_ctrl;
+ return 0;
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ *data = pmu->global_ovf_ctrl;
+ return 0;
+ default:
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_fixed_pmc(pmu, msr))) {
+ *data = pmc_read_counter(pmc);
+ return 0;
+ } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
+ *data = pmc->eventsel;
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ if (pmu->fixed_ctr_ctrl == data)
+ return 0;
+ if (!(data & 0xfffffffffffff444ull)) {
+ reprogram_fixed_counters(pmu, data);
+ return 0;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ if (msr_info->host_initiated) {
+ pmu->global_status = data;
+ return 0;
+ }
+ break; /* RO MSR */
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (pmu->global_ctrl == data)
+ return 0;
+ if (!(data & pmu->global_ctrl_mask)) {
+ global_ctrl_changed(pmu, data);
+ return 0;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
+ if (!msr_info->host_initiated)
+ pmu->global_status &= ~data;
+ pmu->global_ovf_ctrl = data;
+ return 0;
+ }
+ break;
+ default:
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_fixed_pmc(pmu, msr))) {
+ if (!msr_info->host_initiated)
+ data = (s64)(s32)data;
+ pmc->counter += data - pmc_read_counter(pmc);
+ return 0;
+ } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
+ if (data == pmc->eventsel)
+ return 0;
+ if (!(data & pmu->reserved_bits)) {
+ reprogram_gp_counter(pmc, data);
+ return 0;
+ }
+ }
+ }
+
+ return 1;
+}
+
+static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_cpuid_entry2 *entry;
+ union cpuid10_eax eax;
+ union cpuid10_edx edx;
+
+ pmu->nr_arch_gp_counters = 0;
+ pmu->nr_arch_fixed_counters = 0;
+ pmu->counter_bitmask[KVM_PMC_GP] = 0;
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->version = 0;
+ pmu->reserved_bits = 0xffffffff00200000ull;
+
+ entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
+ if (!entry)
+ return;
+ eax.full = entry->eax;
+ edx.full = entry->edx;
+
+ pmu->version = eax.split.version_id;
+ if (!pmu->version)
+ return;
+
+ pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
+ INTEL_PMC_MAX_GENERIC);
+ pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
+ pmu->available_event_types = ~entry->ebx &
+ ((1ull << eax.split.mask_length) - 1);
+
+ if (pmu->version == 1) {
+ pmu->nr_arch_fixed_counters = 0;
+ } else {
+ pmu->nr_arch_fixed_counters =
+ min_t(int, edx.split.num_counters_fixed,
+ INTEL_PMC_MAX_FIXED);
+ pmu->counter_bitmask[KVM_PMC_FIXED] =
+ ((u64)1 << edx.split.bit_width_fixed) - 1;
+ }
+
+ pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
+ (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
+ pmu->global_ctrl_mask = ~pmu->global_ctrl;
+
+ entry = kvm_find_cpuid_entry(vcpu, 7, 0);
+ if (entry &&
+ (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
+ (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
+ pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
+}
+
+static void intel_pmu_init(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
+ pmu->gp_counters[i].type = KVM_PMC_GP;
+ pmu->gp_counters[i].vcpu = vcpu;
+ pmu->gp_counters[i].idx = i;
+ }
+
+ for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
+ pmu->fixed_counters[i].type = KVM_PMC_FIXED;
+ pmu->fixed_counters[i].vcpu = vcpu;
+ pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
+ }
+}
+
+static void intel_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int i;
+
+ for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
+ struct kvm_pmc *pmc = &pmu->gp_counters[i];
+
+ pmc_stop_counter(pmc);
+ pmc->counter = pmc->eventsel = 0;
+ }
+
+ for (i = 0; i < INTEL_PMC_MAX_FIXED; i++)
+ pmc_stop_counter(&pmu->fixed_counters[i]);
+
+ pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
+ pmu->global_ovf_ctrl = 0;
+}
+
+struct kvm_pmu_ops intel_pmu_ops = {
+ .find_arch_event = intel_find_arch_event,
+ .find_fixed_event = intel_find_fixed_event,
+ .pmc_is_enabled = intel_pmc_is_enabled,
+ .pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
+ .msr_idx_to_pmc = intel_msr_idx_to_pmc,
+ .is_valid_msr_idx = intel_is_valid_msr_idx,
+ .is_valid_msr = intel_is_valid_msr,
+ .get_msr = intel_pmu_get_msr,
+ .set_msr = intel_pmu_set_msr,
+ .refresh = intel_pmu_refresh,
+ .init = intel_pmu_init,
+ .reset = intel_pmu_reset,
+};
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9afa233b5482..8e0c0844c6b9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -21,6 +21,7 @@
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"
+#include "pmu.h"
#include <linux/module.h>
#include <linux/mod_devicetable.h>
@@ -28,7 +29,7 @@
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/sched.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/slab.h>
#include <asm/perf_event.h>
@@ -511,8 +512,10 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (svm->vmcb->control.next_rip != 0)
+ if (svm->vmcb->control.next_rip != 0) {
+ WARN_ON(!static_cpu_has(X86_FEATURE_NRIPS));
svm->next_rip = svm->vmcb->control.next_rip;
+ }
if (!svm->next_rip) {
if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
@@ -862,6 +865,64 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}
+#define MTRR_TYPE_UC_MINUS 7
+#define MTRR2PROTVAL_INVALID 0xff
+
+static u8 mtrr2protval[8];
+
+static u8 fallback_mtrr_type(int mtrr)
+{
+ /*
+ * WT and WP aren't always available in the host PAT. Treat
+ * them as UC and UC- respectively. Everything else should be
+ * there.
+ */
+ switch (mtrr)
+ {
+ case MTRR_TYPE_WRTHROUGH:
+ return MTRR_TYPE_UNCACHABLE;
+ case MTRR_TYPE_WRPROT:
+ return MTRR_TYPE_UC_MINUS;
+ default:
+ BUG();
+ }
+}
+
+static void build_mtrr2protval(void)
+{
+ int i;
+ u64 pat;
+
+ for (i = 0; i < 8; i++)
+ mtrr2protval[i] = MTRR2PROTVAL_INVALID;
+
+ /* Ignore the invalid MTRR types. */
+ mtrr2protval[2] = 0;
+ mtrr2protval[3] = 0;
+
+ /*
+ * Use host PAT value to figure out the mapping from guest MTRR
+ * values to nested page table PAT/PCD/PWT values. We do not
+ * want to change the host PAT value every time we enter the
+ * guest.
+ */
+ rdmsrl(MSR_IA32_CR_PAT, pat);
+ for (i = 0; i < 8; i++) {
+ u8 mtrr = pat >> (8 * i);
+
+ if (mtrr2protval[mtrr] == MTRR2PROTVAL_INVALID)
+ mtrr2protval[mtrr] = __cm_idx2pte(i);
+ }
+
+ for (i = 0; i < 8; i++) {
+ if (mtrr2protval[i] == MTRR2PROTVAL_INVALID) {
+ u8 fallback = fallback_mtrr_type(i);
+ mtrr2protval[i] = mtrr2protval[fallback];
+ BUG_ON(mtrr2protval[i] == MTRR2PROTVAL_INVALID);
+ }
+ }
+}
+
static __init int svm_hardware_setup(void)
{
int cpu;
@@ -928,6 +989,7 @@ static __init int svm_hardware_setup(void)
} else
kvm_disable_tdp();
+ build_mtrr2protval();
return 0;
err:
@@ -1082,7 +1144,40 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
return target_tsc - tsc;
}
-static void init_vmcb(struct vcpu_svm *svm)
+static void svm_set_guest_pat(struct vcpu_svm *svm, u64 *g_pat)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ /* Unlike Intel, AMD takes the guest's CR0.CD into account.
+ *
+ * AMD does not have IPAT. To emulate it for the case of guests
+ * with no assigned devices, just set everything to WB. If guests
+ * have assigned devices, however, we cannot force WB for RAM
+ * pages only, so use the guest PAT directly.
+ */
+ if (!kvm_arch_has_assigned_device(vcpu->kvm))
+ *g_pat = 0x0606060606060606;
+ else
+ *g_pat = vcpu->arch.pat;
+}
+
+static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+{
+ u8 mtrr;
+
+ /*
+ * 1. MMIO: trust guest MTRR, so same as item 3.
+ * 2. No passthrough: always map as WB, and force guest PAT to WB as well
+ * 3. Passthrough: can't guarantee the result, try to trust guest.
+ */
+ if (!is_mmio && !kvm_arch_has_assigned_device(vcpu->kvm))
+ return 0;
+
+ mtrr = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
+ return mtrr2protval[mtrr];
+}
+
+static void init_vmcb(struct vcpu_svm *svm, bool init_event)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
@@ -1153,17 +1248,17 @@ static void init_vmcb(struct vcpu_svm *svm)
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
- svm_set_efer(&svm->vcpu, 0);
+ if (!init_event)
+ svm_set_efer(&svm->vcpu, 0);
save->dr6 = 0xffff0ff0;
kvm_set_rflags(&svm->vcpu, 2);
save->rip = 0x0000fff0;
svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
/*
- * This is the guest-visible cr0 value.
* svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
+ * It also updates the guest-visible cr0 value.
*/
- svm->vcpu.arch.cr0 = 0;
(void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
save->cr4 = X86_CR4_PAE;
@@ -1176,7 +1271,8 @@ static void init_vmcb(struct vcpu_svm *svm)
clr_exception_intercept(svm, PF_VECTOR);
clr_cr_intercept(svm, INTERCEPT_CR3_READ);
clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
- save->g_pat = 0x0007040600070406ULL;
+ save->g_pat = svm->vcpu.arch.pat;
+ svm_set_guest_pat(svm, &save->g_pat);
save->cr3 = 0;
save->cr4 = 0;
}
@@ -1195,13 +1291,19 @@ static void init_vmcb(struct vcpu_svm *svm)
enable_gif(svm);
}
-static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
+static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_svm *svm = to_svm(vcpu);
u32 dummy;
u32 eax = 1;
- init_vmcb(svm);
+ if (!init_event) {
+ svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
+ svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ }
+ init_vmcb(svm, init_event);
kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
@@ -1257,12 +1359,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
clear_page(svm->vmcb);
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
- init_vmcb(svm);
-
- svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
- svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ init_vmcb(svm, false);
svm_init_osvw(&svm->vcpu);
@@ -1575,7 +1672,8 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
* does not do it - this results in some delay at
* reboot
*/
- cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
mark_dirty(svm->vmcb, VMCB_CR);
update_cr0_intercept(svm);
@@ -1883,7 +1981,7 @@ static int shutdown_interception(struct vcpu_svm *svm)
* so reinitialize it.
*/
clear_page(svm->vmcb);
- init_vmcb(svm);
+ init_vmcb(svm, false);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
@@ -1953,8 +2051,8 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
u64 pdpte;
int ret;
- ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte,
- offset_in_page(cr3) + index * 8, 8);
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
+ offset_in_page(cr3) + index * 8, 8);
if (ret)
return 0;
return pdpte;
@@ -2112,7 +2210,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
might_sleep();
- page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
+ page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
if (is_error_page(page))
goto error;
@@ -2151,7 +2249,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
mask = (0xf >> (4 - size)) << start_bit;
val = 0;
- if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, iopm_len))
+ if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
return NESTED_EXIT_DONE;
return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
@@ -2176,7 +2274,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
/* Offset is in 32 bit units but need in 8 bit units */
offset *= 4;
- if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
+ if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
return NESTED_EXIT_DONE;
return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
@@ -2447,7 +2545,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
p = msrpm_offsets[i];
offset = svm->nested.vmcb_msrpm + (p * 4);
- if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
+ if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
return false;
svm->nested.msrpm[p] = svm->msrpm[p] | value;
@@ -3067,42 +3165,42 @@ static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
svm_scale_tsc(vcpu, host_tsc);
}
-static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
+static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_svm *svm = to_svm(vcpu);
- switch (ecx) {
+ switch (msr_info->index) {
case MSR_IA32_TSC: {
- *data = svm->vmcb->control.tsc_offset +
+ msr_info->data = svm->vmcb->control.tsc_offset +
svm_scale_tsc(vcpu, native_read_tsc());
break;
}
case MSR_STAR:
- *data = svm->vmcb->save.star;
+ msr_info->data = svm->vmcb->save.star;
break;
#ifdef CONFIG_X86_64
case MSR_LSTAR:
- *data = svm->vmcb->save.lstar;
+ msr_info->data = svm->vmcb->save.lstar;
break;
case MSR_CSTAR:
- *data = svm->vmcb->save.cstar;
+ msr_info->data = svm->vmcb->save.cstar;
break;
case MSR_KERNEL_GS_BASE:
- *data = svm->vmcb->save.kernel_gs_base;
+ msr_info->data = svm->vmcb->save.kernel_gs_base;
break;
case MSR_SYSCALL_MASK:
- *data = svm->vmcb->save.sfmask;
+ msr_info->data = svm->vmcb->save.sfmask;
break;
#endif
case MSR_IA32_SYSENTER_CS:
- *data = svm->vmcb->save.sysenter_cs;
+ msr_info->data = svm->vmcb->save.sysenter_cs;
break;
case MSR_IA32_SYSENTER_EIP:
- *data = svm->sysenter_eip;
+ msr_info->data = svm->sysenter_eip;
break;
case MSR_IA32_SYSENTER_ESP:
- *data = svm->sysenter_esp;
+ msr_info->data = svm->sysenter_esp;
break;
/*
* Nobody will change the following 5 values in the VMCB so we can
@@ -3110,31 +3208,31 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
* implemented.
*/
case MSR_IA32_DEBUGCTLMSR:
- *data = svm->vmcb->save.dbgctl;
+ msr_info->data = svm->vmcb->save.dbgctl;
break;
case MSR_IA32_LASTBRANCHFROMIP:
- *data = svm->vmcb->save.br_from;
+ msr_info->data = svm->vmcb->save.br_from;
break;
case MSR_IA32_LASTBRANCHTOIP:
- *data = svm->vmcb->save.br_to;
+ msr_info->data = svm->vmcb->save.br_to;
break;
case MSR_IA32_LASTINTFROMIP:
- *data = svm->vmcb->save.last_excp_from;
+ msr_info->data = svm->vmcb->save.last_excp_from;
break;
case MSR_IA32_LASTINTTOIP:
- *data = svm->vmcb->save.last_excp_to;
+ msr_info->data = svm->vmcb->save.last_excp_to;
break;
case MSR_VM_HSAVE_PA:
- *data = svm->nested.hsave_msr;
+ msr_info->data = svm->nested.hsave_msr;
break;
case MSR_VM_CR:
- *data = svm->nested.vm_cr_msr;
+ msr_info->data = svm->nested.vm_cr_msr;
break;
case MSR_IA32_UCODE_REV:
- *data = 0x01000065;
+ msr_info->data = 0x01000065;
break;
default:
- return kvm_get_msr_common(vcpu, ecx, data);
+ return kvm_get_msr_common(vcpu, msr_info);
}
return 0;
}
@@ -3142,16 +3240,20 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
static int rdmsr_interception(struct vcpu_svm *svm)
{
u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
- u64 data;
+ struct msr_data msr_info;
- if (svm_get_msr(&svm->vcpu, ecx, &data)) {
+ msr_info.index = ecx;
+ msr_info.host_initiated = false;
+ if (svm_get_msr(&svm->vcpu, &msr_info)) {
trace_kvm_msr_read_ex(ecx);
kvm_inject_gp(&svm->vcpu, 0);
} else {
- trace_kvm_msr_read(ecx, data);
+ trace_kvm_msr_read(ecx, msr_info.data);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, data & 0xffffffff);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, data >> 32);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RAX,
+ msr_info.data & 0xffffffff);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
+ msr_info.data >> 32);
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
skip_emulated_instruction(&svm->vcpu);
}
@@ -3245,6 +3347,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_VM_IGNNE:
vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
+ case MSR_IA32_CR_PAT:
+ if (npt_enabled) {
+ if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+ return 1;
+ vcpu->arch.pat = data;
+ svm_set_guest_pat(svm, &svm->vmcb->save.g_pat);
+ mark_dirty(svm->vmcb, VMCB_NPT);
+ break;
+ }
+ /* fall through */
default:
return kvm_set_msr_common(vcpu, msr);
}
@@ -3388,6 +3500,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_MWAIT] = mwait_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
[SVM_EXIT_NPF] = pf_interception,
+ [SVM_EXIT_RSM] = emulate_on_interception,
};
static void dump_vmcb(struct kvm_vcpu *vcpu)
@@ -4073,9 +4186,9 @@ static bool svm_cpu_has_accelerated_tpr(void)
return false;
}
-static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+static bool svm_has_high_real_mode_segbase(void)
{
- return 0;
+ return true;
}
static void svm_cpuid_update(struct kvm_vcpu *vcpu)
@@ -4317,7 +4430,9 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
break;
}
- vmcb->control.next_rip = info->next_rip;
+ /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
+ if (static_cpu_has(X86_FEATURE_NRIPS))
+ vmcb->control.next_rip = info->next_rip;
vmcb->control.exit_code = icpt_info.exit_code;
vmexit = nested_svm_exit_handled(svm);
@@ -4346,6 +4461,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
+ .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase,
.vcpu_create = svm_create_vcpu,
.vcpu_free = svm_free_vcpu,
@@ -4440,6 +4556,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.handle_external_intr = svm_handle_external_intr,
.sched_in = svm_sched_in,
+
+ .pmu_ops = &amd_pmu_ops,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 7c7bc8bef21f..4eae7c35ddf5 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -952,6 +952,28 @@ TRACE_EVENT(kvm_wait_lapic_expire,
__entry->delta < 0 ? "early" : "late")
);
+TRACE_EVENT(kvm_enter_smm,
+ TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
+ TP_ARGS(vcpu_id, smbase, entering),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( u64, smbase )
+ __field( bool, entering )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->smbase = smbase;
+ __entry->entering = entering;
+ ),
+
+ TP_printk("vcpu %u: %s SMM, smbase 0x%llx",
+ __entry->vcpu_id,
+ __entry->entering ? "entering" : "leaving",
+ __entry->smbase)
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e11dd59398f1..83b7b5cd75d5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -28,7 +28,7 @@
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/hrtimer.h>
@@ -47,6 +47,7 @@
#include <asm/apic.h>
#include "trace.h"
+#include "pmu.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x)
#define __ex_clear(x, reg) \
@@ -785,7 +786,7 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
{
- struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
+ struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT);
if (is_error_page(page))
return NULL;
@@ -2169,8 +2170,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
if (is_guest_mode(vcpu))
msr_bitmap = vmx_msr_bitmap_nested;
- else if (irqchip_in_kernel(vcpu->kvm) &&
- apic_x2apic_mode(vcpu->arch.apic)) {
+ else if (vcpu->arch.apic_base & X2APIC_ENABLE) {
if (is_long_mode(vcpu))
msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
else
@@ -2622,76 +2622,69 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
- u64 data;
struct shared_msr_entry *msr;
- if (!pdata) {
- printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
- return -EINVAL;
- }
-
- switch (msr_index) {
+ switch (msr_info->index) {
#ifdef CONFIG_X86_64
case MSR_FS_BASE:
- data = vmcs_readl(GUEST_FS_BASE);
+ msr_info->data = vmcs_readl(GUEST_FS_BASE);
break;
case MSR_GS_BASE:
- data = vmcs_readl(GUEST_GS_BASE);
+ msr_info->data = vmcs_readl(GUEST_GS_BASE);
break;
case MSR_KERNEL_GS_BASE:
vmx_load_host_state(to_vmx(vcpu));
- data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
+ msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
break;
#endif
case MSR_EFER:
- return kvm_get_msr_common(vcpu, msr_index, pdata);
+ return kvm_get_msr_common(vcpu, msr_info);
case MSR_IA32_TSC:
- data = guest_read_tsc();
+ msr_info->data = guest_read_tsc();
break;
case MSR_IA32_SYSENTER_CS:
- data = vmcs_read32(GUEST_SYSENTER_CS);
+ msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
case MSR_IA32_SYSENTER_EIP:
- data = vmcs_readl(GUEST_SYSENTER_EIP);
+ msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
break;
case MSR_IA32_SYSENTER_ESP:
- data = vmcs_readl(GUEST_SYSENTER_ESP);
+ msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
case MSR_IA32_BNDCFGS:
if (!vmx_mpx_supported())
return 1;
- data = vmcs_read64(GUEST_BNDCFGS);
+ msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
case MSR_IA32_FEATURE_CONTROL:
if (!nested_vmx_allowed(vcpu))
return 1;
- data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+ msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
return 1;
- return vmx_get_vmx_msr(vcpu, msr_index, pdata);
+ return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data);
case MSR_IA32_XSS:
if (!vmx_xsaves_supported())
return 1;
- data = vcpu->arch.ia32_xss;
+ msr_info->data = vcpu->arch.ia32_xss;
break;
case MSR_TSC_AUX:
if (!to_vmx(vcpu)->rdtscp_enabled)
return 1;
/* Otherwise falls through */
default:
- msr = find_msr_entry(to_vmx(vcpu), msr_index);
+ msr = find_msr_entry(to_vmx(vcpu), msr_info->index);
if (msr) {
- data = msr->data;
+ msr_info->data = msr->data;
break;
}
- return kvm_get_msr_common(vcpu, msr_index, pdata);
+ return kvm_get_msr_common(vcpu, msr_info);
}
- *pdata = data;
return 0;
}
@@ -4122,7 +4115,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
kvm_userspace_mem.flags = 0;
kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE;
kvm_userspace_mem.memory_size = PAGE_SIZE;
- r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
+ r = __x86_set_memory_region(kvm, &kvm_userspace_mem);
if (r)
goto out;
@@ -4157,7 +4150,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
kvm_userspace_mem.guest_phys_addr =
kvm->arch.ept_identity_map_addr;
kvm_userspace_mem.memory_size = PAGE_SIZE;
- r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
+ r = __x86_set_memory_region(kvm, &kvm_userspace_mem);
return r;
}
@@ -4666,16 +4659,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
- if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
- u32 msr_low, msr_high;
- u64 host_pat;
- rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
- host_pat = msr_low | ((u64) msr_high << 32);
- /* Write the default value follow host pat */
- vmcs_write64(GUEST_IA32_PAT, host_pat);
- /* Keep arch.pat sync with GUEST_IA32_PAT */
- vmx->vcpu.arch.pat = host_pat;
- }
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+ vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
u32 index = vmx_msr_index[i];
@@ -4707,22 +4692,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
return 0;
}
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct msr_data apic_base_msr;
+ u64 cr0;
vmx->rmode.vm86_active = 0;
vmx->soft_vnmi_blocked = 0;
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
- kvm_set_cr8(&vmx->vcpu, 0);
- apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_reset_bsp(&vmx->vcpu))
- apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
- apic_base_msr.host_initiated = true;
- kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
+ kvm_set_cr8(vcpu, 0);
+
+ if (!init_event) {
+ apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(vcpu))
+ apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
+ apic_base_msr.host_initiated = true;
+ kvm_set_apic_base(vcpu, &apic_base_msr);
+ }
vmx_segment_cache_clear(vmx);
@@ -4746,9 +4736,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
- vmcs_write32(GUEST_SYSENTER_CS, 0);
- vmcs_writel(GUEST_SYSENTER_ESP, 0);
- vmcs_writel(GUEST_SYSENTER_EIP, 0);
+ if (!init_event) {
+ vmcs_write32(GUEST_SYSENTER_CS, 0);
+ vmcs_writel(GUEST_SYSENTER_ESP, 0);
+ vmcs_writel(GUEST_SYSENTER_EIP, 0);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+ }
vmcs_writel(GUEST_RFLAGS, 0x02);
kvm_rip_write(vcpu, 0xfff0);
@@ -4763,18 +4756,15 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
- /* Special registers */
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
-
setup_msrs(vmx);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
- if (cpu_has_vmx_tpr_shadow()) {
+ if (cpu_has_vmx_tpr_shadow() && !init_event) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (vm_need_tpr_shadow(vmx->vcpu.kvm))
+ if (vm_need_tpr_shadow(vcpu->kvm))
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
- __pa(vmx->vcpu.arch.apic->regs));
+ __pa(vcpu->arch.apic->regs));
vmcs_write32(TPR_THRESHOLD, 0);
}
@@ -4786,12 +4776,14 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
if (vmx->vpid != 0)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
- vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
- vmx_set_cr4(&vmx->vcpu, 0);
- vmx_set_efer(&vmx->vcpu, 0);
- vmx_fpu_activate(&vmx->vcpu);
- update_exception_bitmap(&vmx->vcpu);
+ cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
+ vmx_set_cr0(vcpu, cr0); /* enter rmode */
+ vmx->vcpu.arch.cr0 = cr0;
+ vmx_set_cr4(vcpu, 0);
+ if (!init_event)
+ vmx_set_efer(vcpu, 0);
+ vmx_fpu_activate(vcpu);
+ update_exception_bitmap(vcpu);
vpid_sync_context(vmx);
}
@@ -4964,7 +4956,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
.flags = 0,
};
- ret = kvm_set_memory_region(kvm, &tss_mem);
+ ret = x86_set_memory_region(kvm, &tss_mem);
if (ret)
return ret;
kvm->arch.tss_addr = addr;
@@ -5474,19 +5466,21 @@ static int handle_cpuid(struct kvm_vcpu *vcpu)
static int handle_rdmsr(struct kvm_vcpu *vcpu)
{
u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
- u64 data;
+ struct msr_data msr_info;
- if (vmx_get_msr(vcpu, ecx, &data)) {
+ msr_info.index = ecx;
+ msr_info.host_initiated = false;
+ if (vmx_get_msr(vcpu, &msr_info)) {
trace_kvm_msr_read_ex(ecx);
kvm_inject_gp(vcpu, 0);
return 1;
}
- trace_kvm_msr_read(ecx, data);
+ trace_kvm_msr_read(ecx, msr_info.data);
/* FIXME: handling of bits 32:63 of rax, rdx */
- vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
- vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
+ vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
+ vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
skip_emulated_instruction(vcpu);
return 1;
}
@@ -5709,9 +5703,6 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
return 0;
}
- /* clear all local breakpoint enable flags */
- vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155);
-
/*
* TODO: What about debug traps on tss switch?
* Are we supposed to inject them and update dr6?
@@ -7332,7 +7323,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
bitmap += (port & 0x7fff) / 8;
if (last_bitmap != bitmap)
- if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
+ if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
return true;
if (b & (1 << (port & 7)))
return true;
@@ -7376,7 +7367,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
/* Then read the msr_index'th bit from this bitmap: */
if (msr_index < 1024*8) {
unsigned char b;
- if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
+ if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
return true;
return 1 & (b >> (msr_index & 7));
} else
@@ -7641,9 +7632,9 @@ static void vmx_disable_pml(struct vcpu_vmx *vmx)
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
-static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
+static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
{
- struct kvm *kvm = vmx->vcpu.kvm;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 *pml_buf;
u16 pml_idx;
@@ -7665,7 +7656,7 @@ static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
gpa = pml_buf[pml_idx];
WARN_ON(gpa & (PAGE_SIZE - 1));
- mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
}
/* reset PML index */
@@ -7690,6 +7681,158 @@ static void kvm_flush_pml_buffers(struct kvm *kvm)
kvm_vcpu_kick(vcpu);
}
+static void vmx_dump_sel(char *name, uint32_t sel)
+{
+ pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
+ name, vmcs_read32(sel),
+ vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
+ vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
+ vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
+}
+
+static void vmx_dump_dtsel(char *name, uint32_t limit)
+{
+ pr_err("%s limit=0x%08x, base=0x%016lx\n",
+ name, vmcs_read32(limit),
+ vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
+}
+
+static void dump_vmcs(void)
+{
+ u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
+ u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
+ u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+ u32 secondary_exec_control = 0;
+ unsigned long cr4 = vmcs_readl(GUEST_CR4);
+ u64 efer = vmcs_readl(GUEST_IA32_EFER);
+ int i, n;
+
+ if (cpu_has_secondary_exec_ctrls())
+ secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+ pr_err("*** Guest State ***\n");
+ pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+ vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
+ vmcs_readl(CR0_GUEST_HOST_MASK));
+ pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+ cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
+ pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
+ if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
+ (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
+ {
+ pr_err("PDPTR0 = 0x%016lx PDPTR1 = 0x%016lx\n",
+ vmcs_readl(GUEST_PDPTR0), vmcs_readl(GUEST_PDPTR1));
+ pr_err("PDPTR2 = 0x%016lx PDPTR3 = 0x%016lx\n",
+ vmcs_readl(GUEST_PDPTR2), vmcs_readl(GUEST_PDPTR3));
+ }
+ pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
+ vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
+ pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
+ vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
+ pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+ vmcs_readl(GUEST_SYSENTER_ESP),
+ vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
+ vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
+ vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
+ vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
+ vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
+ vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
+ vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
+ vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
+ vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
+ vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
+ vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
+ if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
+ (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
+ pr_err("EFER = 0x%016llx PAT = 0x%016lx\n",
+ efer, vmcs_readl(GUEST_IA32_PAT));
+ pr_err("DebugCtl = 0x%016lx DebugExceptions = 0x%016lx\n",
+ vmcs_readl(GUEST_IA32_DEBUGCTL),
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
+ if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+ pr_err("PerfGlobCtl = 0x%016lx\n",
+ vmcs_readl(GUEST_IA32_PERF_GLOBAL_CTRL));
+ if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
+ pr_err("BndCfgS = 0x%016lx\n", vmcs_readl(GUEST_BNDCFGS));
+ pr_err("Interruptibility = %08x ActivityState = %08x\n",
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
+ vmcs_read32(GUEST_ACTIVITY_STATE));
+ if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+ pr_err("InterruptStatus = %04x\n",
+ vmcs_read16(GUEST_INTR_STATUS));
+
+ pr_err("*** Host State ***\n");
+ pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
+ vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
+ pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
+ vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
+ vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
+ vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
+ vmcs_read16(HOST_TR_SELECTOR));
+ pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
+ vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
+ vmcs_readl(HOST_TR_BASE));
+ pr_err("GDTBase=%016lx IDTBase=%016lx\n",
+ vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
+ pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
+ vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
+ vmcs_readl(HOST_CR4));
+ pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+ vmcs_readl(HOST_IA32_SYSENTER_ESP),
+ vmcs_read32(HOST_IA32_SYSENTER_CS),
+ vmcs_readl(HOST_IA32_SYSENTER_EIP));
+ if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
+ pr_err("EFER = 0x%016lx PAT = 0x%016lx\n",
+ vmcs_readl(HOST_IA32_EFER), vmcs_readl(HOST_IA32_PAT));
+ if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+ pr_err("PerfGlobCtl = 0x%016lx\n",
+ vmcs_readl(HOST_IA32_PERF_GLOBAL_CTRL));
+
+ pr_err("*** Control State ***\n");
+ pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
+ pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
+ pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
+ pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
+ vmcs_read32(EXCEPTION_BITMAP),
+ vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
+ vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
+ pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
+ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+ vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
+ vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
+ pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
+ vmcs_read32(VM_EXIT_INTR_INFO),
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+ pr_err(" reason=%08x qualification=%016lx\n",
+ vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
+ pr_err("IDTVectoring: info=%08x errcode=%08x\n",
+ vmcs_read32(IDT_VECTORING_INFO_FIELD),
+ vmcs_read32(IDT_VECTORING_ERROR_CODE));
+ pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET));
+ if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
+ pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
+ if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
+ pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
+ if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
+ pr_err("EPT pointer = 0x%016lx\n", vmcs_readl(EPT_POINTER));
+ n = vmcs_read32(CR3_TARGET_COUNT);
+ for (i = 0; i + 1 < n; i += 4)
+ pr_err("CR3 target%u=%016lx target%u=%016lx\n",
+ i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
+ i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
+ if (i < n)
+ pr_err("CR3 target%u=%016lx\n",
+ i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
+ if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
+ pr_err("PLE Gap=%08x Window=%08x\n",
+ vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
+ if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
+ pr_err("Virtual processor ID = 0x%04x\n",
+ vmcs_read16(VIRTUAL_PROCESSOR_ID));
+}
+
/*
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
@@ -7708,7 +7851,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
* flushed already.
*/
if (enable_pml)
- vmx_flush_pml_buffer(vmx);
+ vmx_flush_pml_buffer(vcpu);
/* If guest state is invalid, start emulating */
if (vmx->emulation_required)
@@ -7722,6 +7865,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
}
if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+ dump_vmcs();
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= exit_reason;
@@ -7995,6 +8139,11 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
local_irq_enable();
}
+static bool vmx_has_high_real_mode_segbase(void)
+{
+ return enable_unrestricted_guest || emulate_invalid_guest_state;
+}
+
static bool vmx_mpx_supported(void)
{
return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
@@ -8479,29 +8628,39 @@ static int get_ept_level(void)
static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
- u64 ret;
+ u8 cache;
+ u64 ipat = 0;
/* For VT-d and EPT combination
- * 1. MMIO: always map as UC
+ * 1. MMIO: guest may want to apply WC, trust it.
* 2. EPT with VT-d:
* a. VT-d without snooping control feature: can't guarantee the
- * result, try to trust guest.
+ * result, try to trust guest. So the same as item 1.
* b. VT-d with snooping control feature: snooping control feature of
* VT-d engine can guarantee the cache correctness. Just set it
* to WB to keep consistent with host. So the same as item 3.
* 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
* consistent with host MTRR
*/
- if (is_mmio)
- ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
- else if (kvm_arch_has_noncoherent_dma(vcpu->kvm))
- ret = kvm_get_guest_memory_type(vcpu, gfn) <<
- VMX_EPT_MT_EPTE_SHIFT;
- else
- ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
- | VMX_EPT_IPAT_BIT;
+ if (!is_mmio && !kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
+ ipat = VMX_EPT_IPAT_BIT;
+ cache = MTRR_TYPE_WRBACK;
+ goto exit;
+ }
- return ret;
+ if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
+ ipat = VMX_EPT_IPAT_BIT;
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ cache = MTRR_TYPE_WRBACK;
+ else
+ cache = MTRR_TYPE_UNCACHABLE;
+ goto exit;
+ }
+
+ cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
+
+exit:
+ return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
}
static int vmx_get_lpage_level(void)
@@ -8923,7 +9082,7 @@ static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
struct vmx_msr_entry *e)
{
/* x2APIC MSR accesses are not allowed */
- if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8)
+ if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
return -EINVAL;
if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
e->index == MSR_IA32_UCODE_REV)
@@ -8965,8 +9124,8 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
msr.host_initiated = false;
for (i = 0; i < count; i++) {
- if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e),
- &e, sizeof(e))) {
+ if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
+ &e, sizeof(e))) {
pr_warn_ratelimited(
"%s cannot read MSR entry (%u, 0x%08llx)\n",
__func__, i, gpa + i * sizeof(e));
@@ -8998,9 +9157,10 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
struct vmx_msr_entry e;
for (i = 0; i < count; i++) {
- if (kvm_read_guest(vcpu->kvm,
- gpa + i * sizeof(e),
- &e, 2 * sizeof(u32))) {
+ struct msr_data msr_info;
+ if (kvm_vcpu_read_guest(vcpu,
+ gpa + i * sizeof(e),
+ &e, 2 * sizeof(u32))) {
pr_warn_ratelimited(
"%s cannot read MSR entry (%u, 0x%08llx)\n",
__func__, i, gpa + i * sizeof(e));
@@ -9012,19 +9172,21 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
__func__, i, e.index, e.reserved);
return -EINVAL;
}
- if (kvm_get_msr(vcpu, e.index, &e.value)) {
+ msr_info.host_initiated = false;
+ msr_info.index = e.index;
+ if (kvm_get_msr(vcpu, &msr_info)) {
pr_warn_ratelimited(
"%s cannot read MSR (%u, 0x%x)\n",
__func__, i, e.index);
return -EINVAL;
}
- if (kvm_write_guest(vcpu->kvm,
- gpa + i * sizeof(e) +
- offsetof(struct vmx_msr_entry, value),
- &e.value, sizeof(e.value))) {
+ if (kvm_vcpu_write_guest(vcpu,
+ gpa + i * sizeof(e) +
+ offsetof(struct vmx_msr_entry, value),
+ &msr_info.data, sizeof(msr_info.data))) {
pr_warn_ratelimited(
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
- __func__, i, e.index, e.value);
+ __func__, i, e.index, msr_info.data);
return -EINVAL;
}
}
@@ -10149,6 +10311,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
.cpu_has_accelerated_tpr = report_flexpriority,
+ .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase,
.vcpu_create = vmx_create_vcpu,
.vcpu_free = vmx_free_vcpu,
@@ -10254,6 +10417,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
.flush_log_dirty = vmx_flush_log_dirty,
.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+
+ .pmu_ops = &intel_pmu_ops,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 26eaeb522cab..5ef2560075bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -28,6 +28,7 @@
#include "x86.h"
#include "cpuid.h"
#include "assigned-dev.h"
+#include "pmu.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -57,7 +58,6 @@
#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/desc.h>
-#include <asm/mtrr.h>
#include <asm/mce.h>
#include <linux/kernel_stat.h>
#include <asm/fpu/internal.h> /* Ugh! */
@@ -98,6 +98,9 @@ module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
unsigned int min_timer_period_us = 500;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+static bool __read_mostly kvmclock_periodic_sync = true;
+module_param(kvmclock_periodic_sync, bool, S_IRUGO);
+
bool kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32 kvm_max_guest_tsc_khz;
@@ -474,7 +477,7 @@ EXPORT_SYMBOL_GPL(kvm_require_dr);
/*
* This function will be used to read from the physical memory of the currently
- * running guest. The difference to kvm_read_guest_page is that this function
+ * running guest. The difference to kvm_vcpu_read_guest_page is that this function
* can read from guest physical or from the guest's guest physical memory.
*/
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
@@ -492,7 +495,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
real_gfn = gpa_to_gfn(real_gfn);
- return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
+ return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
@@ -571,8 +574,7 @@ out:
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
- unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
- X86_CR0_CD | X86_CR0_NW;
+ unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
cr0 |= X86_CR0_ET;
@@ -618,6 +620,10 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if ((cr0 ^ old_cr0) & update_bits)
kvm_mmu_reset_context(vcpu);
+
+ if ((cr0 ^ old_cr0) & X86_CR0_CD)
+ kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
+
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);
@@ -907,7 +913,7 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu)
u64 data;
int err;
- err = kvm_pmu_read_pmc(vcpu, ecx, &data);
+ err = kvm_pmu_rdpmc(vcpu, ecx, &data);
if (err)
return err;
kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
@@ -922,17 +928,11 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
*
* This list is modified at module load time to reflect the
* capabilities of the host cpu. This capabilities test skips MSRs that are
- * kvm-specific. Those are put in the beginning of the list.
+ * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
+ * may depend on host virtualization features rather than host cpu features.
*/
-#define KVM_SAVE_MSRS_BEGIN 12
static u32 msrs_to_save[] = {
- MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
- MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
- HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
- HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
- HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
- MSR_KVM_PV_EOI_EN,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
@@ -944,14 +944,24 @@ static u32 msrs_to_save[] = {
static unsigned num_msrs_to_save;
-static const u32 emulated_msrs[] = {
+static u32 emulated_msrs[] = {
+ MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
+ HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+ HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
+ HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+ MSR_KVM_PV_EOI_EN,
+
MSR_IA32_TSC_ADJUST,
MSR_IA32_TSCDEADLINE,
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
MSR_IA32_MCG_CTL,
+ MSR_IA32_SMBASE,
};
+static unsigned num_emulated_msrs;
+
bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
{
if (efer & efer_reserved_bits)
@@ -1045,6 +1055,21 @@ EXPORT_SYMBOL_GPL(kvm_set_msr);
/*
* Adapt set_msr() to msr_io()'s calling convention
*/
+static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ struct msr_data msr;
+ int r;
+
+ msr.index = index;
+ msr.host_initiated = true;
+ r = kvm_get_msr(vcpu, &msr);
+ if (r)
+ return r;
+
+ *data = msr.data;
+ return 0;
+}
+
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
struct msr_data msr;
@@ -1697,6 +1722,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->pvclock_set_guest_stopped_request = false;
}
+ pvclock_flags |= PVCLOCK_COUNTS_FROM_ZERO;
+
/* If the host uses TSC clocksource, then it is stable */
if (use_master_clock)
pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
@@ -1767,127 +1794,14 @@ static void kvmclock_sync_fn(struct work_struct *work)
kvmclock_sync_work);
struct kvm *kvm = container_of(ka, struct kvm, arch);
+ if (!kvmclock_periodic_sync)
+ return;
+
schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
KVMCLOCK_SYNC_PERIOD);
}
-static bool msr_mtrr_valid(unsigned msr)
-{
- switch (msr) {
- case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
- case MSR_MTRRfix64K_00000:
- case MSR_MTRRfix16K_80000:
- case MSR_MTRRfix16K_A0000:
- case MSR_MTRRfix4K_C0000:
- case MSR_MTRRfix4K_C8000:
- case MSR_MTRRfix4K_D0000:
- case MSR_MTRRfix4K_D8000:
- case MSR_MTRRfix4K_E0000:
- case MSR_MTRRfix4K_E8000:
- case MSR_MTRRfix4K_F0000:
- case MSR_MTRRfix4K_F8000:
- case MSR_MTRRdefType:
- case MSR_IA32_CR_PAT:
- return true;
- case 0x2f8:
- return true;
- }
- return false;
-}
-
-static bool valid_pat_type(unsigned t)
-{
- return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
-}
-
-static bool valid_mtrr_type(unsigned t)
-{
- return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
-}
-
-bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- int i;
- u64 mask;
-
- if (!msr_mtrr_valid(msr))
- return false;
-
- if (msr == MSR_IA32_CR_PAT) {
- for (i = 0; i < 8; i++)
- if (!valid_pat_type((data >> (i * 8)) & 0xff))
- return false;
- return true;
- } else if (msr == MSR_MTRRdefType) {
- if (data & ~0xcff)
- return false;
- return valid_mtrr_type(data & 0xff);
- } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
- for (i = 0; i < 8 ; i++)
- if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
- return false;
- return true;
- }
-
- /* variable MTRRs */
- WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
-
- mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
- if ((msr & 1) == 0) {
- /* MTRR base */
- if (!valid_mtrr_type(data & 0xff))
- return false;
- mask |= 0xf00;
- } else
- /* MTRR mask */
- mask |= 0x7ff;
- if (data & mask) {
- kvm_inject_gp(vcpu, 0);
- return false;
- }
-
- return true;
-}
-EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
-
-static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
-
- if (!kvm_mtrr_valid(vcpu, msr, data))
- return 1;
-
- if (msr == MSR_MTRRdefType) {
- vcpu->arch.mtrr_state.def_type = data;
- vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
- } else if (msr == MSR_MTRRfix64K_00000)
- p[0] = data;
- else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
- p[1 + msr - MSR_MTRRfix16K_80000] = data;
- else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
- p[3 + msr - MSR_MTRRfix4K_C0000] = data;
- else if (msr == MSR_IA32_CR_PAT)
- vcpu->arch.pat = data;
- else { /* Variable MTRRs */
- int idx, is_mtrr_mask;
- u64 *pt;
-
- idx = (msr - 0x200) / 2;
- is_mtrr_mask = msr - 0x200 - 2 * idx;
- if (!is_mtrr_mask)
- pt =
- (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
- else
- pt =
- (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
- *pt = data;
- }
-
- kvm_mmu_reset_context(vcpu);
- return 0;
-}
-
static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
u64 mcg_cap = vcpu->arch.mcg_cap;
@@ -1946,7 +1860,7 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
r = PTR_ERR(page);
goto out;
}
- if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
+ if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
goto out_free;
r = 0;
out_free:
@@ -2046,13 +1960,13 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
break;
}
gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
- addr = gfn_to_hva(vcpu->kvm, gfn);
+ addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
if (kvm_is_error_hva(addr))
return 1;
if (__clear_user((void __user *)addr, PAGE_SIZE))
return 1;
vcpu->arch.hv_vapic = data;
- mark_page_dirty(vcpu->kvm, gfn);
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED))
return 1;
break;
@@ -2179,7 +2093,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
__func__, data);
break;
case 0x200 ... 0x2ff:
- return set_msr_mtrr(vcpu, msr, data);
+ return kvm_mtrr_set_msr(vcpu, msr, data);
case MSR_IA32_APICBASE:
return kvm_set_apic_base(vcpu, msr_info);
case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
@@ -2199,6 +2113,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
+ case MSR_IA32_SMBASE:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.smbase = data;
+ break;
case MSR_KVM_WALL_CLOCK_NEW:
case MSR_KVM_WALL_CLOCK:
vcpu->kvm->arch.wall_clock = data;
@@ -2219,6 +2138,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
&vcpu->requests);
ka->boot_vcpu_runs_old_kvmclock = tmp;
+
+ ka->kvmclock_offset = -get_kernel_ns();
}
vcpu->arch.time = data;
@@ -2280,37 +2201,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
return set_msr_mce(vcpu, msr, data);
- /* Performance counters are not protected by a CPUID bit,
- * so we should check all of them in the generic path for the sake of
- * cross vendor migration.
- * Writing a zero into the event select MSRs disables them,
- * which we perfectly emulate ;-). Any other value should be at least
- * reported, some guests depend on them.
- */
- case MSR_K7_EVNTSEL0:
- case MSR_K7_EVNTSEL1:
- case MSR_K7_EVNTSEL2:
- case MSR_K7_EVNTSEL3:
- if (data != 0)
- vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
- "0x%x data 0x%llx\n", msr, data);
- break;
- /* at least RHEL 4 unconditionally writes to the perfctr registers,
- * so we ignore writes to make it happy.
- */
- case MSR_K7_PERFCTR0:
- case MSR_K7_PERFCTR1:
- case MSR_K7_PERFCTR2:
- case MSR_K7_PERFCTR3:
- vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
- "0x%x data 0x%llx\n", msr, data);
- break;
- case MSR_P6_PERFCTR0:
- case MSR_P6_PERFCTR1:
- pr = true;
- case MSR_P6_EVNTSEL0:
- case MSR_P6_EVNTSEL1:
- if (kvm_pmu_msr(vcpu, msr))
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+ pr = true; /* fall through */
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+ if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
if (pr || data != 0)
@@ -2356,7 +2252,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
default:
if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
return xen_hvm_config(vcpu, data);
- if (kvm_pmu_msr(vcpu, msr))
+ if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
if (!ignore_msrs) {
vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
@@ -2378,48 +2274,12 @@ EXPORT_SYMBOL_GPL(kvm_set_msr_common);
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
- return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
+ return kvm_x86_ops->get_msr(vcpu, msr);
}
EXPORT_SYMBOL_GPL(kvm_get_msr);
-static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
-{
- u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
-
- if (!msr_mtrr_valid(msr))
- return 1;
-
- if (msr == MSR_MTRRdefType)
- *pdata = vcpu->arch.mtrr_state.def_type +
- (vcpu->arch.mtrr_state.enabled << 10);
- else if (msr == MSR_MTRRfix64K_00000)
- *pdata = p[0];
- else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
- *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
- else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
- *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
- else if (msr == MSR_IA32_CR_PAT)
- *pdata = vcpu->arch.pat;
- else { /* Variable MTRRs */
- int idx, is_mtrr_mask;
- u64 *pt;
-
- idx = (msr - 0x200) / 2;
- is_mtrr_mask = msr - 0x200 - 2 * idx;
- if (!is_mtrr_mask)
- pt =
- (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
- else
- pt =
- (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
- *pdata = *pt;
- }
-
- return 0;
-}
-
static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
u64 data;
@@ -2517,11 +2377,9 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return 0;
}
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
- u64 data;
-
- switch (msr) {
+ switch (msr_info->index) {
case MSR_IA32_PLATFORM_ID:
case MSR_IA32_EBL_CR_POWERON:
case MSR_IA32_DEBUGCTLMSR:
@@ -2532,38 +2390,28 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_K8_SYSCFG:
case MSR_K7_HWCR:
case MSR_VM_HSAVE_PA:
- case MSR_K7_EVNTSEL0:
- case MSR_K7_EVNTSEL1:
- case MSR_K7_EVNTSEL2:
- case MSR_K7_EVNTSEL3:
- case MSR_K7_PERFCTR0:
- case MSR_K7_PERFCTR1:
- case MSR_K7_PERFCTR2:
- case MSR_K7_PERFCTR3:
case MSR_K8_INT_PENDING_MSG:
case MSR_AMD64_NB_CFG:
case MSR_FAM10H_MMIO_CONF_BASE:
case MSR_AMD64_BU_CFG2:
- data = 0;
+ msr_info->data = 0;
break;
- case MSR_P6_PERFCTR0:
- case MSR_P6_PERFCTR1:
- case MSR_P6_EVNTSEL0:
- case MSR_P6_EVNTSEL1:
- if (kvm_pmu_msr(vcpu, msr))
- return kvm_pmu_get_msr(vcpu, msr, pdata);
- data = 0;
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+ case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
+ msr_info->data = 0;
break;
case MSR_IA32_UCODE_REV:
- data = 0x100000000ULL;
+ msr_info->data = 0x100000000ULL;
break;
case MSR_MTRRcap:
- data = 0x500 | KVM_NR_VAR_MTRR;
- break;
case 0x200 ... 0x2ff:
- return get_msr_mtrr(vcpu, msr, pdata);
+ return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
case 0xcd: /* fsb frequency */
- data = 3;
+ msr_info->data = 3;
break;
/*
* MSR_EBC_FREQUENCY_ID
@@ -2577,48 +2425,53 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
* multiplying by zero otherwise.
*/
case MSR_EBC_FREQUENCY_ID:
- data = 1 << 24;
+ msr_info->data = 1 << 24;
break;
case MSR_IA32_APICBASE:
- data = kvm_get_apic_base(vcpu);
+ msr_info->data = kvm_get_apic_base(vcpu);
break;
case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
- return kvm_x2apic_msr_read(vcpu, msr, pdata);
+ return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
break;
case MSR_IA32_TSCDEADLINE:
- data = kvm_get_lapic_tscdeadline_msr(vcpu);
+ msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
break;
case MSR_IA32_TSC_ADJUST:
- data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
+ msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
break;
case MSR_IA32_MISC_ENABLE:
- data = vcpu->arch.ia32_misc_enable_msr;
+ msr_info->data = vcpu->arch.ia32_misc_enable_msr;
+ break;
+ case MSR_IA32_SMBASE:
+ if (!msr_info->host_initiated)
+ return 1;
+ msr_info->data = vcpu->arch.smbase;
break;
case MSR_IA32_PERF_STATUS:
/* TSC increment by tick */
- data = 1000ULL;
+ msr_info->data = 1000ULL;
/* CPU multiplier */
- data |= (((uint64_t)4ULL) << 40);
+ msr_info->data |= (((uint64_t)4ULL) << 40);
break;
case MSR_EFER:
- data = vcpu->arch.efer;
+ msr_info->data = vcpu->arch.efer;
break;
case MSR_KVM_WALL_CLOCK:
case MSR_KVM_WALL_CLOCK_NEW:
- data = vcpu->kvm->arch.wall_clock;
+ msr_info->data = vcpu->kvm->arch.wall_clock;
break;
case MSR_KVM_SYSTEM_TIME:
case MSR_KVM_SYSTEM_TIME_NEW:
- data = vcpu->arch.time;
+ msr_info->data = vcpu->arch.time;
break;
case MSR_KVM_ASYNC_PF_EN:
- data = vcpu->arch.apf.msr_val;
+ msr_info->data = vcpu->arch.apf.msr_val;
break;
case MSR_KVM_STEAL_TIME:
- data = vcpu->arch.st.msr_val;
+ msr_info->data = vcpu->arch.st.msr_val;
break;
case MSR_KVM_PV_EOI_EN:
- data = vcpu->arch.pv_eoi.msr_val;
+ msr_info->data = vcpu->arch.pv_eoi.msr_val;
break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
@@ -2626,7 +2479,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
- return get_msr_mce(vcpu, msr, pdata);
+ return get_msr_mce(vcpu, msr_info->index, &msr_info->data);
case MSR_K7_CLK_CTL:
/*
* Provide expected ramp-up count for K7. All other
@@ -2637,17 +2490,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
* type 6, model 8 and higher from exploding due to
* the rdmsr failing.
*/
- data = 0x20000000;
+ msr_info->data = 0x20000000;
break;
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
- if (kvm_hv_msr_partition_wide(msr)) {
+ if (kvm_hv_msr_partition_wide(msr_info->index)) {
int r;
mutex_lock(&vcpu->kvm->lock);
- r = get_msr_hyperv_pw(vcpu, msr, pdata);
+ r = get_msr_hyperv_pw(vcpu, msr_info->index, &msr_info->data);
mutex_unlock(&vcpu->kvm->lock);
return r;
} else
- return get_msr_hyperv(vcpu, msr, pdata);
+ return get_msr_hyperv(vcpu, msr_info->index, &msr_info->data);
break;
case MSR_IA32_BBL_CR_CTL3:
/* This legacy MSR exists but isn't fully documented in current
@@ -2660,31 +2513,30 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
* L2 cache control register 3: 64GB range, 256KB size,
* enabled, latency 0x1, configured
*/
- data = 0xbe702111;
+ msr_info->data = 0xbe702111;
break;
case MSR_AMD64_OSVW_ID_LENGTH:
if (!guest_cpuid_has_osvw(vcpu))
return 1;
- data = vcpu->arch.osvw.length;
+ msr_info->data = vcpu->arch.osvw.length;
break;
case MSR_AMD64_OSVW_STATUS:
if (!guest_cpuid_has_osvw(vcpu))
return 1;
- data = vcpu->arch.osvw.status;
+ msr_info->data = vcpu->arch.osvw.status;
break;
default:
- if (kvm_pmu_msr(vcpu, msr))
- return kvm_pmu_get_msr(vcpu, msr, pdata);
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
if (!ignore_msrs) {
- vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+ vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index);
return 1;
} else {
- vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
- data = 0;
+ vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index);
+ msr_info->data = 0;
}
break;
}
- *pdata = data;
return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_msr_common);
@@ -2797,12 +2649,25 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV_TIME:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
case KVM_CAP_TSC_DEADLINE_TIMER:
+ case KVM_CAP_ENABLE_CAP_VM:
+ case KVM_CAP_DISABLE_QUIRKS:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
#endif
r = 1;
break;
+ case KVM_CAP_X86_SMM:
+ /* SMBASE is usually relocated above 1M on modern chipsets,
+ * and SMM handlers might indeed rely on 4G segment limits,
+ * so do not report SMM to be available if real mode is
+ * emulated via vm86 mode. Still, do not go to great lengths
+ * to avoid userspace's usage of the feature, because it is a
+ * fringe case that is not enabled except via specific settings
+ * of the module parameters.
+ */
+ r = kvm_x86_ops->cpu_has_high_real_mode_segbase();
+ break;
case KVM_CAP_COALESCED_MMIO:
r = KVM_COALESCED_MMIO_PAGE_OFFSET;
break;
@@ -2859,7 +2724,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
goto out;
n = msr_list.nmsrs;
- msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
+ msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
goto out;
r = -E2BIG;
@@ -2871,7 +2736,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
goto out;
if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
&emulated_msrs,
- ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
+ num_emulated_msrs * sizeof(u32)))
goto out;
r = 0;
break;
@@ -3015,6 +2880,13 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
return 0;
}
+static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
+{
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+
+ return 0;
+}
+
static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
struct kvm_tpr_access_ctl *tac)
{
@@ -3120,8 +2992,15 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->sipi_vector = 0; /* never valid when reporting to user space */
+ events->smi.smm = is_smm(vcpu);
+ events->smi.pending = vcpu->arch.smi_pending;
+ events->smi.smm_inside_nmi =
+ !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
+ events->smi.latched_init = kvm_lapic_latched_init(vcpu);
+
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
- | KVM_VCPUEVENT_VALID_SHADOW);
+ | KVM_VCPUEVENT_VALID_SHADOW
+ | KVM_VCPUEVENT_VALID_SMM);
memset(&events->reserved, 0, sizeof(events->reserved));
}
@@ -3130,7 +3009,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
{
if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
| KVM_VCPUEVENT_VALID_SIPI_VECTOR
- | KVM_VCPUEVENT_VALID_SHADOW))
+ | KVM_VCPUEVENT_VALID_SHADOW
+ | KVM_VCPUEVENT_VALID_SMM))
return -EINVAL;
process_nmi(vcpu);
@@ -3155,6 +3035,24 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
kvm_vcpu_has_lapic(vcpu))
vcpu->arch.apic->sipi_vector = events->sipi_vector;
+ if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+ if (events->smi.smm)
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ else
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+ vcpu->arch.smi_pending = events->smi.pending;
+ if (events->smi.smm_inside_nmi)
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
+ if (kvm_vcpu_has_lapic(vcpu)) {
+ if (events->smi.latched_init)
+ set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ else
+ clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ }
+ }
+
kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
@@ -3259,8 +3157,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
cpuid_count(XSTATE_CPUID, index,
&size, &offset, &ecx, &edx);
memcpy(dest, src + offset, size);
- } else
- WARN_ON_ONCE(1);
+ }
valid -= feature;
}
@@ -3414,6 +3311,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_nmi(vcpu);
break;
}
+ case KVM_SMI: {
+ r = kvm_vcpu_ioctl_smi(vcpu);
+ break;
+ }
case KVM_SET_CPUID: {
struct kvm_cpuid __user *cpuid_arg = argp;
struct kvm_cpuid cpuid;
@@ -3453,7 +3354,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_GET_MSRS:
- r = msr_io(vcpu, argp, kvm_get_msr, 1);
+ r = msr_io(vcpu, argp, do_get_msr, 1);
break;
case KVM_SET_MSRS:
r = msr_io(vcpu, argp, do_set_msr, 0);
@@ -3844,6 +3745,26 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
return 0;
}
+static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+ struct kvm_enable_cap *cap)
+{
+ int r;
+
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_DISABLE_QUIRKS:
+ kvm->arch.disabled_quirks = cap->args[0];
+ r = 0;
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -4096,7 +4017,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_ENABLE_CAP: {
+ struct kvm_enable_cap cap;
+ r = -EFAULT;
+ if (copy_from_user(&cap, argp, sizeof(cap)))
+ goto out;
+ r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+ break;
+ }
default:
r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
}
@@ -4109,8 +4038,7 @@ static void kvm_init_msr_list(void)
u32 dummy[2];
unsigned i, j;
- /* skip the first msrs in the list. KVM-specific */
- for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
+ for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
continue;
@@ -4135,6 +4063,22 @@ static void kvm_init_msr_list(void)
j++;
}
num_msrs_to_save = j;
+
+ for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
+ switch (emulated_msrs[i]) {
+ case MSR_IA32_SMBASE:
+ if (!kvm_x86_ops->cpu_has_high_real_mode_segbase())
+ continue;
+ break;
+ default:
+ break;
+ }
+
+ if (j < i)
+ emulated_msrs[j] = emulated_msrs[i];
+ j++;
+ }
+ num_emulated_msrs = j;
}
static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
@@ -4252,8 +4196,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
- ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data,
- offset, toread);
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
+ offset, toread);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
goto out;
@@ -4286,8 +4230,8 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
offset = addr & (PAGE_SIZE-1);
if (WARN_ON(offset + bytes > PAGE_SIZE))
bytes = (unsigned)PAGE_SIZE - offset;
- ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val,
- offset, bytes);
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
+ offset, bytes);
if (unlikely(ret < 0))
return X86EMUL_IO_NEEDED;
@@ -4333,7 +4277,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
- ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
+ ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
goto out;
@@ -4386,7 +4330,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
{
int ret;
- ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
+ ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
if (ret < 0)
return 0;
kvm_mmu_pte_write(vcpu, gpa, val, bytes);
@@ -4420,7 +4364,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
void *val, int bytes)
{
- return !kvm_read_guest(vcpu->kvm, gpa, val, bytes);
+ return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
}
static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -4618,7 +4562,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
goto emul_write;
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
if (is_error_page(page))
goto emul_write;
@@ -4646,7 +4590,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
if (!exchanged)
return X86EMUL_CMPXCHG_FAILED;
- mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
kvm_mmu_pte_write(vcpu, gpa, new, bytes);
return X86EMUL_CONTINUE;
@@ -4945,7 +4889,17 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 *pdata)
{
- return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
+ struct msr_data msr;
+ int r;
+
+ msr.index = msr_index;
+ msr.host_initiated = false;
+ r = kvm_get_msr(emul_to_vcpu(ctxt), &msr);
+ if (r)
+ return r;
+
+ *pdata = msr.data;
+ return 0;
}
static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
@@ -4959,16 +4913,30 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
}
+static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+ return vcpu->arch.smbase;
+}
+
+static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+ vcpu->arch.smbase = smbase;
+}
+
static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc)
{
- return kvm_pmu_check_pmc(emul_to_vcpu(ctxt), pmc);
+ return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
}
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc, u64 *pdata)
{
- return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata);
+ return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
}
static void emulator_halt(struct x86_emulate_ctxt *ctxt)
@@ -5044,6 +5012,8 @@ static const struct x86_emulate_ops emulate_ops = {
.cpl = emulator_get_cpl,
.get_dr = emulator_get_dr,
.set_dr = emulator_set_dr,
+ .get_smbase = emulator_get_smbase,
+ .set_smbase = emulator_set_smbase,
.set_msr = emulator_set_msr,
.get_msr = emulator_get_msr,
.check_pmc = emulator_check_pmc,
@@ -5105,7 +5075,10 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
(cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
cs_db ? X86EMUL_MODE_PROT32 :
X86EMUL_MODE_PROT16;
- ctxt->guest_mode = is_guest_mode(vcpu);
+ BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
+ BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
+ BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
+ ctxt->emul_flags = vcpu->arch.hflags;
init_decode_cache(ctxt);
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
@@ -5274,6 +5247,34 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
static int complete_emulated_pio(struct kvm_vcpu *vcpu);
+static void kvm_smm_changed(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
+ /* This is a good place to trace that we are exiting SMM. */
+ trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
+
+ if (unlikely(vcpu->arch.smi_pending)) {
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+ vcpu->arch.smi_pending = 0;
+ } else {
+ /* Process a latched INIT, if any. */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+ }
+
+ kvm_mmu_reset_context(vcpu);
+}
+
+static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
+{
+ unsigned changed = vcpu->arch.hflags ^ emul_flags;
+
+ vcpu->arch.hflags = emul_flags;
+
+ if (changed & HF_SMM_MASK)
+ kvm_smm_changed(vcpu);
+}
+
static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
unsigned long *db)
{
@@ -5473,6 +5474,8 @@ restart:
unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+ if (vcpu->arch.hflags != ctxt->emul_flags)
+ kvm_set_hflags(vcpu, ctxt->emul_flags);
kvm_rip_write(vcpu, ctxt->eip);
if (r == EMULATE_DONE)
kvm_vcpu_check_singlestep(vcpu, rflags, &r);
@@ -5951,6 +5954,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
lapic_irq.shorthand = 0;
lapic_irq.dest_mode = 0;
lapic_irq.dest_id = apicid;
+ lapic_irq.msi_redir_hint = false;
lapic_irq.delivery_mode = APIC_DM_REMRD;
kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
@@ -6038,6 +6042,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+ kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
if (irqchip_in_kernel(vcpu->kvm))
@@ -6161,6 +6166,233 @@ static void process_nmi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
+#define put_smstate(type, buf, offset, val) \
+ *(type *)((buf) + (offset) - 0x7e00) = val
+
+static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
+{
+ u32 flags = 0;
+ flags |= seg->g << 23;
+ flags |= seg->db << 22;
+ flags |= seg->l << 21;
+ flags |= seg->avl << 20;
+ flags |= seg->present << 15;
+ flags |= seg->dpl << 13;
+ flags |= seg->s << 12;
+ flags |= seg->type << 8;
+ return flags;
+}
+
+static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
+{
+ struct kvm_segment seg;
+ int offset;
+
+ kvm_get_segment(vcpu, &seg, n);
+ put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
+
+ if (n < 3)
+ offset = 0x7f84 + n * 12;
+ else
+ offset = 0x7f2c + (n - 3) * 12;
+
+ put_smstate(u32, buf, offset + 8, seg.base);
+ put_smstate(u32, buf, offset + 4, seg.limit);
+ put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
+}
+
+static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
+{
+ struct kvm_segment seg;
+ int offset;
+ u16 flags;
+
+ kvm_get_segment(vcpu, &seg, n);
+ offset = 0x7e00 + n * 16;
+
+ flags = process_smi_get_segment_flags(&seg) >> 8;
+ put_smstate(u16, buf, offset, seg.selector);
+ put_smstate(u16, buf, offset + 2, flags);
+ put_smstate(u32, buf, offset + 4, seg.limit);
+ put_smstate(u64, buf, offset + 8, seg.base);
+}
+
+static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
+{
+ struct desc_ptr dt;
+ struct kvm_segment seg;
+ unsigned long val;
+ int i;
+
+ put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
+ put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
+ put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
+ put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
+
+ for (i = 0; i < 8; i++)
+ put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
+
+ kvm_get_dr(vcpu, 6, &val);
+ put_smstate(u32, buf, 0x7fcc, (u32)val);
+ kvm_get_dr(vcpu, 7, &val);
+ put_smstate(u32, buf, 0x7fc8, (u32)val);
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
+ put_smstate(u32, buf, 0x7fc4, seg.selector);
+ put_smstate(u32, buf, 0x7f64, seg.base);
+ put_smstate(u32, buf, 0x7f60, seg.limit);
+ put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg));
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
+ put_smstate(u32, buf, 0x7fc0, seg.selector);
+ put_smstate(u32, buf, 0x7f80, seg.base);
+ put_smstate(u32, buf, 0x7f7c, seg.limit);
+ put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg));
+
+ kvm_x86_ops->get_gdt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7f74, dt.address);
+ put_smstate(u32, buf, 0x7f70, dt.size);
+
+ kvm_x86_ops->get_idt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7f58, dt.address);
+ put_smstate(u32, buf, 0x7f54, dt.size);
+
+ for (i = 0; i < 6; i++)
+ process_smi_save_seg_32(vcpu, buf, i);
+
+ put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
+
+ /* revision id */
+ put_smstate(u32, buf, 0x7efc, 0x00020000);
+ put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
+}
+
+static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
+{
+#ifdef CONFIG_X86_64
+ struct desc_ptr dt;
+ struct kvm_segment seg;
+ unsigned long val;
+ int i;
+
+ for (i = 0; i < 16; i++)
+ put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i));
+
+ put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
+ put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
+
+ kvm_get_dr(vcpu, 6, &val);
+ put_smstate(u64, buf, 0x7f68, val);
+ kvm_get_dr(vcpu, 7, &val);
+ put_smstate(u64, buf, 0x7f60, val);
+
+ put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
+ put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
+ put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
+
+ put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
+
+ /* revision id */
+ put_smstate(u32, buf, 0x7efc, 0x00020064);
+
+ put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
+ put_smstate(u16, buf, 0x7e90, seg.selector);
+ put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8);
+ put_smstate(u32, buf, 0x7e94, seg.limit);
+ put_smstate(u64, buf, 0x7e98, seg.base);
+
+ kvm_x86_ops->get_idt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7e84, dt.size);
+ put_smstate(u64, buf, 0x7e88, dt.address);
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
+ put_smstate(u16, buf, 0x7e70, seg.selector);
+ put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8);
+ put_smstate(u32, buf, 0x7e74, seg.limit);
+ put_smstate(u64, buf, 0x7e78, seg.base);
+
+ kvm_x86_ops->get_gdt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7e64, dt.size);
+ put_smstate(u64, buf, 0x7e68, dt.address);
+
+ for (i = 0; i < 6; i++)
+ process_smi_save_seg_64(vcpu, buf, i);
+#else
+ WARN_ON_ONCE(1);
+#endif
+}
+
+static void process_smi(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs, ds;
+ char buf[512];
+ u32 cr0;
+
+ if (is_smm(vcpu)) {
+ vcpu->arch.smi_pending = true;
+ return;
+ }
+
+ trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ memset(buf, 0, 512);
+ if (guest_cpuid_has_longmode(vcpu))
+ process_smi_save_state_64(vcpu, buf);
+ else
+ process_smi_save_state_32(vcpu, buf);
+
+ kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
+
+ if (kvm_x86_ops->get_nmi_mask(vcpu))
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ kvm_x86_ops->set_nmi_mask(vcpu, true);
+
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0x8000);
+
+ cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
+ kvm_x86_ops->set_cr0(vcpu, cr0);
+ vcpu->arch.cr0 = cr0;
+
+ kvm_x86_ops->set_cr4(vcpu, 0);
+
+ __kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+
+ cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
+ cs.base = vcpu->arch.smbase;
+
+ ds.selector = 0;
+ ds.base = 0;
+
+ cs.limit = ds.limit = 0xffffffff;
+ cs.type = ds.type = 0x3;
+ cs.dpl = ds.dpl = 0;
+ cs.db = ds.db = 0;
+ cs.s = ds.s = 1;
+ cs.l = ds.l = 0;
+ cs.g = ds.g = 1;
+ cs.avl = ds.avl = 0;
+ cs.present = ds.present = 1;
+ cs.unusable = ds.unusable = 0;
+ cs.padding = ds.padding = 0;
+
+ kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+
+ if (guest_cpuid_has_longmode(vcpu))
+ kvm_x86_ops->set_efer(vcpu, 0);
+
+ kvm_update_cpuid(vcpu);
+ kvm_mmu_reset_context(vcpu);
+}
+
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
u64 eoi_exit_bitmap[4];
@@ -6269,12 +6501,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
record_steal_time(vcpu);
+ if (kvm_check_request(KVM_REQ_SMI, vcpu))
+ process_smi(vcpu);
if (kvm_check_request(KVM_REQ_NMI, vcpu))
process_nmi(vcpu);
if (kvm_check_request(KVM_REQ_PMU, vcpu))
- kvm_handle_pmu_event(vcpu);
+ kvm_pmu_handle_event(vcpu);
if (kvm_check_request(KVM_REQ_PMI, vcpu))
- kvm_deliver_pmi(vcpu);
+ kvm_pmu_deliver_pmi(vcpu);
if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
vcpu_scan_ioapic(vcpu);
if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
@@ -6346,7 +6580,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (req_immediate_exit)
smp_send_reschedule(vcpu->cpu);
- kvm_guest_enter();
+ __kvm_guest_enter();
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
@@ -7038,16 +7272,25 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
kvm_put_guest_xcr0(vcpu);
- if (!vcpu->guest_fpu_loaded)
+ if (!vcpu->guest_fpu_loaded) {
+ vcpu->fpu_counter = 0;
return;
+ }
vcpu->guest_fpu_loaded = 0;
copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
__kernel_fpu_end();
++vcpu->stat.fpu_reload;
- if (!vcpu->arch.eager_fpu)
- kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
-
+ /*
+ * If using eager FPU mode, or if the guest is a frequent user
+ * of the FPU, just leave the FPU active for next time.
+ * Every 255 times fpu_counter rolls over to 0; a guest that uses
+ * the FPU in bursts will revert to loading it on demand.
+ */
+ if (!vcpu->arch.eager_fpu) {
+ if (++vcpu->fpu_counter < 5)
+ kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+ }
trace_kvm_fpu(0);
}
@@ -7071,11 +7314,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
vcpu = kvm_x86_ops->vcpu_create(kvm, id);
- /*
- * Activate fpu unconditionally in case the guest needs eager FPU. It will be
- * deactivated soon if it doesn't.
- */
- kvm_x86_ops->fpu_activate(vcpu);
return vcpu;
}
@@ -7083,14 +7321,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
int r;
- vcpu->arch.mtrr_state.have_fixed = 1;
+ kvm_vcpu_mtrr_init(vcpu);
r = vcpu_load(vcpu);
if (r)
return r;
- kvm_vcpu_reset(vcpu);
+ kvm_vcpu_reset(vcpu, false);
kvm_mmu_setup(vcpu);
vcpu_put(vcpu);
-
return r;
}
@@ -7107,6 +7344,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
kvm_write_tsc(vcpu, &msr);
vcpu_put(vcpu);
+ if (!kvmclock_periodic_sync)
+ return;
+
schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
KVMCLOCK_SYNC_PERIOD);
}
@@ -7124,8 +7364,10 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvm_x86_ops->vcpu_free(vcpu);
}
-void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
+ vcpu->arch.hflags = 0;
+
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
vcpu->arch.nmi_injected = false;
@@ -7151,13 +7393,16 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
- kvm_pmu_reset(vcpu);
+ if (!init_event) {
+ kvm_pmu_reset(vcpu);
+ vcpu->arch.smbase = 0x30000;
+ }
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
- kvm_x86_ops->vcpu_reset(vcpu);
+ kvm_x86_ops->vcpu_reset(vcpu, init_event);
}
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -7356,6 +7601,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
kvm_async_pf_hash_reset(vcpu);
kvm_pmu_init(vcpu);
@@ -7462,6 +7709,40 @@ void kvm_arch_sync_events(struct kvm *kvm)
kvm_free_pit(kvm);
}
+int __x86_set_memory_region(struct kvm *kvm,
+ const struct kvm_userspace_memory_region *mem)
+{
+ int i, r;
+
+ /* Called with kvm->slots_lock held. */
+ BUG_ON(mem->slot >= KVM_MEM_SLOTS_NUM);
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ struct kvm_userspace_memory_region m = *mem;
+
+ m.slot |= i << 16;
+ r = __kvm_set_memory_region(kvm, &m);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__x86_set_memory_region);
+
+int x86_set_memory_region(struct kvm *kvm,
+ const struct kvm_userspace_memory_region *mem)
+{
+ int r;
+
+ mutex_lock(&kvm->slots_lock);
+ r = __x86_set_memory_region(kvm, mem);
+ mutex_unlock(&kvm->slots_lock);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(x86_set_memory_region);
+
void kvm_arch_destroy_vm(struct kvm *kvm)
{
if (current->mm == kvm->mm) {
@@ -7473,13 +7754,13 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
struct kvm_userspace_memory_region mem;
memset(&mem, 0, sizeof(mem));
mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
- kvm_set_memory_region(kvm, &mem);
+ x86_set_memory_region(kvm, &mem);
mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
- kvm_set_memory_region(kvm, &mem);
+ x86_set_memory_region(kvm, &mem);
mem.slot = TSS_PRIVATE_MEMSLOT;
- kvm_set_memory_region(kvm, &mem);
+ x86_set_memory_region(kvm, &mem);
}
kvm_iommu_unmap_guest(kvm);
kfree(kvm->arch.vpic);
@@ -7568,18 +7849,18 @@ out_free:
return -ENOMEM;
}
-void kvm_arch_memslots_updated(struct kvm *kvm)
+void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
{
/*
* memslots->generation has been incremented.
* mmio generation may have reached its maximum value.
*/
- kvm_mmu_invalidate_mmio_sptes(kvm);
+ kvm_mmu_invalidate_mmio_sptes(kvm, slots);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
- struct kvm_userspace_memory_region *mem,
+ const struct kvm_userspace_memory_region *mem,
enum kvm_mr_change change)
{
/*
@@ -7657,14 +7938,14 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
}
void kvm_arch_commit_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem,
+ const struct kvm_userspace_memory_region *mem,
const struct kvm_memory_slot *old,
+ const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- struct kvm_memory_slot *new;
int nr_mmu_pages = 0;
- if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
+ if (change == KVM_MR_DELETE && old->id >= KVM_USER_MEM_SLOTS) {
int ret;
ret = vm_munmap(old->userspace_addr,
@@ -7681,9 +7962,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
if (nr_mmu_pages)
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
- /* It's OK to get 'new' slot here as it has already been installed */
- new = id_to_memslot(kvm->memslots, mem->slot);
-
/*
* Dirty logging tracks sptes in 4k granularity, meaning that large
* sptes have to be split. If live migration is successful, the guest
@@ -7708,9 +7986,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
* been zapped so no dirty logging staff is needed for old slot. For
* KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
* new and it's also covered when dealing with the new slot.
+ *
+ * FIXME: const-ify all uses of struct kvm_memory_slot.
*/
if (change != KVM_MR_DELETE)
- kvm_mmu_slot_apply_flags(kvm, new);
+ kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
}
void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -7932,6 +8212,24 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
kvm_x86_ops->interrupt_allowed(vcpu);
}
+void kvm_arch_start_assignment(struct kvm *kvm)
+{
+ atomic_inc(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
+
+void kvm_arch_end_assignment(struct kvm *kvm)
+{
+ atomic_dec(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
+
+bool kvm_arch_has_assigned_device(struct kvm *kvm)
+{
+ return atomic_read(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
+
void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
{
atomic_inc(&kvm->arch.noncoherent_dma_count);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f5fef1868096..0ca2f3e4803c 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -4,6 +4,8 @@
#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
+#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
+
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
vcpu->arch.exception.pending = false;
@@ -145,6 +147,11 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
return kvm_register_write(vcpu, reg, val);
}
+static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
+{
+ return !(kvm->arch.disabled_quirks & quirk);
+}
+
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
@@ -160,7 +167,13 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception);
+void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
+u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int page_num);
#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
| XSTATE_BNDREGS | XSTATE_BNDCSR \
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index ddf9ecb53cc3..e342586db6e4 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -20,7 +20,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
unsigned long ret;
if (__range_not_ok(from, n, TASK_SIZE))
- return 0;
+ return n;
/*
* Even though this function is typically called from NMI/IRQ context
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c8140e12816a..8340e45c891a 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -433,7 +433,7 @@ void __init add_highpages_with_active_regions(int nid,
phys_addr_t start, end;
u64 i;
- for_each_free_mem_range(i, nid, &start, &end, NULL) {
+ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) {
unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
start_pfn, end_pfn);
unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 4860906c6b9f..e1840f3db5b5 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -1,3 +1,4 @@
+#define pr_fmt(fmt) "kasan: " fmt
#include <linux/bootmem.h>
#include <linux/kasan.h>
#include <linux/kdebug.h>
@@ -11,7 +12,19 @@
extern pgd_t early_level4_pgt[PTRS_PER_PGD];
extern struct range pfn_mapped[E820_X_MAX];
-extern unsigned char kasan_zero_page[PAGE_SIZE];
+static pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
+static pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss;
+static pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss;
+
+/*
+ * This page used as early shadow. We don't use empty_zero_page
+ * at early stages, stack instrumentation could write some garbage
+ * to this page.
+ * Latter we reuse it as zero shadow for large ranges of memory
+ * that allowed to access, but not instrumented by kasan
+ * (vmalloc/vmemmap ...).
+ */
+static unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
static int __init map_range(struct range *range)
{
@@ -36,7 +49,7 @@ static void __init clear_pgds(unsigned long start,
pgd_clear(pgd_offset_k(start));
}
-void __init kasan_map_early_shadow(pgd_t *pgd)
+static void __init kasan_map_early_shadow(pgd_t *pgd)
{
int i;
unsigned long start = KASAN_SHADOW_START;
@@ -73,7 +86,7 @@ static int __init zero_pmd_populate(pud_t *pud, unsigned long addr,
while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) {
WARN_ON(!pmd_none(*pmd));
set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte)
- | __PAGE_KERNEL_RO));
+ | _KERNPG_TABLE));
addr += PMD_SIZE;
pmd = pmd_offset(pud, addr);
}
@@ -99,7 +112,7 @@ static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) {
WARN_ON(!pud_none(*pud));
set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd)
- | __PAGE_KERNEL_RO));
+ | _KERNPG_TABLE));
addr += PUD_SIZE;
pud = pud_offset(pgd, addr);
}
@@ -124,7 +137,7 @@ static int __init zero_pgd_populate(unsigned long addr, unsigned long end)
while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) {
WARN_ON(!pgd_none(*pgd));
set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud)
- | __PAGE_KERNEL_RO));
+ | _KERNPG_TABLE));
addr += PGDIR_SIZE;
pgd = pgd_offset_k(addr);
}
@@ -166,6 +179,26 @@ static struct notifier_block kasan_die_notifier = {
};
#endif
+void __init kasan_early_init(void)
+{
+ int i;
+ pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
+ pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
+ pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ kasan_zero_pte[i] = __pte(pte_val);
+
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ kasan_zero_pmd[i] = __pmd(pmd_val);
+
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ kasan_zero_pud[i] = __pud(pud_val);
+
+ kasan_map_early_shadow(early_level4_pgt);
+ kasan_map_early_shadow(init_level4_pgt);
+}
+
void __init kasan_init(void)
{
int i;
@@ -176,6 +209,7 @@ void __init kasan_init(void)
memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt));
load_cr3(early_level4_pgt);
+ __flush_tlb_all();
clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
@@ -202,5 +236,8 @@ void __init kasan_init(void)
memset(kasan_zero_page, 0, PAGE_SIZE);
load_cr3(init_level4_pgt);
+ __flush_tlb_all();
init_task.kasan_depth = 0;
+
+ pr_info("Kernel address sanitizer initialized\n");
}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index ddeff4844a10..579a8fd74be0 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -12,6 +12,7 @@
#include <linux/filter.h>
#include <linux/if_vlan.h>
#include <asm/cacheflush.h>
+#include <linux/bpf.h>
int bpf_jit_enable __read_mostly;
@@ -37,7 +38,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
return ptr + len;
}
-#define EMIT(bytes, len) do { prog = emit_code(prog, bytes, len); } while (0)
+#define EMIT(bytes, len) \
+ do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)
#define EMIT1(b1) EMIT(b1, 1)
#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
@@ -186,31 +188,31 @@ struct jit_context {
#define BPF_MAX_INSN_SIZE 128
#define BPF_INSN_SAFETY 64
-static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
- int oldproglen, struct jit_context *ctx)
+#define STACKSIZE \
+ (MAX_BPF_STACK + \
+ 32 /* space for rbx, r13, r14, r15 */ + \
+ 8 /* space for skb_copy_bits() buffer */)
+
+#define PROLOGUE_SIZE 51
+
+/* emit x64 prologue code for BPF program and check it's size.
+ * bpf_tail_call helper will skip it while jumping into another program
+ */
+static void emit_prologue(u8 **pprog)
{
- struct bpf_insn *insn = bpf_prog->insnsi;
- int insn_cnt = bpf_prog->len;
- bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
- bool seen_exit = false;
- u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
- int i;
- int proglen = 0;
- u8 *prog = temp;
- int stacksize = MAX_BPF_STACK +
- 32 /* space for rbx, r13, r14, r15 */ +
- 8 /* space for skb_copy_bits() buffer */;
+ u8 *prog = *pprog;
+ int cnt = 0;
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */
- /* sub rsp, stacksize */
- EMIT3_off32(0x48, 0x81, 0xEC, stacksize);
+ /* sub rsp, STACKSIZE */
+ EMIT3_off32(0x48, 0x81, 0xEC, STACKSIZE);
/* all classic BPF filters use R6(rbx) save it */
/* mov qword ptr [rbp-X],rbx */
- EMIT3_off32(0x48, 0x89, 0x9D, -stacksize);
+ EMIT3_off32(0x48, 0x89, 0x9D, -STACKSIZE);
/* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
* as temporary, so all tcpdump filters need to spill/fill R7(r13) and
@@ -221,16 +223,112 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
*/
/* mov qword ptr [rbp-X],r13 */
- EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8);
+ EMIT3_off32(0x4C, 0x89, 0xAD, -STACKSIZE + 8);
/* mov qword ptr [rbp-X],r14 */
- EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16);
+ EMIT3_off32(0x4C, 0x89, 0xB5, -STACKSIZE + 16);
/* mov qword ptr [rbp-X],r15 */
- EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24);
+ EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24);
/* clear A and X registers */
EMIT2(0x31, 0xc0); /* xor eax, eax */
EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */
+ /* clear tail_cnt: mov qword ptr [rbp-X], rax */
+ EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32);
+
+ BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
+ *pprog = prog;
+}
+
+/* generate the following code:
+ * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
+ * if (index >= array->map.max_entries)
+ * goto out;
+ * if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * goto out;
+ * prog = array->prog[index];
+ * if (prog == NULL)
+ * goto out;
+ * goto *(prog->bpf_func + prologue_size);
+ * out:
+ */
+static void emit_bpf_tail_call(u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int label1, label2, label3;
+ int cnt = 0;
+
+ /* rdi - pointer to ctx
+ * rsi - pointer to bpf_array
+ * rdx - index in bpf_array
+ */
+
+ /* if (index >= array->map.max_entries)
+ * goto out;
+ */
+ EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */
+ offsetof(struct bpf_array, map.max_entries));
+ EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */
+#define OFFSET1 44 /* number of bytes to jump */
+ EMIT2(X86_JBE, OFFSET1); /* jbe out */
+ label1 = cnt;
+
+ /* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * goto out;
+ */
+ EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */
+ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
+#define OFFSET2 33
+ EMIT2(X86_JA, OFFSET2); /* ja out */
+ label2 = cnt;
+ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
+ EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */
+
+ /* prog = array->prog[index]; */
+ EMIT4(0x48, 0x8D, 0x44, 0xD6); /* lea rax, [rsi + rdx * 8 + 0x50] */
+ EMIT1(offsetof(struct bpf_array, prog));
+ EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */
+
+ /* if (prog == NULL)
+ * goto out;
+ */
+ EMIT4(0x48, 0x83, 0xF8, 0x00); /* cmp rax, 0 */
+#define OFFSET3 10
+ EMIT2(X86_JE, OFFSET3); /* je out */
+ label3 = cnt;
+
+ /* goto *(prog->bpf_func + prologue_size); */
+ EMIT4(0x48, 0x8B, 0x40, /* mov rax, qword ptr [rax + 32] */
+ offsetof(struct bpf_prog, bpf_func));
+ EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */
+
+ /* now we're ready to jump into next BPF program
+ * rdi == ctx (1st arg)
+ * rax == prog->bpf_func + prologue_size
+ */
+ EMIT2(0xFF, 0xE0); /* jmp rax */
+
+ /* out: */
+ BUILD_BUG_ON(cnt - label1 != OFFSET1);
+ BUILD_BUG_ON(cnt - label2 != OFFSET2);
+ BUILD_BUG_ON(cnt - label3 != OFFSET3);
+ *pprog = prog;
+}
+
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+ int oldproglen, struct jit_context *ctx)
+{
+ struct bpf_insn *insn = bpf_prog->insnsi;
+ int insn_cnt = bpf_prog->len;
+ bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
+ bool seen_exit = false;
+ u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+ int i, cnt = 0;
+ int proglen = 0;
+ u8 *prog = temp;
+
+ emit_prologue(&prog);
+
if (seen_ld_abs) {
/* r9d : skb->len - skb->data_len (headlen)
* r10 : skb->data
@@ -739,6 +837,10 @@ xadd: if (is_imm8(insn->off))
}
break;
+ case BPF_JMP | BPF_CALL | BPF_X:
+ emit_bpf_tail_call(&prog);
+ break;
+
/* cond jump */
case BPF_JMP | BPF_JEQ | BPF_X:
case BPF_JMP | BPF_JNE | BPF_X:
@@ -891,13 +993,13 @@ common_load:
/* update cleanup_addr */
ctx->cleanup_addr = proglen;
/* mov rbx, qword ptr [rbp-X] */
- EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize);
+ EMIT3_off32(0x48, 0x8B, 0x9D, -STACKSIZE);
/* mov r13, qword ptr [rbp-X] */
- EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8);
+ EMIT3_off32(0x4C, 0x8B, 0xAD, -STACKSIZE + 8);
/* mov r14, qword ptr [rbp-X] */
- EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16);
+ EMIT3_off32(0x4C, 0x8B, 0xB5, -STACKSIZE + 16);
/* mov r15, qword ptr [rbp-X] */
- EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24);
+ EMIT3_off32(0x4C, 0x8B, 0xBD, -STACKSIZE + 24);
EMIT1(0xC9); /* leave */
EMIT1(0xC3); /* ret */
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 3b984c3aa1b0..cfba30f27392 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -117,6 +117,27 @@ void efi_get_time(struct timespec *now)
now->tv_nsec = 0;
}
+void __init efi_find_mirror(void)
+{
+ void *p;
+ u64 mirror_size = 0, total_size = 0;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ efi_memory_desc_t *md = p;
+ unsigned long long start = md->phys_addr;
+ unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+
+ total_size += size;
+ if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
+ memblock_mark_mirror(start, size);
+ mirror_size += size;
+ }
+ }
+ if (mirror_size)
+ pr_info("Memory: %lldM/%lldM mirrored memory\n",
+ mirror_size>>20, total_size>>20);
+}
+
/*
* Tell the kernel about the EFI memory map. This might include
* more than the max 128 entries that can fit in the e820 legacy
@@ -153,6 +174,9 @@ static void __init do_add_efi_memmap(void)
case EFI_UNUSABLE_MEMORY:
e820_type = E820_UNUSABLE;
break;
+ case EFI_PERSISTENT_MEMORY:
+ e820_type = E820_PMEM;
+ break;
default:
/*
* EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE
diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
index 32947ba0f62d..ee40fcb6e54d 100644
--- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c
+++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
@@ -173,5 +173,4 @@ static int __init intel_mid_device_create(void)
return platform_device_register(&vrtc_device);
}
-
-module_init(intel_mid_device_create);
+device_initcall(intel_mid_device_create);
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 7488cafab955..020c101c255f 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -104,7 +104,7 @@ static int param_set_local64(const char *val, const struct kernel_param *kp)
return 0;
}
-static struct kernel_param_ops param_ops_local64 = {
+static const struct kernel_param_ops param_ops_local64 = {
.get = param_get_local64,
.set = param_set_local64,
};
diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h
index 4b181b74454f..ee940185e89f 100644
--- a/arch/x86/um/asm/checksum.h
+++ b/arch/x86/um/asm/checksum.h
@@ -3,6 +3,7 @@
#include <linux/string.h>
#include <linux/in6.h>
+#include <linux/uaccess.h>
/*
* computes the checksum of a memory block at buff, length len,
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index 0a656b727b1a..548197212a45 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -200,8 +200,6 @@ typedef elf_greg_t elf_gregset_t[ELF_NGREG];
typedef struct user_i387_struct elf_fpregset_t;
-#define task_pt_regs(t) (&(t)->thread.regs)
-
struct task_struct;
extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu);
diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h
index 2a206d2b14ab..233ee09c1ce8 100644
--- a/arch/x86/um/asm/processor.h
+++ b/arch/x86/um/asm/processor.h
@@ -28,6 +28,8 @@ static inline void rep_nop(void)
#define cpu_relax() rep_nop()
#define cpu_relax_lowlatency() cpu_relax()
+#define task_pt_regs(t) (&(t)->thread.regs)
+
#include <asm/processor-generic.h>
#endif
diff --git a/arch/x86/um/asm/segment.h b/arch/x86/um/asm/segment.h
index 45183fcd10b6..41dd5e1f3cd7 100644
--- a/arch/x86/um/asm/segment.h
+++ b/arch/x86/um/asm/segment.h
@@ -7,4 +7,12 @@ extern int host_gdt_entry_tls_min;
#define GDT_ENTRY_TLS_MIN host_gdt_entry_tls_min
#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
+typedef struct {
+ unsigned long seg;
+} mm_segment_t;
+
+#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
+#define KERNEL_DS MAKE_MM_SEG(~0UL)
+#define USER_DS MAKE_MM_SEG(TASK_SIZE)
+
#endif
diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c
index 5c0b711d2433..9701a4fd7bf2 100644
--- a/arch/x86/um/ldt.c
+++ b/arch/x86/um/ldt.c
@@ -6,6 +6,7 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <os.h>
#include <skas.h>
diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c
index f40281e5d6a2..744afdc18cf3 100644
--- a/arch/x86/um/mem_32.c
+++ b/arch/x86/um/mem_32.c
@@ -7,8 +7,7 @@
*/
#include <linux/mm.h>
-#include <asm/page.h>
-#include <asm/mman.h>
+#include <asm/elf.h>
static struct vm_area_struct gate_vma;
diff --git a/arch/x86/um/mem_64.c b/arch/x86/um/mem_64.c
index f8fecaddcc0d..7642e2e2aa61 100644
--- a/arch/x86/um/mem_64.c
+++ b/arch/x86/um/mem_64.c
@@ -1,6 +1,5 @@
#include <linux/mm.h>
-#include <asm/page.h>
-#include <asm/mman.h>
+#include <asm/elf.h>
const char *arch_vma_name(struct vm_area_struct *vma)
{
diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c
index ce3dd4f36f3f..a29756f2d940 100644
--- a/arch/x86/um/ptrace_32.c
+++ b/arch/x86/um/ptrace_32.c
@@ -6,6 +6,7 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <asm/uaccess.h>
+#include <asm/ptrace-abi.h>
#include <skas.h>
extern int arch_switch_tls(struct task_struct *to);
diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c
index 3b52bf0b418a..a629694ee750 100644
--- a/arch/x86/um/ptrace_64.c
+++ b/arch/x86/um/ptrace_64.c
@@ -11,6 +11,7 @@
#define __FRAME_OFFSETS
#include <asm/ptrace.h>
#include <asm/uaccess.h>
+#include <asm/ptrace-abi.h>
/*
* determines which flags the user has access to.
diff --git a/arch/x86/um/shared/sysdep/tls.h b/arch/x86/um/shared/sysdep/tls.h
index 27cce00c6b30..a682db13df23 100644
--- a/arch/x86/um/shared/sysdep/tls.h
+++ b/arch/x86/um/shared/sysdep/tls.h
@@ -1,7 +1,7 @@
#ifndef _SYSDEP_TLS_H
#define _SYSDEP_TLS_H
-# ifndef __KERNEL__
+#ifdef __UM_HOST__
/* Change name to avoid conflicts with the original one from <asm/ldt.h>, which
* may be named user_desc (but in 2.4 and in header matching its API was named
@@ -22,11 +22,11 @@ typedef struct um_dup_user_desc {
#endif
} user_desc_t;
-# else /* __KERNEL__ */
+#else /* __UM_HOST__ */
typedef struct user_desc user_desc_t;
-# endif /* __KERNEL__ */
+#endif /* __UM_HOST__ */
extern int os_set_thread_area(user_desc_t *info, int pid);
extern int os_get_thread_area(user_desc_t *info, int pid);
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 592491d1d70d..06934a8a4872 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -541,7 +541,8 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
*/
/* x86-64 should always use SA_RESTORER. */
if (ksig->ka.sa.sa_flags & SA_RESTORER)
- err |= __put_user(ksig->ka.sa.sa_restorer, &frame->pretcode);
+ err |= __put_user((void *)ksig->ka.sa.sa_restorer,
+ &frame->pretcode);
else
/* could use a vstub here */
return err;
diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
index adb08eb5c22a..e6552275320b 100644
--- a/arch/x86/um/syscalls_64.c
+++ b/arch/x86/um/syscalls_64.c
@@ -6,6 +6,7 @@
*/
#include <linux/sched.h>
+#include <linux/uaccess.h>
#include <asm/prctl.h> /* XXX This should get the constants from libc */
#include <os.h>
diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c
index 80ffa5b9982d..48e38584d5c1 100644
--- a/arch/x86/um/tls_32.c
+++ b/arch/x86/um/tls_32.c
@@ -7,6 +7,7 @@
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <asm/uaccess.h>
+#include <asm/ptrace-abi.h>
#include <os.h>
#include <skas.h>
#include <sysdep/tls.h>
diff --git a/arch/x86/um/tls_64.c b/arch/x86/um/tls_64.c
index d22363cb854e..3ad714373d7f 100644
--- a/arch/x86/um/tls_64.c
+++ b/arch/x86/um/tls_64.c
@@ -1,4 +1,5 @@
#include <linux/sched.h>
+#include <asm/ptrace-abi.h>
void clear_flushed_tls(struct task_struct *task)
{
diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c
index 916cda4cd5b4..237c6831e095 100644
--- a/arch/x86/um/vdso/vma.c
+++ b/arch/x86/um/vdso/vma.c
@@ -10,6 +10,7 @@
#include <linux/sched.h>
#include <linux/mm.h>
#include <asm/page.h>
+#include <asm/elf.h>
#include <linux/init.h>
static unsigned int __read_mostly vdso_enabled = 1;
OpenPOWER on IntegriCloud