Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 5
-rw-r--r--  arch/x86/Makefile | 33
-rw-r--r--  arch/x86/Makefile_32.cpu | 7
-rw-r--r--  arch/x86/crypto/aes-x86_64-asm_64.S | 47
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S | 231
-rw-r--r--  arch/x86/crypto/aesni-intel_avx-x86_64.S | 283
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c | 208
-rw-r--r--  arch/x86/crypto/glue_helper.c | 3
-rw-r--r--  arch/x86/crypto/sha512-mb/sha512_mb.c | 7
-rw-r--r--  arch/x86/entry/vdso/vma.c | 3
-rw-r--r--  arch/x86/include/asm/compat.h | 1
-rw-r--r--  arch/x86/include/asm/dma-mapping.h | 5
-rw-r--r--  arch/x86/include/asm/hugetlb.h | 4
-rw-r--r--  arch/x86/include/asm/iommu.h | 2
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 50
-rw-r--r--  arch/x86/include/asm/mshyperv.h | 1
-rw-r--r--  arch/x86/include/asm/msr-index.h | 20
-rw-r--r--  arch/x86/include/asm/pmem.h | 136
-rw-r--r--  arch/x86/include/asm/string_64.h | 5
-rw-r--r--  arch/x86/include/asm/suspend_64.h | 5
-rw-r--r--  arch/x86/include/asm/uaccess.h | 1
-rw-r--r--  arch/x86/include/asm/uaccess_64.h | 11
-rw-r--r--  arch/x86/include/asm/xen/hypercall.h | 13
-rw-r--r--  arch/x86/include/uapi/asm/Kbuild | 6
-rw-r--r--  arch/x86/include/uapi/asm/hyperv.h | 6
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 3
-rw-r--r--  arch/x86/kernel/amd_gart_64.c | 1
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 1
-rw-r--r--  arch/x86/kernel/cpu/aperfmperf.c | 79
-rw-r--r--  arch/x86/kernel/cpu/proc.c | 10
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 25
-rw-r--r--  arch/x86/kernel/pci-dma.c | 8
-rw-r--r--  arch/x86/kernel/pci-nommu.c | 11
-rw-r--r--  arch/x86/kvm/cpuid.h | 8
-rw-r--r--  arch/x86/kvm/emulate.c | 84
-rw-r--r--  arch/x86/kvm/lapic.c | 116
-rw-r--r--  arch/x86/kvm/lapic.h | 2
-rw-r--r--  arch/x86/kvm/mmu.c | 155
-rw-r--r--  arch/x86/kvm/mmu.h | 2
-rw-r--r--  arch/x86/kvm/mmutrace.h | 6
-rw-r--r--  arch/x86/kvm/svm.c | 95
-rw-r--r--  arch/x86/kvm/vmx.c | 83
-rw-r--r--  arch/x86/kvm/x86.c | 14
-rw-r--r--  arch/x86/lib/usercopy_64.c | 134
-rw-r--r--  arch/x86/mm/hugetlbpage.c | 2
-rw-r--r--  arch/x86/mm/init_32.c | 7
-rw-r--r--  arch/x86/mm/init_64.c | 11
-rw-r--r--  arch/x86/mm/pageattr.c | 6
-rw-r--r--  arch/x86/net/bpf_jit.S | 20
-rw-r--r--  arch/x86/net/bpf_jit_comp.c | 66
-rw-r--r--  arch/x86/pci/common.c | 27
-rw-r--r--  arch/x86/pci/fixup.c | 47
-rw-r--r--  arch/x86/pci/pcbios.c | 2
-rw-r--r--  arch/x86/pci/sta2x11-fixup.c | 3
-rw-r--r--  arch/x86/power/hibernate_64.c | 6
-rw-r--r--  arch/x86/um/vdso/Makefile | 2
-rw-r--r--  arch/x86/xen/enlighten.c | 154
-rw-r--r--  arch/x86/xen/enlighten_hvm.c | 64
-rw-r--r--  arch/x86/xen/enlighten_pv.c | 89
-rw-r--r--  arch/x86/xen/pci-swiotlb-xen.c | 14
-rw-r--r--  arch/x86/xen/setup.c | 7
-rw-r--r--  arch/x86/xen/smp.c | 31
-rw-r--r--  arch/x86/xen/smp.h | 2
-rw-r--r--  arch/x86/xen/smp_hvm.c | 14
-rw-r--r--  arch/x86/xen/smp_pv.c | 6
-rw-r--r--  arch/x86/xen/suspend_hvm.c | 11
-rw-r--r--  arch/x86/xen/xen-ops.h | 3
67 files changed, 1643 insertions, 881 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e767ed24aeb4..94a18681353d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,7 +22,7 @@ config X86_64
def_bool y
depends on 64BIT
# Options that are inherently 64-bit kernel only:
- select ARCH_HAS_GIGANTIC_PAGE
+ select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
select ARCH_SUPPORTS_INT128
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_SOFT_DIRTY
@@ -54,11 +54,13 @@ config X86
select ARCH_HAS_KCOV if X86_64
select ARCH_HAS_MMIO_FLUSH
select ARCH_HAS_PMEM_API if X86_64
+ select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_HAS_STRICT_MODULE_RWX
select ARCH_HAS_UBSAN_SANITIZE_ALL
+ select ARCH_HAS_ZONE_DEVICE if X86_64
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
@@ -72,6 +74,7 @@ config X86
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
+ select ARCH_WANTS_THP_SWAP if X86_64
select BUILDTIME_EXTABLE_SORT
select CLKEVT_I8253
select CLOCKSOURCE_VALIDATE_LAST_CYCLE
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index ad2db82e9953..1e902f926be3 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -11,6 +11,14 @@ else
KBUILD_DEFCONFIG := $(ARCH)_defconfig
endif
+# For gcc stack alignment is specified with -mpreferred-stack-boundary,
+# clang has the option -mstack-alignment for that purpose.
+ifneq ($(call cc-option, -mpreferred-stack-boundary=4),)
+ cc_stack_align_opt := -mpreferred-stack-boundary
+else ifneq ($(call cc-option, -mstack-alignment=4),)
+ cc_stack_align_opt := -mstack-alignment
+endif
+
# How to compile the 16-bit code. Note we always compile for -march=i386;
# that way we can complain to the user if the CPU is insufficient.
#
@@ -24,10 +32,11 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \
-DDISABLE_BRANCH_PROFILING \
-Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
-fno-strict-aliasing -fomit-frame-pointer -fno-pic \
- -mno-mmx -mno-sse \
- $(call cc-option, -ffreestanding) \
- $(call cc-option, -fno-stack-protector) \
- $(call cc-option, -mpreferred-stack-boundary=2)
+ -mno-mmx -mno-sse
+
+REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding)
+REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector)
+REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align_opt)=2)
export REALMODE_CFLAGS
# BITS is used as extension for files which are available in a 32 bit
@@ -64,8 +73,10 @@ ifeq ($(CONFIG_X86_32),y)
# with nonstandard options
KBUILD_CFLAGS += -fno-pic
- # prevent gcc from keeping the stack 16 byte aligned
- KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
+ # Align the stack to the register width instead of using the default
+ # alignment of 16 bytes. This reduces stack usage and the number of
+ # alignment instructions.
+ KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=2)
# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
# a lot more stack due to the lack of sharing of stacklots:
@@ -97,8 +108,14 @@ else
KBUILD_CFLAGS += $(call cc-option,-mno-80387)
KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387)
- # Use -mpreferred-stack-boundary=3 if supported.
- KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3)
+ # By default gcc and clang use a stack alignment of 16 bytes for x86.
+ # However the standard kernel entry on x86-64 leaves the stack on an
+ # 8-byte boundary. If the compiler isn't informed about the actual
+ # alignment it will generate extra alignment instructions for the
+ # default alignment which keep the stack *mis*aligned.
+ # Furthermore an alignment to the register width reduces stack usage
+ # and the number of alignment instructions.
+ KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=3)
# Use -mskip-rax-setup if supported.
KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index a45eb15b7cf2..f3717d36718a 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -9,7 +9,6 @@ else
tune = $(call cc-option,-mcpu=$(1),$(2))
endif
-align := $(cc-option-align)
cflags-$(CONFIG_M486) += -march=i486
cflags-$(CONFIG_M586) += -march=i586
cflags-$(CONFIG_M586TSC) += -march=i586
@@ -24,11 +23,11 @@ cflags-$(CONFIG_MK6) += -march=k6
# They make zero difference whatsosever to performance at this time.
cflags-$(CONFIG_MK7) += -march=athlon
cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon)
-cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
-cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
+cflags-$(CONFIG_MCRUSOE) += -march=i686 -falign-functions=0 -falign-jumps=0 -falign-loops=0
+cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) -falign-functions=0 -falign-jumps=0 -falign-loops=0
cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586)
cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586)
-cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
+cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) -falign-functions=0 -falign-jumps=0 -falign-loops=0
cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
cflags-$(CONFIG_MVIAC7) += -march=i686
cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2)
diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S
index 910565547163..8739cf7795de 100644
--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -42,17 +42,15 @@
#define R5E %esi
#define R6 %rdi
#define R6E %edi
-#define R7 %rbp
-#define R7E %ebp
+#define R7 %r9 /* don't use %rbp; it breaks stack traces */
+#define R7E %r9d
#define R8 %r8
-#define R9 %r9
#define R10 %r10
#define R11 %r11
-#define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
+#define prologue(FUNC,KEY,B128,B192,r1,r2,r5,r6,r7,r8,r9,r10,r11) \
ENTRY(FUNC); \
movq r1,r2; \
- movq r3,r4; \
leaq KEY+48(r8),r9; \
movq r10,r11; \
movl (r7),r5 ## E; \
@@ -70,9 +68,8 @@
je B192; \
leaq 32(r9),r9;
-#define epilogue(FUNC,r1,r2,r3,r4,r5,r6,r7,r8,r9) \
+#define epilogue(FUNC,r1,r2,r5,r6,r7,r8,r9) \
movq r1,r2; \
- movq r3,r4; \
movl r5 ## E,(r9); \
movl r6 ## E,4(r9); \
movl r7 ## E,8(r9); \
@@ -88,12 +85,12 @@
movl TAB(,r6,4),r6 ## E; \
roll $16,r2 ## E; \
shrl $16,r4 ## E; \
- movzbl r4 ## H,r7 ## E; \
- movzbl r4 ## L,r4 ## E; \
+ movzbl r4 ## L,r7 ## E; \
+ movzbl r4 ## H,r4 ## E; \
xorl OFFSET(r8),ra ## E; \
xorl OFFSET+4(r8),rb ## E; \
- xorl TAB+3072(,r7,4),r5 ## E;\
- xorl TAB+2048(,r4,4),r6 ## E;\
+ xorl TAB+3072(,r4,4),r5 ## E;\
+ xorl TAB+2048(,r7,4),r6 ## E;\
movzbl r1 ## L,r7 ## E; \
movzbl r1 ## H,r4 ## E; \
movl TAB+1024(,r4,4),r4 ## E;\
@@ -101,19 +98,19 @@
roll $16,r1 ## E; \
shrl $16,r3 ## E; \
xorl TAB(,r7,4),r5 ## E; \
- movzbl r3 ## H,r7 ## E; \
- movzbl r3 ## L,r3 ## E; \
- xorl TAB+3072(,r7,4),r4 ## E;\
- xorl TAB+2048(,r3,4),r5 ## E;\
- movzbl r1 ## H,r7 ## E; \
- movzbl r1 ## L,r3 ## E; \
+ movzbl r3 ## L,r7 ## E; \
+ movzbl r3 ## H,r3 ## E; \
+ xorl TAB+3072(,r3,4),r4 ## E;\
+ xorl TAB+2048(,r7,4),r5 ## E;\
+ movzbl r1 ## L,r7 ## E; \
+ movzbl r1 ## H,r3 ## E; \
shrl $16,r1 ## E; \
- xorl TAB+3072(,r7,4),r6 ## E;\
- movl TAB+2048(,r3,4),r3 ## E;\
- movzbl r1 ## H,r7 ## E; \
- movzbl r1 ## L,r1 ## E; \
- xorl TAB+1024(,r7,4),r6 ## E;\
- xorl TAB(,r1,4),r3 ## E; \
+ xorl TAB+3072(,r3,4),r6 ## E;\
+ movl TAB+2048(,r7,4),r3 ## E;\
+ movzbl r1 ## L,r7 ## E; \
+ movzbl r1 ## H,r1 ## E; \
+ xorl TAB+1024(,r1,4),r6 ## E;\
+ xorl TAB(,r7,4),r3 ## E; \
movzbl r2 ## H,r1 ## E; \
movzbl r2 ## L,r7 ## E; \
shrl $16,r2 ## E; \
@@ -131,9 +128,9 @@
movl r4 ## E,r2 ## E;
#define entry(FUNC,KEY,B128,B192) \
- prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)
+ prologue(FUNC,KEY,B128,B192,R2,R8,R1,R3,R4,R6,R10,R5,R11)
-#define return(FUNC) epilogue(FUNC,R8,R2,R9,R7,R5,R6,R3,R4,R11)
+#define return(FUNC) epilogue(FUNC,R8,R2,R5,R6,R3,R4,R11)
#define encrypt_round(TAB,OFFSET) \
round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 3c465184ff8a..16627fec80b2 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -89,6 +89,29 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F: .octa 0xffffffffffffffffffffffffffffffff
.octa 0x00000000000000000000000000000000
+.section .rodata
+.align 16
+.type aad_shift_arr, @object
+.size aad_shift_arr, 272
+aad_shift_arr:
+ .octa 0xffffffffffffffffffffffffffffffff
+ .octa 0xffffffffffffffffffffffffffffff0C
+ .octa 0xffffffffffffffffffffffffffff0D0C
+ .octa 0xffffffffffffffffffffffffff0E0D0C
+ .octa 0xffffffffffffffffffffffff0F0E0D0C
+ .octa 0xffffffffffffffffffffff0C0B0A0908
+ .octa 0xffffffffffffffffffff0D0C0B0A0908
+ .octa 0xffffffffffffffffff0E0D0C0B0A0908
+ .octa 0xffffffffffffffff0F0E0D0C0B0A0908
+ .octa 0xffffffffffffff0C0B0A090807060504
+ .octa 0xffffffffffff0D0C0B0A090807060504
+ .octa 0xffffffffff0E0D0C0B0A090807060504
+ .octa 0xffffffff0F0E0D0C0B0A090807060504
+ .octa 0xffffff0C0B0A09080706050403020100
+ .octa 0xffff0D0C0B0A09080706050403020100
+ .octa 0xff0E0D0C0B0A09080706050403020100
+ .octa 0x0F0E0D0C0B0A09080706050403020100
+
.text
@@ -252,32 +275,66 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
mov arg8, %r12 # %r12 = aadLen
mov %r12, %r11
pxor %xmm\i, %xmm\i
+ pxor \XMM2, \XMM2
-_get_AAD_loop\num_initial_blocks\operation:
- movd (%r10), \TMP1
- pslldq $12, \TMP1
- psrldq $4, %xmm\i
+ cmp $16, %r11
+ jl _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_blocks\num_initial_blocks\operation:
+ movdqu (%r10), %xmm\i
+ PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+ pxor %xmm\i, \XMM2
+ GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ add $16, %r10
+ sub $16, %r12
+ sub $16, %r11
+ cmp $16, %r11
+ jge _get_AAD_blocks\num_initial_blocks\operation
+
+ movdqu \XMM2, %xmm\i
+ cmp $0, %r11
+ je _get_AAD_done\num_initial_blocks\operation
+
+ pxor %xmm\i,%xmm\i
+
+ /* read the last <16B of AAD. since we have at least 4B of
+ data right after the AAD (the ICV, and maybe some CT), we can
+ read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\num_initial_blocks\operation:
+ cmp $4, %r11
+ jle _get_AAD_rest4\num_initial_blocks\operation
+ movq (%r10), \TMP1
+ add $8, %r10
+ sub $8, %r11
+ pslldq $8, \TMP1
+ psrldq $8, %xmm\i
pxor \TMP1, %xmm\i
+ jmp _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_rest4\num_initial_blocks\operation:
+ cmp $0, %r11
+ jle _get_AAD_rest0\num_initial_blocks\operation
+ mov (%r10), %eax
+ movq %rax, \TMP1
add $4, %r10
- sub $4, %r12
- jne _get_AAD_loop\num_initial_blocks\operation
-
- cmp $16, %r11
- je _get_AAD_loop2_done\num_initial_blocks\operation
-
- mov $16, %r12
-_get_AAD_loop2\num_initial_blocks\operation:
+ sub $4, %r10
+ pslldq $12, \TMP1
psrldq $4, %xmm\i
- sub $4, %r12
- cmp %r11, %r12
- jne _get_AAD_loop2\num_initial_blocks\operation
-
-_get_AAD_loop2_done\num_initial_blocks\operation:
+ pxor \TMP1, %xmm\i
+_get_AAD_rest0\num_initial_blocks\operation:
+ /* finalize: shift out the extra bytes we read, and align
+ left. since pslldq can only shift by an immediate, we use
+ vpshufb and an array of shuffle masks */
+ movq %r12, %r11
+ salq $4, %r11
+ movdqu aad_shift_arr(%r11), \TMP1
+ PSHUFB_XMM \TMP1, %xmm\i
+_get_AAD_rest_final\num_initial_blocks\operation:
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+ pxor \XMM2, %xmm\i
+ GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+_get_AAD_done\num_initial_blocks\operation:
xor %r11, %r11 # initialise the data pointer offset as zero
-
- # start AES for num_initial_blocks blocks
+ # start AES for num_initial_blocks blocks
mov %arg5, %rax # %rax = *Y0
movdqu (%rax), \XMM0 # XMM0 = Y0
@@ -322,7 +379,7 @@ aes_loop_initial_dec\num_initial_blocks:
# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
- GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+
# apply GHASH on num_initial_blocks blocks
.if \i == 5
@@ -477,28 +534,66 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
mov arg8, %r12 # %r12 = aadLen
mov %r12, %r11
pxor %xmm\i, %xmm\i
-_get_AAD_loop\num_initial_blocks\operation:
- movd (%r10), \TMP1
- pslldq $12, \TMP1
- psrldq $4, %xmm\i
+ pxor \XMM2, \XMM2
+
+ cmp $16, %r11
+ jl _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_blocks\num_initial_blocks\operation:
+ movdqu (%r10), %xmm\i
+ PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+ pxor %xmm\i, \XMM2
+ GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ add $16, %r10
+ sub $16, %r12
+ sub $16, %r11
+ cmp $16, %r11
+ jge _get_AAD_blocks\num_initial_blocks\operation
+
+ movdqu \XMM2, %xmm\i
+ cmp $0, %r11
+ je _get_AAD_done\num_initial_blocks\operation
+
+ pxor %xmm\i,%xmm\i
+
+ /* read the last <16B of AAD. since we have at least 4B of
+ data right after the AAD (the ICV, and maybe some PT), we can
+ read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\num_initial_blocks\operation:
+ cmp $4, %r11
+ jle _get_AAD_rest4\num_initial_blocks\operation
+ movq (%r10), \TMP1
+ add $8, %r10
+ sub $8, %r11
+ pslldq $8, \TMP1
+ psrldq $8, %xmm\i
pxor \TMP1, %xmm\i
+ jmp _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_rest4\num_initial_blocks\operation:
+ cmp $0, %r11
+ jle _get_AAD_rest0\num_initial_blocks\operation
+ mov (%r10), %eax
+ movq %rax, \TMP1
add $4, %r10
- sub $4, %r12
- jne _get_AAD_loop\num_initial_blocks\operation
- cmp $16, %r11
- je _get_AAD_loop2_done\num_initial_blocks\operation
- mov $16, %r12
-_get_AAD_loop2\num_initial_blocks\operation:
+ sub $4, %r10
+ pslldq $12, \TMP1
psrldq $4, %xmm\i
- sub $4, %r12
- cmp %r11, %r12
- jne _get_AAD_loop2\num_initial_blocks\operation
-_get_AAD_loop2_done\num_initial_blocks\operation:
+ pxor \TMP1, %xmm\i
+_get_AAD_rest0\num_initial_blocks\operation:
+ /* finalize: shift out the extra bytes we read, and align
+ left. since pslldq can only shift by an immediate, we use
+ vpshufb and an array of shuffle masks */
+ movq %r12, %r11
+ salq $4, %r11
+ movdqu aad_shift_arr(%r11), \TMP1
+ PSHUFB_XMM \TMP1, %xmm\i
+_get_AAD_rest_final\num_initial_blocks\operation:
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+ pxor \XMM2, %xmm\i
+ GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+_get_AAD_done\num_initial_blocks\operation:
xor %r11, %r11 # initialise the data pointer offset as zero
-
- # start AES for num_initial_blocks blocks
+ # start AES for num_initial_blocks blocks
mov %arg5, %rax # %rax = *Y0
movdqu (%rax), \XMM0 # XMM0 = Y0
@@ -543,7 +638,7 @@ aes_loop_initial_enc\num_initial_blocks:
# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
- GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+
# apply GHASH on num_initial_blocks blocks
.if \i == 5
@@ -1454,18 +1549,35 @@ _return_T_decrypt:
mov arg10, %r11 # %r11 = auth_tag_len
cmp $16, %r11
je _T_16_decrypt
- cmp $12, %r11
- je _T_12_decrypt
+ cmp $8, %r11
+ jl _T_4_decrypt
_T_8_decrypt:
MOVQ_R64_XMM %xmm0, %rax
mov %rax, (%r10)
- jmp _return_T_done_decrypt
-_T_12_decrypt:
- MOVQ_R64_XMM %xmm0, %rax
- mov %rax, (%r10)
+ add $8, %r10
+ sub $8, %r11
psrldq $8, %xmm0
+ cmp $0, %r11
+ je _return_T_done_decrypt
+_T_4_decrypt:
+ movd %xmm0, %eax
+ mov %eax, (%r10)
+ add $4, %r10
+ sub $4, %r11
+ psrldq $4, %xmm0
+ cmp $0, %r11
+ je _return_T_done_decrypt
+_T_123_decrypt:
movd %xmm0, %eax
- mov %eax, 8(%r10)
+ cmp $2, %r11
+ jl _T_1_decrypt
+ mov %ax, (%r10)
+ cmp $2, %r11
+ je _return_T_done_decrypt
+ add $2, %r10
+ sar $16, %eax
+_T_1_decrypt:
+ mov %al, (%r10)
jmp _return_T_done_decrypt
_T_16_decrypt:
movdqu %xmm0, (%r10)
@@ -1718,18 +1830,35 @@ _return_T_encrypt:
mov arg10, %r11 # %r11 = auth_tag_len
cmp $16, %r11
je _T_16_encrypt
- cmp $12, %r11
- je _T_12_encrypt
+ cmp $8, %r11
+ jl _T_4_encrypt
_T_8_encrypt:
MOVQ_R64_XMM %xmm0, %rax
mov %rax, (%r10)
- jmp _return_T_done_encrypt
-_T_12_encrypt:
- MOVQ_R64_XMM %xmm0, %rax
- mov %rax, (%r10)
+ add $8, %r10
+ sub $8, %r11
psrldq $8, %xmm0
+ cmp $0, %r11
+ je _return_T_done_encrypt
+_T_4_encrypt:
+ movd %xmm0, %eax
+ mov %eax, (%r10)
+ add $4, %r10
+ sub $4, %r11
+ psrldq $4, %xmm0
+ cmp $0, %r11
+ je _return_T_done_encrypt
+_T_123_encrypt:
movd %xmm0, %eax
- mov %eax, 8(%r10)
+ cmp $2, %r11
+ jl _T_1_encrypt
+ mov %ax, (%r10)
+ cmp $2, %r11
+ je _return_T_done_encrypt
+ add $2, %r10
+ sar $16, %eax
+_T_1_encrypt:
+ mov %al, (%r10)
jmp _return_T_done_encrypt
_T_16_encrypt:
movdqu %xmm0, (%r10)
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index d664382c6e56..faecb1518bf8 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -155,6 +155,30 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F: .octa 0xffffffffffffffffffffffffffffffff
.octa 0x00000000000000000000000000000000
+.section .rodata
+.align 16
+.type aad_shift_arr, @object
+.size aad_shift_arr, 272
+aad_shift_arr:
+ .octa 0xffffffffffffffffffffffffffffffff
+ .octa 0xffffffffffffffffffffffffffffff0C
+ .octa 0xffffffffffffffffffffffffffff0D0C
+ .octa 0xffffffffffffffffffffffffff0E0D0C
+ .octa 0xffffffffffffffffffffffff0F0E0D0C
+ .octa 0xffffffffffffffffffffff0C0B0A0908
+ .octa 0xffffffffffffffffffff0D0C0B0A0908
+ .octa 0xffffffffffffffffff0E0D0C0B0A0908
+ .octa 0xffffffffffffffff0F0E0D0C0B0A0908
+ .octa 0xffffffffffffff0C0B0A090807060504
+ .octa 0xffffffffffff0D0C0B0A090807060504
+ .octa 0xffffffffff0E0D0C0B0A090807060504
+ .octa 0xffffffff0F0E0D0C0B0A090807060504
+ .octa 0xffffff0C0B0A09080706050403020100
+ .octa 0xffff0D0C0B0A09080706050403020100
+ .octa 0xff0E0D0C0B0A09080706050403020100
+ .octa 0x0F0E0D0C0B0A09080706050403020100
+
+
.text
@@ -372,41 +396,72 @@ VARIABLE_OFFSET = 16*8
.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
i = (8-\num_initial_blocks)
+ j = 0
setreg
- mov arg6, %r10 # r10 = AAD
- mov arg7, %r12 # r12 = aadLen
-
-
- mov %r12, %r11
-
- vpxor reg_i, reg_i, reg_i
-_get_AAD_loop\@:
- vmovd (%r10), \T1
- vpslldq $12, \T1, \T1
- vpsrldq $4, reg_i, reg_i
- vpxor \T1, reg_i, reg_i
-
- add $4, %r10
- sub $4, %r12
- jg _get_AAD_loop\@
-
-
- cmp $16, %r11
- je _get_AAD_loop2_done\@
- mov $16, %r12
-
-_get_AAD_loop2\@:
- vpsrldq $4, reg_i, reg_i
- sub $4, %r12
- cmp %r11, %r12
- jg _get_AAD_loop2\@
-
-_get_AAD_loop2_done\@:
-
- #byte-reflect the AAD data
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
-
+ mov arg6, %r10 # r10 = AAD
+ mov arg7, %r12 # r12 = aadLen
+
+
+ mov %r12, %r11
+
+ vpxor reg_j, reg_j, reg_j
+ vpxor reg_i, reg_i, reg_i
+ cmp $16, %r11
+ jl _get_AAD_rest8\@
+_get_AAD_blocks\@:
+ vmovdqu (%r10), reg_i
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i
+ vpxor reg_i, reg_j, reg_j
+ GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
+ add $16, %r10
+ sub $16, %r12
+ sub $16, %r11
+ cmp $16, %r11
+ jge _get_AAD_blocks\@
+ vmovdqu reg_j, reg_i
+ cmp $0, %r11
+ je _get_AAD_done\@
+
+ vpxor reg_i, reg_i, reg_i
+
+ /* read the last <16B of AAD. since we have at least 4B of
+ data right after the AAD (the ICV, and maybe some CT), we can
+ read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\@:
+ cmp $4, %r11
+ jle _get_AAD_rest4\@
+ movq (%r10), \T1
+ add $8, %r10
+ sub $8, %r11
+ vpslldq $8, \T1, \T1
+ vpsrldq $8, reg_i, reg_i
+ vpxor \T1, reg_i, reg_i
+ jmp _get_AAD_rest8\@
+_get_AAD_rest4\@:
+ cmp $0, %r11
+ jle _get_AAD_rest0\@
+ mov (%r10), %eax
+ movq %rax, \T1
+ add $4, %r10
+ sub $4, %r11
+ vpslldq $12, \T1, \T1
+ vpsrldq $4, reg_i, reg_i
+ vpxor \T1, reg_i, reg_i
+_get_AAD_rest0\@:
+ /* finalize: shift out the extra bytes we read, and align
+ left. since pslldq can only shift by an immediate, we use
+ vpshufb and an array of shuffle masks */
+ movq %r12, %r11
+ salq $4, %r11
+ movdqu aad_shift_arr(%r11), \T1
+ vpshufb \T1, reg_i, reg_i
+_get_AAD_rest_final\@:
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i
+ vpxor reg_j, reg_i, reg_i
+ GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
+
+_get_AAD_done\@:
# initialize the data pointer offset as zero
xor %r11, %r11
@@ -480,7 +535,6 @@ _get_AAD_loop2_done\@:
i = (8-\num_initial_blocks)
j = (9-\num_initial_blocks)
setreg
- GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
.rep \num_initial_blocks
vpxor reg_i, reg_j, reg_j
@@ -1427,19 +1481,36 @@ _return_T\@:
cmp $16, %r11
je _T_16\@
- cmp $12, %r11
- je _T_12\@
+ cmp $8, %r11
+ jl _T_4\@
_T_8\@:
vmovq %xmm9, %rax
mov %rax, (%r10)
- jmp _return_T_done\@
-_T_12\@:
- vmovq %xmm9, %rax
- mov %rax, (%r10)
+ add $8, %r10
+ sub $8, %r11
vpsrldq $8, %xmm9, %xmm9
+ cmp $0, %r11
+ je _return_T_done\@
+_T_4\@:
vmovd %xmm9, %eax
- mov %eax, 8(%r10)
+ mov %eax, (%r10)
+ add $4, %r10
+ sub $4, %r11
+ vpsrldq $4, %xmm9, %xmm9
+ cmp $0, %r11
+ je _return_T_done\@
+_T_123\@:
+ vmovd %xmm9, %eax
+ cmp $2, %r11
+ jl _T_1\@
+ mov %ax, (%r10)
+ cmp $2, %r11
+ je _return_T_done\@
+ add $2, %r10
+ sar $16, %eax
+_T_1\@:
+ mov %al, (%r10)
jmp _return_T_done\@
_T_16\@:
@@ -1631,41 +1702,73 @@ ENDPROC(aesni_gcm_dec_avx_gen2)
.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
i = (8-\num_initial_blocks)
+ j = 0
setreg
- mov arg6, %r10 # r10 = AAD
- mov arg7, %r12 # r12 = aadLen
-
-
- mov %r12, %r11
-
- vpxor reg_i, reg_i, reg_i
-_get_AAD_loop\@:
- vmovd (%r10), \T1
- vpslldq $12, \T1, \T1
- vpsrldq $4, reg_i, reg_i
- vpxor \T1, reg_i, reg_i
-
- add $4, %r10
- sub $4, %r12
- jg _get_AAD_loop\@
-
-
- cmp $16, %r11
- je _get_AAD_loop2_done\@
- mov $16, %r12
-
-_get_AAD_loop2\@:
- vpsrldq $4, reg_i, reg_i
- sub $4, %r12
- cmp %r11, %r12
- jg _get_AAD_loop2\@
-
-_get_AAD_loop2_done\@:
-
- #byte-reflect the AAD data
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
-
+ mov arg6, %r10 # r10 = AAD
+ mov arg7, %r12 # r12 = aadLen
+
+
+ mov %r12, %r11
+
+ vpxor reg_j, reg_j, reg_j
+ vpxor reg_i, reg_i, reg_i
+
+ cmp $16, %r11
+ jl _get_AAD_rest8\@
+_get_AAD_blocks\@:
+ vmovdqu (%r10), reg_i
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i
+ vpxor reg_i, reg_j, reg_j
+ GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6
+ add $16, %r10
+ sub $16, %r12
+ sub $16, %r11
+ cmp $16, %r11
+ jge _get_AAD_blocks\@
+ vmovdqu reg_j, reg_i
+ cmp $0, %r11
+ je _get_AAD_done\@
+
+ vpxor reg_i, reg_i, reg_i
+
+ /* read the last <16B of AAD. since we have at least 4B of
+ data right after the AAD (the ICV, and maybe some CT), we can
+ read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\@:
+ cmp $4, %r11
+ jle _get_AAD_rest4\@
+ movq (%r10), \T1
+ add $8, %r10
+ sub $8, %r11
+ vpslldq $8, \T1, \T1
+ vpsrldq $8, reg_i, reg_i
+ vpxor \T1, reg_i, reg_i
+ jmp _get_AAD_rest8\@
+_get_AAD_rest4\@:
+ cmp $0, %r11
+ jle _get_AAD_rest0\@
+ mov (%r10), %eax
+ movq %rax, \T1
+ add $4, %r10
+ sub $4, %r11
+ vpslldq $12, \T1, \T1
+ vpsrldq $4, reg_i, reg_i
+ vpxor \T1, reg_i, reg_i
+_get_AAD_rest0\@:
+ /* finalize: shift out the extra bytes we read, and align
+ left. since pslldq can only shift by an immediate, we use
+ vpshufb and an array of shuffle masks */
+ movq %r12, %r11
+ salq $4, %r11
+ movdqu aad_shift_arr(%r11), \T1
+ vpshufb \T1, reg_i, reg_i
+_get_AAD_rest_final\@:
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i
+ vpxor reg_j, reg_i, reg_i
+ GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
+
+_get_AAD_done\@:
# initialize the data pointer offset as zero
xor %r11, %r11
@@ -1740,7 +1843,6 @@ _get_AAD_loop2_done\@:
i = (8-\num_initial_blocks)
j = (9-\num_initial_blocks)
setreg
- GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
.rep \num_initial_blocks
vpxor reg_i, reg_j, reg_j
@@ -2702,19 +2804,36 @@ _return_T\@:
cmp $16, %r11
je _T_16\@
- cmp $12, %r11
- je _T_12\@
+ cmp $8, %r11
+ jl _T_4\@
_T_8\@:
vmovq %xmm9, %rax
mov %rax, (%r10)
- jmp _return_T_done\@
-_T_12\@:
- vmovq %xmm9, %rax
- mov %rax, (%r10)
+ add $8, %r10
+ sub $8, %r11
vpsrldq $8, %xmm9, %xmm9
+ cmp $0, %r11
+ je _return_T_done\@
+_T_4\@:
vmovd %xmm9, %eax
- mov %eax, 8(%r10)
+ mov %eax, (%r10)
+ add $4, %r10
+ sub $4, %r11
+ vpsrldq $4, %xmm9, %xmm9
+ cmp $0, %r11
+ je _return_T_done\@
+_T_123\@:
+ vmovd %xmm9, %eax
+ cmp $2, %r11
+ jl _T_1\@
+ mov %ax, (%r10)
+ cmp $2, %r11
+ je _return_T_done\@
+ add $2, %r10
+ sar $16, %eax
+_T_1\@:
+ mov %al, (%r10)
jmp _return_T_done\@
_T_16\@:
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 93de8ea51548..4a55cdcdc008 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -61,6 +61,11 @@ struct aesni_rfc4106_gcm_ctx {
u8 nonce[4];
};
+struct generic_gcmaes_ctx {
+ u8 hash_subkey[16] AESNI_ALIGN_ATTR;
+ struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR;
+};
+
struct aesni_xts_ctx {
u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR;
u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR;
@@ -102,13 +107,11 @@ asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out,
* u8 *out, Ciphertext output. Encrypt in-place is allowed.
* const u8 *in, Plaintext input
* unsigned long plaintext_len, Length of data in bytes for encryption.
- * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
- * concatenated with 8 byte Initialisation Vector (from IPSec ESP
- * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001.
+ * 16-byte aligned pointer.
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
* const u8 *aad, Additional Authentication Data (AAD)
- * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this
- * is going to be 8 or 12 bytes
+ * unsigned long aad_len, Length of AAD in bytes.
* u8 *auth_tag, Authenticated Tag output.
* unsigned long auth_tag_len), Authenticated Tag Length in bytes.
* Valid values are 16 (most likely), 12 or 8.
@@ -123,9 +126,8 @@ asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
* u8 *out, Plaintext output. Decrypt in-place is allowed.
* const u8 *in, Ciphertext input
* unsigned long ciphertext_len, Length of data in bytes for decryption.
- * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
- * concatenated with 8 byte Initialisation Vector (from IPSec ESP
- * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001.
+ * 16-byte aligned pointer.
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
* const u8 *aad, Additional Authentication Data (AAD)
* unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
@@ -275,6 +277,16 @@ aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
align = 1;
return PTR_ALIGN(crypto_aead_ctx(tfm), align);
}
+
+static inline struct
+generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm)
+{
+ unsigned long align = AESNI_ALIGN;
+
+ if (align <= crypto_tfm_ctx_alignment())
+ align = 1;
+ return PTR_ALIGN(crypto_aead_ctx(tfm), align);
+}
#endif
static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
@@ -712,32 +724,34 @@ static int rfc4106_set_authsize(struct crypto_aead *parent,
return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
}
-static int helper_rfc4106_encrypt(struct aead_request *req)
+static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
+ unsigned int authsize)
+{
+ switch (authsize) {
+ case 4:
+ case 8:
+ case 12:
+ case 13:
+ case 14:
+ case 15:
+ case 16:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
+ u8 *hash_subkey, u8 *iv, void *aes_ctx)
{
u8 one_entry_in_sg = 0;
u8 *src, *dst, *assoc;
- __be32 counter = cpu_to_be32(1);
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
- void *aes_ctx = &(ctx->aes_key_expanded);
unsigned long auth_tag_len = crypto_aead_authsize(tfm);
- u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
struct scatter_walk src_sg_walk;
struct scatter_walk dst_sg_walk = {};
- unsigned int i;
-
- /* Assuming we are supporting rfc4106 64-bit extended */
- /* sequence numbers We need to have the AAD length equal */
- /* to 16 or 20 bytes */
- if (unlikely(req->assoclen != 16 && req->assoclen != 20))
- return -EINVAL;
-
- /* IV below built */
- for (i = 0; i < 4; i++)
- *(iv+i) = ctx->nonce[i];
- for (i = 0; i < 8; i++)
- *(iv+4+i) = req->iv[i];
- *((__be32 *)(iv+12)) = counter;
if (sg_is_last(req->src) &&
(!PageHighMem(sg_page(req->src)) ||
@@ -768,7 +782,7 @@ static int helper_rfc4106_encrypt(struct aead_request *req)
kernel_fpu_begin();
aesni_gcm_enc_tfm(aes_ctx, dst, src, req->cryptlen, iv,
- ctx->hash_subkey, assoc, req->assoclen - 8,
+ hash_subkey, assoc, assoclen,
dst + req->cryptlen, auth_tag_len);
kernel_fpu_end();
@@ -791,37 +805,20 @@ static int helper_rfc4106_encrypt(struct aead_request *req)
return 0;
}
-static int helper_rfc4106_decrypt(struct aead_request *req)
+static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
+ u8 *hash_subkey, u8 *iv, void *aes_ctx)
{
u8 one_entry_in_sg = 0;
u8 *src, *dst, *assoc;
unsigned long tempCipherLen = 0;
- __be32 counter = cpu_to_be32(1);
- int retval = 0;
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
- void *aes_ctx = &(ctx->aes_key_expanded);
unsigned long auth_tag_len = crypto_aead_authsize(tfm);
- u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
u8 authTag[16];
struct scatter_walk src_sg_walk;
struct scatter_walk dst_sg_walk = {};
- unsigned int i;
-
- if (unlikely(req->assoclen != 16 && req->assoclen != 20))
- return -EINVAL;
-
- /* Assuming we are supporting rfc4106 64-bit extended */
- /* sequence numbers We need to have the AAD length */
- /* equal to 16 or 20 bytes */
+ int retval = 0;
tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
- /* IV below built */
- for (i = 0; i < 4; i++)
- *(iv+i) = ctx->nonce[i];
- for (i = 0; i < 8; i++)
- *(iv+4+i) = req->iv[i];
- *((__be32 *)(iv+12)) = counter;
if (sg_is_last(req->src) &&
(!PageHighMem(sg_page(req->src)) ||
@@ -838,7 +835,6 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
scatterwalk_start(&dst_sg_walk, req->dst);
dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
}
-
} else {
/* Allocate memory for src, dst, assoc */
assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
@@ -850,9 +846,10 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
dst = src;
}
+
kernel_fpu_begin();
aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv,
- ctx->hash_subkey, assoc, req->assoclen - 8,
+ hash_subkey, assoc, assoclen,
authTag, auth_tag_len);
kernel_fpu_end();
@@ -875,6 +872,60 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
kfree(assoc);
}
return retval;
+
+}
+
+static int helper_rfc4106_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ void *aes_ctx = &(ctx->aes_key_expanded);
+ u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+ unsigned int i;
+ __be32 counter = cpu_to_be32(1);
+
+ /* Assuming we are supporting rfc4106 64-bit extended */
+ /* sequence numbers We need to have the AAD length equal */
+ /* to 16 or 20 bytes */
+ if (unlikely(req->assoclen != 16 && req->assoclen != 20))
+ return -EINVAL;
+
+ /* IV below built */
+ for (i = 0; i < 4; i++)
+ *(iv+i) = ctx->nonce[i];
+ for (i = 0; i < 8; i++)
+ *(iv+4+i) = req->iv[i];
+ *((__be32 *)(iv+12)) = counter;
+
+ return gcmaes_encrypt(req, req->assoclen - 8, ctx->hash_subkey, iv,
+ aes_ctx);
+}
+
+static int helper_rfc4106_decrypt(struct aead_request *req)
+{
+ __be32 counter = cpu_to_be32(1);
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ void *aes_ctx = &(ctx->aes_key_expanded);
+ u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+ unsigned int i;
+
+ if (unlikely(req->assoclen != 16 && req->assoclen != 20))
+ return -EINVAL;
+
+ /* Assuming we are supporting rfc4106 64-bit extended */
+ /* sequence numbers We need to have the AAD length */
+ /* equal to 16 or 20 bytes */
+
+ /* IV below built */
+ for (i = 0; i < 4; i++)
+ *(iv+i) = ctx->nonce[i];
+ for (i = 0; i < 8; i++)
+ *(iv+4+i) = req->iv[i];
+ *((__be32 *)(iv+12)) = counter;
+
+ return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv,
+ aes_ctx);
}
static int rfc4106_encrypt(struct aead_request *req)
@@ -1035,6 +1086,46 @@ struct {
};
#ifdef CONFIG_X86_64
+static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
+ unsigned int key_len)
+{
+ struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead);
+
+ return aes_set_key_common(crypto_aead_tfm(aead),
+ &ctx->aes_key_expanded, key, key_len) ?:
+ rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+}
+
+static int generic_gcmaes_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
+ void *aes_ctx = &(ctx->aes_key_expanded);
+ u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+ __be32 counter = cpu_to_be32(1);
+
+ memcpy(iv, req->iv, 12);
+ *((__be32 *)(iv+12)) = counter;
+
+ return gcmaes_encrypt(req, req->assoclen, ctx->hash_subkey, iv,
+ aes_ctx);
+}
+
+static int generic_gcmaes_decrypt(struct aead_request *req)
+{
+ __be32 counter = cpu_to_be32(1);
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ void *aes_ctx = &(ctx->aes_key_expanded);
+ u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+
+ memcpy(iv, req->iv, 12);
+ *((__be32 *)(iv+12)) = counter;
+
+ return gcmaes_decrypt(req, req->assoclen, ctx->hash_subkey, iv,
+ aes_ctx);
+}
+
static struct aead_alg aesni_aead_algs[] = { {
.setkey = common_rfc4106_set_key,
.setauthsize = common_rfc4106_set_authsize,
@@ -1069,6 +1160,23 @@ static struct aead_alg aesni_aead_algs[] = { {
.cra_ctxsize = sizeof(struct cryptd_aead *),
.cra_module = THIS_MODULE,
},
+}, {
+ .setkey = generic_gcmaes_set_key,
+ .setauthsize = generic_gcmaes_set_authsize,
+ .encrypt = generic_gcmaes_encrypt,
+ .decrypt = generic_gcmaes_decrypt,
+ .ivsize = 12,
+ .maxauthsize = 16,
+ .base = {
+ .cra_name = "gcm(aes)",
+ .cra_driver_name = "generic-gcm-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct generic_gcmaes_ctx),
+ .cra_alignmask = AESNI_ALIGN - 1,
+ .cra_module = THIS_MODULE,
+ },
} };
#else
static struct aead_alg aesni_aead_algs[0];
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 24ac9fad832d..d61e57960fe0 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -176,9 +176,6 @@ __glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
src -= 1;
dst -= 1;
} while (nbytes >= func_bytes);
-
- if (nbytes < bsize)
- goto done;
}
}
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c
index 2dd3674b5a1e..458409b7568d 100644
--- a/arch/x86/crypto/sha512-mb/sha512_mb.c
+++ b/arch/x86/crypto/sha512-mb/sha512_mb.c
@@ -269,19 +269,19 @@ static struct sha512_hash_ctx
* LAST
*/
ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
- return ctx;
+ goto unlock;
}
if (ctx->status & HASH_CTX_STS_PROCESSING) {
/* Cannot submit to a currently processing job. */
ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
- return ctx;
+ goto unlock;
}
if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
/* Cannot update a finished job. */
ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
- return ctx;
+ goto unlock;
}
@@ -363,6 +363,7 @@ static struct sha512_hash_ctx
}
ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+unlock:
spin_unlock_irqrestore(&cstate->work_lock, irqflags);
return ctx;
}
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 139ad7726e10..726355ce8497 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -78,9 +78,6 @@ static int vdso_mremap(const struct vm_special_mapping *sm,
if (image->size != new_size)
return -EINVAL;
- if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
- return -EFAULT;
-
vdso_fix_landing(image, new_vma);
current->mm->context.vdso = (void __user *)new_vma->vm_start;
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 24118c0b4640..5343c19814b3 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -116,7 +116,6 @@ struct compat_statfs {
int f_spare[4];
};
-#define COMPAT_RLIM_OLD_INFINITY 0x7fffffff
#define COMPAT_RLIM_INFINITY 0xffffffff
typedef u32 compat_old_sigset_t; /* at least 32 bits */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 08a0838b83fb..398c79889f5c 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -19,8 +19,6 @@
# define ISA_DMA_BIT_MASK DMA_BIT_MASK(32)
#endif
-#define DMA_ERROR_CODE 0
-
extern int iommu_merge;
extern struct device x86_dma_fallback_dev;
extern int panic_on_overflow;
@@ -35,9 +33,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
#define arch_dma_alloc_attrs arch_dma_alloc_attrs
-#define HAVE_ARCH_DMA_SUPPORTED 1
-extern int dma_supported(struct device *hwdev, u64 mask);
-
extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_addr, gfp_t flag,
unsigned long attrs);
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index 3a106165e03a..535af0f2d8ac 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -85,4 +85,8 @@ static inline void arch_clear_hugepage_flags(struct page *page)
{
}
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
+static inline bool gigantic_page_supported(void) { return true; }
+#endif
+
#endif /* _ASM_X86_HUGETLB_H */
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index 793869879464..fca144a104e4 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -6,6 +6,8 @@ extern int force_iommu, no_iommu;
extern int iommu_detected;
extern int iommu_pass_through;
+int x86_dma_supported(struct device *dev, u64 mask);
+
/* 10 seconds */
#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 695605eb1dfb..1588e9e3dc01 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -48,28 +48,31 @@
#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS
/* x86-specific vcpu->requests bit members */
-#define KVM_REQ_MIGRATE_TIMER 8
-#define KVM_REQ_REPORT_TPR_ACCESS 9
-#define KVM_REQ_TRIPLE_FAULT 10
-#define KVM_REQ_MMU_SYNC 11
-#define KVM_REQ_CLOCK_UPDATE 12
-#define KVM_REQ_EVENT 14
-#define KVM_REQ_APF_HALT 15
-#define KVM_REQ_STEAL_UPDATE 16
-#define KVM_REQ_NMI 17
-#define KVM_REQ_PMU 18
-#define KVM_REQ_PMI 19
-#define KVM_REQ_SMI 20
-#define KVM_REQ_MASTERCLOCK_UPDATE 21
-#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_GLOBAL_CLOCK_UPDATE 24
-#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_HV_CRASH 26
-#define KVM_REQ_IOAPIC_EOI_EXIT 27
-#define KVM_REQ_HV_RESET 28
-#define KVM_REQ_HV_EXIT 29
-#define KVM_REQ_HV_STIMER 30
+#define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0)
+#define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1)
+#define KVM_REQ_TRIPLE_FAULT KVM_ARCH_REQ(2)
+#define KVM_REQ_MMU_SYNC KVM_ARCH_REQ(3)
+#define KVM_REQ_CLOCK_UPDATE KVM_ARCH_REQ(4)
+#define KVM_REQ_EVENT KVM_ARCH_REQ(6)
+#define KVM_REQ_APF_HALT KVM_ARCH_REQ(7)
+#define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(8)
+#define KVM_REQ_NMI KVM_ARCH_REQ(9)
+#define KVM_REQ_PMU KVM_ARCH_REQ(10)
+#define KVM_REQ_PMI KVM_ARCH_REQ(11)
+#define KVM_REQ_SMI KVM_ARCH_REQ(12)
+#define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13)
+#define KVM_REQ_MCLOCK_INPROGRESS \
+ KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_SCAN_IOAPIC \
+ KVM_ARCH_REQ_FLAGS(15, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_GLOBAL_CLOCK_UPDATE KVM_ARCH_REQ(16)
+#define KVM_REQ_APIC_PAGE_RELOAD \
+ KVM_ARCH_REQ_FLAGS(17, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_HV_CRASH KVM_ARCH_REQ(18)
+#define KVM_REQ_IOAPIC_EOI_EXIT KVM_ARCH_REQ(19)
+#define KVM_REQ_HV_RESET KVM_ARCH_REQ(20)
+#define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21)
+#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -254,7 +257,8 @@ union kvm_mmu_page_role {
unsigned cr0_wp:1;
unsigned smep_andnot_wp:1;
unsigned smap_andnot_wp:1;
- unsigned :8;
+ unsigned ad_disabled:1;
+ unsigned :7;
/*
* This is left at the top of the word so that
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index d5acc27ed1cc..2b58c8c1eeaa 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -136,7 +136,6 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
}
}
-#define hv_get_current_tick(tick) rdmsrl(HV_X64_MSR_TIME_REF_COUNT, tick)
#define hv_init_timer(timer, tick) wrmsrl(timer, tick)
#define hv_init_timer_config(config, val) wrmsrl(config, val)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 18b162322eff..5573c75f8e4c 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -251,9 +251,13 @@
#define HWP_MIN_PERF(x) (x & 0xff)
#define HWP_MAX_PERF(x) ((x & 0xff) << 8)
#define HWP_DESIRED_PERF(x) ((x & 0xff) << 16)
-#define HWP_ENERGY_PERF_PREFERENCE(x) ((x & 0xff) << 24)
-#define HWP_ACTIVITY_WINDOW(x) ((x & 0xff3) << 32)
-#define HWP_PACKAGE_CONTROL(x) ((x & 0x1) << 42)
+#define HWP_ENERGY_PERF_PREFERENCE(x) (((unsigned long long) x & 0xff) << 24)
+#define HWP_EPP_PERFORMANCE 0x00
+#define HWP_EPP_BALANCE_PERFORMANCE 0x80
+#define HWP_EPP_BALANCE_POWERSAVE 0xC0
+#define HWP_EPP_POWERSAVE 0xFF
+#define HWP_ACTIVITY_WINDOW(x) ((unsigned long long)(x & 0xff3) << 32)
+#define HWP_PACKAGE_CONTROL(x) ((unsigned long long)(x & 0x1) << 42)
/* IA32_HWP_STATUS */
#define HWP_GUARANTEED_CHANGE(x) (x & 0x1)
@@ -422,6 +426,8 @@
#define MSR_IA32_TSC_ADJUST 0x0000003b
#define MSR_IA32_BNDCFGS 0x00000d90
+#define MSR_IA32_BNDCFGS_RSVD 0x00000ffc
+
#define MSR_IA32_XSS 0x00000da0
#define FEATURE_CONTROL_LOCKED (1<<0)
@@ -476,9 +482,11 @@
#define MSR_MISC_PWR_MGMT 0x000001aa
#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
-#define ENERGY_PERF_BIAS_PERFORMANCE 0
-#define ENERGY_PERF_BIAS_NORMAL 6
-#define ENERGY_PERF_BIAS_POWERSAVE 15
+#define ENERGY_PERF_BIAS_PERFORMANCE 0
+#define ENERGY_PERF_BIAS_BALANCE_PERFORMANCE 4
+#define ENERGY_PERF_BIAS_NORMAL 6
+#define ENERGY_PERF_BIAS_BALANCE_POWERSAVE 8
+#define ENERGY_PERF_BIAS_POWERSAVE 15
#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
deleted file mode 100644
index 0ff8fe71b255..000000000000
--- a/arch/x86/include/asm/pmem.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright(c) 2015 Intel Corporation. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- */
-#ifndef __ASM_X86_PMEM_H__
-#define __ASM_X86_PMEM_H__
-
-#include <linux/uaccess.h>
-#include <asm/cacheflush.h>
-#include <asm/cpufeature.h>
-#include <asm/special_insns.h>
-
-#ifdef CONFIG_ARCH_HAS_PMEM_API
-/**
- * arch_memcpy_to_pmem - copy data to persistent memory
- * @dst: destination buffer for the copy
- * @src: source buffer for the copy
- * @n: length of the copy in bytes
- *
- * Copy data to persistent memory media via non-temporal stores so that
- * a subsequent pmem driver flush operation will drain posted write queues.
- */
-static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)
-{
- int rem;
-
- /*
- * We are copying between two kernel buffers, if
- * __copy_from_user_inatomic_nocache() returns an error (page
- * fault) we would have already reported a general protection fault
- * before the WARN+BUG.
- */
- rem = __copy_from_user_inatomic_nocache(dst, (void __user *) src, n);
- if (WARN(rem, "%s: fault copying %p <- %p unwritten: %d\n",
- __func__, dst, src, rem))
- BUG();
-}
-
-/**
- * arch_wb_cache_pmem - write back a cache range with CLWB
- * @vaddr: virtual start address
- * @size: number of bytes to write back
- *
- * Write back a cache range using the CLWB (cache line write back)
- * instruction. Note that @size is internally rounded up to be cache
- * line size aligned.
- */
-static inline void arch_wb_cache_pmem(void *addr, size_t size)
-{
- u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
- unsigned long clflush_mask = x86_clflush_size - 1;
- void *vend = addr + size;
- void *p;
-
- for (p = (void *)((unsigned long)addr & ~clflush_mask);
- p < vend; p += x86_clflush_size)
- clwb(p);
-}
-
-/**
- * arch_copy_from_iter_pmem - copy data from an iterator to PMEM
- * @addr: PMEM destination address
- * @bytes: number of bytes to copy
- * @i: iterator with source data
- *
- * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'.
- */
-static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes,
- struct iov_iter *i)
-{
- size_t len;
-
- /* TODO: skip the write-back by always using non-temporal stores */
- len = copy_from_iter_nocache(addr, bytes, i);
-
- /*
- * In the iovec case on x86_64 copy_from_iter_nocache() uses
- * non-temporal stores for the bulk of the transfer, but we need
- * to manually flush if the transfer is unaligned. A cached
- * memory copy is used when destination or size is not naturally
- * aligned. That is:
- * - Require 8-byte alignment when size is 8 bytes or larger.
- * - Require 4-byte alignment when size is 4 bytes.
- *
- * In the non-iovec case the entire destination needs to be
- * flushed.
- */
- if (iter_is_iovec(i)) {
- unsigned long flushed, dest = (unsigned long) addr;
-
- if (bytes < 8) {
- if (!IS_ALIGNED(dest, 4) || (bytes != 4))
- arch_wb_cache_pmem(addr, bytes);
- } else {
- if (!IS_ALIGNED(dest, 8)) {
- dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
- arch_wb_cache_pmem(addr, 1);
- }
-
- flushed = dest - (unsigned long) addr;
- if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8))
- arch_wb_cache_pmem(addr + bytes - 1, 1);
- }
- } else
- arch_wb_cache_pmem(addr, bytes);
-
- return len;
-}
-
-/**
- * arch_clear_pmem - zero a PMEM memory range
- * @addr: virtual start address
- * @size: number of bytes to zero
- *
- * Write zeros into the memory range starting at 'addr' for 'size' bytes.
- */
-static inline void arch_clear_pmem(void *addr, size_t size)
-{
- memset(addr, 0, size);
- arch_wb_cache_pmem(addr, size);
-}
-
-static inline void arch_invalidate_pmem(void *addr, size_t size)
-{
- clflush_cache_range(addr, size);
-}
-#endif /* CONFIG_ARCH_HAS_PMEM_API */
-#endif /* __ASM_X86_PMEM_H__ */
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 733bae07fb29..1f22bc277c45 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -109,6 +109,11 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt)
return 0;
}
+#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
+#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
+void memcpy_flushcache(void *dst, const void *src, size_t cnt);
+#endif
+
#endif /* __KERNEL__ */
#endif /* _ASM_X86_STRING_64_H */
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 6136a18152af..2bd96b4df140 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -42,8 +42,7 @@ struct saved_context {
set_debugreg((thread)->debugreg##register, register)
/* routines for saving/restoring kernel state */
-extern int acpi_save_state_mem(void);
-extern char core_restore_code;
-extern char restore_registers;
+extern char core_restore_code[];
+extern char restore_registers[];
#endif /* _ASM_X86_SUSPEND_64_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index a059aac9e937..476ea27f490b 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -565,7 +565,6 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
extern __must_check long
strncpy_from_user(char *dst, const char __user *src, long count);
-extern __must_check long strlen_user(const char __user *str);
extern __must_check long strnlen_user(const char __user *str, long n);
unsigned long __must_check clear_user(void __user *mem, unsigned long len);
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index c5504b9a472e..b16f6a1d8b26 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -171,6 +171,10 @@ unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigne
extern long __copy_user_nocache(void *dst, const void __user *src,
unsigned size, int zerorest);
+extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
+extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
+ size_t len);
+
static inline int
__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
unsigned size)
@@ -179,6 +183,13 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
return __copy_user_nocache(dst, src, size, 0);
}
+static inline int
+__copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
+{
+ kasan_check_write(dst, size);
+ return __copy_user_flushcache(dst, src, size);
+}
+
unsigned long
copy_user_handle_tail(char *to, char *from, unsigned len);
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index f6d20f6cca12..11071fcd630e 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -43,6 +43,7 @@
#include <asm/page.h>
#include <asm/pgtable.h>
+#include <asm/smap.h>
#include <xen/interface/xen.h>
#include <xen/interface/sched.h>
@@ -50,6 +51,8 @@
#include <xen/interface/platform.h>
#include <xen/interface/xen-mca.h>
+struct xen_dm_op_buf;
+
/*
* The hypercall asms have to meet several constraints:
* - Work on 32- and 64-bit.
@@ -214,10 +217,12 @@ privcmd_call(unsigned call,
__HYPERCALL_DECLS;
__HYPERCALL_5ARG(a1, a2, a3, a4, a5);
+ stac();
asm volatile("call *%[call]"
: __HYPERCALL_5PARAM
: [call] "a" (&hypercall_page[call])
: __HYPERCALL_CLOBBER5);
+ clac();
return (long)__res;
}
@@ -474,9 +479,13 @@ HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
static inline int
HYPERVISOR_dm_op(
- domid_t dom, unsigned int nr_bufs, void *bufs)
+ domid_t dom, unsigned int nr_bufs, struct xen_dm_op_buf *bufs)
{
- return _hypercall3(int, dm_op, dom, nr_bufs, bufs);
+ int ret;
+ stac();
+ ret = _hypercall3(int, dm_op, dom, nr_bufs, bufs);
+ clac();
+ return ret;
}
static inline void
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
index 83b6e9a0dce4..da1489cb64dc 100644
--- a/arch/x86/include/uapi/asm/Kbuild
+++ b/arch/x86/include/uapi/asm/Kbuild
@@ -1,6 +1,6 @@
# UAPI Header export list
include include/uapi/asm-generic/Kbuild.asm
-genhdr-y += unistd_32.h
-genhdr-y += unistd_64.h
-genhdr-y += unistd_x32.h
+generated-y += unistd_32.h
+generated-y += unistd_64.h
+generated-y += unistd_x32.h
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index f4fef5a24ebd..127ddadee1a5 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -150,6 +150,12 @@
#define HV_X64_DEPRECATING_AEOI_RECOMMENDED (1 << 9)
/*
+ * HV_VP_SET available
+ */
+#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11)
+
+
+/*
* Crash notification flag.
*/
#define HV_CRASH_CTL_CRASH_NOTIFY (1ULL << 63)
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 8233a630280f..dde437f5d14f 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -167,7 +167,8 @@ static int __init ffh_cstate_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
- if (c->x86_vendor != X86_VENDOR_INTEL)
+ if (c->x86_vendor != X86_VENDOR_INTEL &&
+ c->x86_vendor != X86_VENDOR_AMD)
return -1;
cpu_cstate_entry = alloc_percpu(struct cstate_entry);
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 815dd63f49d0..cc0e8bc0ea3f 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -704,6 +704,7 @@ static const struct dma_map_ops gart_dma_ops = {
.alloc = gart_alloc_coherent,
.free = gart_free_coherent,
.mapping_error = gart_mapping_error,
+ .dma_supported = x86_dma_supported,
};
static void gart_iommu_shutdown(void)
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 52000010c62e..cdf82492b770 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -21,6 +21,7 @@ obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
obj-y += bugs.o
+obj-$(CONFIG_CPU_FREQ) += aperfmperf.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
new file mode 100644
index 000000000000..d869c8671e36
--- /dev/null
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -0,0 +1,79 @@
+/*
+ * x86 APERF/MPERF KHz calculation for
+ * /sys/.../cpufreq/scaling_cur_freq
+ *
+ * Copyright (C) 2017 Intel Corp.
+ * Author: Len Brown <len.brown@intel.com>
+ *
+ * This file is licensed under GPLv2.
+ */
+
+#include <linux/jiffies.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+
+struct aperfmperf_sample {
+ unsigned int khz;
+ unsigned long jiffies;
+ u64 aperf;
+ u64 mperf;
+};
+
+static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
+
+/*
+ * aperfmperf_snapshot_khz()
+ * On the current CPU, snapshot APERF, MPERF, and jiffies
+ * unless we already did it within the last 10 ms, then
+ * calculate kHz and save the snapshot.
+ */
+static void aperfmperf_snapshot_khz(void *dummy)
+{
+ u64 aperf, aperf_delta;
+ u64 mperf, mperf_delta;
+ struct aperfmperf_sample *s = this_cpu_ptr(&samples);
+
+ /* Don't bother re-computing within 10 ms */
+ if (time_before(jiffies, s->jiffies + HZ/100))
+ return;
+
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+
+ aperf_delta = aperf - s->aperf;
+ mperf_delta = mperf - s->mperf;
+
+ /*
+ * There is no architectural guarantee that MPERF
+ * increments faster than we can read it.
+ */
+ if (mperf_delta == 0)
+ return;
+
+ /*
+ * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then
+ * khz = (cpu_khz * aperf_delta) / mperf_delta
+ */
+ if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta)
+ s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
+ else /* khz = aperf_delta / (mperf_delta / cpu_khz) */
+ s->khz = div64_u64(aperf_delta,
+ div64_u64(mperf_delta, cpu_khz));
+ s->jiffies = jiffies;
+ s->aperf = aperf;
+ s->mperf = mperf;
+}
+
+unsigned int arch_freq_get_on_cpu(int cpu)
+{
+ if (!cpu_khz)
+ return 0;
+
+ if (!static_cpu_has(X86_FEATURE_APERFMPERF))
+ return 0;
+
+ smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
+
+ return per_cpu(samples.khz, cpu);
+}
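
The new file computes an effective frequency as cpu_khz scaled by the APERF/MPERF deltas, picking the division order that avoids 64-bit overflow. A standalone sketch of just that arithmetic, with div64_u64 replaced by ordinary 64-bit division (fine outside the kernel):

#include <stdint.h>
#include <stdio.h>

/* khz = cpu_khz * aperf_delta / mperf_delta, without 64-bit overflow. */
static uint64_t effective_khz(uint64_t cpu_khz, uint64_t aperf_delta,
                              uint64_t mperf_delta)
{
    if (cpu_khz == 0 || mperf_delta == 0)
        return 0;    /* nothing to scale, or MPERF did not advance */

    if (UINT64_MAX / cpu_khz > aperf_delta)
        return cpu_khz * aperf_delta / mperf_delta;

    /* huge deltas: trade a little precision to stay within 64 bits */
    return aperf_delta / (mperf_delta / cpu_khz);
}

int main(void)
{
    /* base clock 2400000 kHz, APERF ran 1.5x as fast as MPERF */
    printf("%llu kHz\n",
           (unsigned long long)effective_khz(2400000, 300000, 200000));
    return 0;
}
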
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 6df621ae62a7..218f79825b3c 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -2,7 +2,6 @@
#include <linux/timex.h>
#include <linux/string.h>
#include <linux/seq_file.h>
-#include <linux/cpufreq.h>
/*
* Get CPU information for use by the procfs.
@@ -76,14 +75,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
if (c->microcode)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
- if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = cpufreq_quick_get(cpu);
-
- if (!freq)
- freq = cpu_khz;
+ if (cpu_has(c, X86_FEATURE_TSC))
seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
- freq / 1000, (freq % 1000));
- }
+ cpu_khz / 1000, (cpu_khz % 1000));
/* Cache size */
if (c->x86_cache_size >= 0)
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index fda7867046d0..5286a4a92cf7 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -50,6 +50,8 @@
#include <asm/x86_init.h>
#include <asm/iommu_table.h>
+#define CALGARY_MAPPING_ERROR 0
+
#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
int use_calgary __read_mostly = 1;
#else
@@ -252,7 +254,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
if (panic_on_overflow)
panic("Calgary: fix the allocator.\n");
else
- return DMA_ERROR_CODE;
+ return CALGARY_MAPPING_ERROR;
}
}
@@ -272,10 +274,10 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
entry = iommu_range_alloc(dev, tbl, npages);
- if (unlikely(entry == DMA_ERROR_CODE)) {
+ if (unlikely(entry == CALGARY_MAPPING_ERROR)) {
pr_warn("failed to allocate %u pages in iommu %p\n",
npages, tbl);
- return DMA_ERROR_CODE;
+ return CALGARY_MAPPING_ERROR;
}
/* set the return dma address */
@@ -295,7 +297,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
unsigned long flags;
/* were we called with bad_dma_address? */
- badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
+ badend = CALGARY_MAPPING_ERROR + (EMERGENCY_PAGES * PAGE_SIZE);
if (unlikely(dma_addr < badend)) {
WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
"address 0x%Lx\n", dma_addr);
@@ -380,7 +382,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
entry = iommu_range_alloc(dev, tbl, npages);
- if (entry == DMA_ERROR_CODE) {
+ if (entry == CALGARY_MAPPING_ERROR) {
/* makes sure unmap knows to stop */
s->dma_length = 0;
goto error;
@@ -398,7 +400,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
error:
calgary_unmap_sg(dev, sg, nelems, dir, 0);
for_each_sg(sg, s, nelems, i) {
- sg->dma_address = DMA_ERROR_CODE;
+ sg->dma_address = CALGARY_MAPPING_ERROR;
sg->dma_length = 0;
}
return 0;
@@ -453,7 +455,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
/* set up tces to cover the allocated range */
mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
- if (mapping == DMA_ERROR_CODE)
+ if (mapping == CALGARY_MAPPING_ERROR)
goto free;
*dma_handle = mapping;
return ret;
@@ -478,6 +480,11 @@ static void calgary_free_coherent(struct device *dev, size_t size,
free_pages((unsigned long)vaddr, get_order(size));
}
+static int calgary_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ return dma_addr == CALGARY_MAPPING_ERROR;
+}
+
static const struct dma_map_ops calgary_dma_ops = {
.alloc = calgary_alloc_coherent,
.free = calgary_free_coherent,
@@ -485,6 +492,8 @@ static const struct dma_map_ops calgary_dma_ops = {
.unmap_sg = calgary_unmap_sg,
.map_page = calgary_map_page,
.unmap_page = calgary_unmap_page,
+ .mapping_error = calgary_mapping_error,
+ .dma_supported = x86_dma_supported,
};
static inline void __iomem * busno_to_bbar(unsigned char num)
@@ -732,7 +741,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
struct iommu_table *tbl = pci_iommu(dev->bus);
/* reserve EMERGENCY_PAGES from bad_dma_address and up */
- iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES);
+ iommu_range_reserve(tbl, CALGARY_MAPPING_ERROR, EMERGENCY_PAGES);
/* avoid the BIOS/VGA first 640KB-1MB region */
/* for CalIOC2 - avoid the entire first MB */
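
With DMA_ERROR_CODE gone from the core, Calgary now defines its own error sentinel and reports it through a .mapping_error callback. A reduced sketch of that pattern using mock types; my_dma_ops and MY_MAPPING_ERROR are illustrative names, not the kernel's:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t dma_addr_t;

#define MY_MAPPING_ERROR 0      /* driver-private sentinel, as in the patch */

struct my_dma_ops {
    dma_addr_t (*map_page)(void *page);
    int (*mapping_error)(dma_addr_t dma_addr);
};

static dma_addr_t my_map_page(void *page)
{
    if (!page)
        return MY_MAPPING_ERROR;        /* allocation/translation failed */
    return (dma_addr_t)(uintptr_t)page;
}

static int my_mapping_error(dma_addr_t dma_addr)
{
    return dma_addr == MY_MAPPING_ERROR;
}

static const struct my_dma_ops ops = {
    .map_page      = my_map_page,
    .mapping_error = my_mapping_error,
};

int main(void)
{
    dma_addr_t bad = ops.map_page(NULL);
    printf("mapping_error(bad) = %d\n", ops.mapping_error(bad));
    return 0;
}
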
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 3a216ec869cd..5e16d3f29594 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -213,10 +213,8 @@ static __init int iommu_setup(char *p)
}
early_param("iommu", iommu_setup);
-int dma_supported(struct device *dev, u64 mask)
+int x86_dma_supported(struct device *dev, u64 mask)
{
- const struct dma_map_ops *ops = get_dma_ops(dev);
-
#ifdef CONFIG_PCI
if (mask > 0xffffffff && forbid_dac > 0) {
dev_info(dev, "PCI: Disallowing DAC for device\n");
@@ -224,9 +222,6 @@ int dma_supported(struct device *dev, u64 mask)
}
#endif
- if (ops->dma_supported)
- return ops->dma_supported(dev, mask);
-
/* Copied from i386. Doesn't make much sense, because it will
only work for pci_alloc_coherent.
The caller just has to use GFP_DMA in this case. */
@@ -252,7 +247,6 @@ int dma_supported(struct device *dev, u64 mask)
return 1;
}
-EXPORT_SYMBOL(dma_supported);
static int __init pci_iommu_init(void)
{
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a88952ef371c..a6d404087fe3 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -11,6 +11,8 @@
#include <asm/iommu.h>
#include <asm/dma.h>
+#define NOMMU_MAPPING_ERROR 0
+
static int
check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
{
@@ -33,7 +35,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
dma_addr_t bus = page_to_phys(page) + offset;
WARN_ON(size == 0);
if (!check_addr("map_single", dev, bus, size))
- return DMA_ERROR_CODE;
+ return NOMMU_MAPPING_ERROR;
flush_write_buffers();
return bus;
}
@@ -88,6 +90,11 @@ static void nommu_sync_sg_for_device(struct device *dev,
flush_write_buffers();
}
+static int nommu_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ return dma_addr == NOMMU_MAPPING_ERROR;
+}
+
const struct dma_map_ops nommu_dma_ops = {
.alloc = dma_generic_alloc_coherent,
.free = dma_generic_free_coherent,
@@ -96,4 +103,6 @@ const struct dma_map_ops nommu_dma_ops = {
.sync_single_for_device = nommu_sync_single_for_device,
.sync_sg_for_device = nommu_sync_sg_for_device,
.is_phys = 1,
+ .mapping_error = nommu_mapping_error,
+ .dma_supported = x86_dma_supported,
};
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index a6fd40aade7c..da6728383052 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -144,6 +144,14 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
return best && (best->ebx & bit(X86_FEATURE_RTM));
}
+static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ebx & bit(X86_FEATURE_MPX));
+}
+
static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
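
guest_cpuid_has_mpx() checks the MPX feature bit in EBX of the guest's CPUID leaf 7, subleaf 0. The equivalent host-side probe can be written with GCC/clang's <cpuid.h> helper; the bit position below (EBX bit 14) is an assumption taken from the SDM, not something stated in the patch:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
        puts("CPUID leaf 7 not supported");
        return 0;
    }
    /* MPX is reported in CPUID.(EAX=7,ECX=0):EBX[14] (assumed bit position) */
    printf("MPX %ssupported on this host\n", (ebx & (1u << 14)) ? "" : "not ");
    return 0;
}
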
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 80890dee66ce..fb0055953fbc 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -900,7 +900,7 @@ static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
if (rc != X86EMUL_CONTINUE) \
goto done; \
ctxt->_eip += sizeof(_type); \
- _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \
+ memcpy(&_x, ctxt->fetch.ptr, sizeof(_type)); \
ctxt->fetch.ptr += sizeof(_type); \
_x; \
})
@@ -3942,6 +3942,25 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt)
}
/*
+ * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save
+ * and restore MXCSR.
+ */
+static size_t __fxstate_size(int nregs)
+{
+ return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16;
+}
+
+static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt)
+{
+ bool cr4_osfxsr;
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return __fxstate_size(16);
+
+ cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR;
+ return __fxstate_size(cr4_osfxsr ? 8 : 0);
+}
+
+/*
* FXSAVE and FXRSTOR have 4 different formats depending on execution mode,
* 1) 16 bit mode
* 2) 32 bit mode
@@ -3962,7 +3981,6 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt)
static int em_fxsave(struct x86_emulate_ctxt *ctxt)
{
struct fxregs_state fx_state;
- size_t size;
int rc;
rc = check_fxsr(ctxt);
@@ -3978,68 +3996,42 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
- if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)
- size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]);
- else
- size = offsetof(struct fxregs_state, xmm_space[0]);
-
- return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
-}
-
-static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt,
- struct fxregs_state *new)
-{
- int rc = X86EMUL_CONTINUE;
- struct fxregs_state old;
-
- rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old));
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- /*
- * 64 bit host will restore XMM 8-15, which is not correct on non-64
- * bit guests. Load the current values in order to preserve 64 bit
- * XMMs after fxrstor.
- */
-#ifdef CONFIG_X86_64
- /* XXX: accessing XMM 8-15 very awkwardly */
- memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16);
-#endif
-
- /*
- * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but
- * does save and restore MXCSR.
- */
- if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))
- memcpy(new->xmm_space, old.xmm_space, 8 * 16);
-
- return rc;
+ return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state,
+ fxstate_size(ctxt));
}
static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
{
struct fxregs_state fx_state;
int rc;
+ size_t size;
rc = check_fxsr(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512);
- if (rc != X86EMUL_CONTINUE)
- return rc;
+ ctxt->ops->get_fpu(ctxt);
- if (fx_state.mxcsr >> 16)
- return emulate_gp(ctxt, 0);
+ size = fxstate_size(ctxt);
+ if (size < __fxstate_size(16)) {
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+ if (rc != X86EMUL_CONTINUE)
+ goto out;
+ }
- ctxt->ops->get_fpu(ctxt);
+ rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
+ if (rc != X86EMUL_CONTINUE)
+ goto out;
- if (ctxt->mode < X86EMUL_MODE_PROT64)
- rc = fxrstor_fixup(ctxt, &fx_state);
+ if (fx_state.mxcsr >> 16) {
+ rc = emulate_gp(ctxt, 0);
+ goto out;
+ }
if (rc == X86EMUL_CONTINUE)
rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
+out:
ctxt->ops->put_fpu(ctxt);
return rc;
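
fxstate_size() trims FXSAVE/FXRSTOR emulation to the 160-byte legacy header plus 16 bytes per XMM register actually visible: 16 registers in 64-bit mode, 8 with CR4.OSFXSR set, otherwise none. A sketch reproducing the size computation with a layout-compatible struct, assuming the standard 512-byte FXSAVE image:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Layout-compatible sketch of the 512-byte, 64-bit FXSAVE image. */
struct fxregs_state_sketch {
    uint16_t cwd, swd, twd, fop;
    uint64_t rip, rdp;
    uint32_t mxcsr, mxcsr_mask;
    uint32_t st_space[32];      /* 8 x87 registers, 16 bytes each */
    uint32_t xmm_space[64];     /* 16 XMM registers, 16 bytes each */
    uint32_t padding[24];
};

static size_t fxstate_size(int nregs)
{
    return offsetof(struct fxregs_state_sketch, xmm_space) + nregs * 16;
}

int main(void)
{
    printf("64-bit mode:        %zu bytes\n", fxstate_size(16));
    printf("32-bit, OSFXSR set: %zu bytes\n", fxstate_size(8));
    printf("32-bit, no OSFXSR:  %zu bytes\n", fxstate_size(0));
    printf("full image:         %zu bytes\n",
           sizeof(struct fxregs_state_sketch));
    return 0;
}
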
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d24c8742d9b0..2819d4c123eb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1495,6 +1495,7 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
static void cancel_hv_timer(struct kvm_lapic *apic)
{
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
preempt_disable();
kvm_x86_ops->cancel_hv_timer(apic->vcpu);
apic->lapic_timer.hv_timer_in_use = false;
@@ -1503,25 +1504,56 @@ static void cancel_hv_timer(struct kvm_lapic *apic)
static bool start_hv_timer(struct kvm_lapic *apic)
{
- u64 tscdeadline = apic->lapic_timer.tscdeadline;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ int r;
- if ((atomic_read(&apic->lapic_timer.pending) &&
- !apic_lvtt_period(apic)) ||
- kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
- if (apic->lapic_timer.hv_timer_in_use)
- cancel_hv_timer(apic);
- } else {
- apic->lapic_timer.hv_timer_in_use = true;
- hrtimer_cancel(&apic->lapic_timer.timer);
+ if (!kvm_x86_ops->set_hv_timer)
+ return false;
+
+ if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+ return false;
- /* In case the sw timer triggered in the window */
- if (atomic_read(&apic->lapic_timer.pending) &&
- !apic_lvtt_period(apic))
- cancel_hv_timer(apic);
+ r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline);
+ if (r < 0)
+ return false;
+
+ ktimer->hv_timer_in_use = true;
+ hrtimer_cancel(&ktimer->timer);
+
+ /*
+ * Also recheck ktimer->pending, in case the sw timer triggered in
+ * the window. For a periodic timer, leave the hv timer running for
+ * simplicity, and the deadline will be recomputed on the next vmexit.
+ */
+ if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) {
+ if (r)
+ apic_timer_expired(apic);
+ return false;
}
- trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
- apic->lapic_timer.hv_timer_in_use);
- return apic->lapic_timer.hv_timer_in_use;
+
+ trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true);
+ return true;
+}
+
+static void start_sw_timer(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ if (apic->lapic_timer.hv_timer_in_use)
+ cancel_hv_timer(apic);
+ if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+ return;
+
+ if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+ start_sw_period(apic);
+ else if (apic_lvtt_tscdeadline(apic))
+ start_sw_tscdeadline(apic);
+ trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false);
+}
+
+static void restart_apic_timer(struct kvm_lapic *apic)
+{
+ if (!start_hv_timer(apic))
+ start_sw_timer(apic);
}
void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
@@ -1535,19 +1567,14 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
advance_periodic_target_expiration(apic);
- if (!start_hv_timer(apic))
- start_sw_period(apic);
+ restart_apic_timer(apic);
}
}
EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- WARN_ON(apic->lapic_timer.hv_timer_in_use);
-
- start_hv_timer(apic);
+ restart_apic_timer(vcpu->arch.apic);
}
EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
@@ -1556,33 +1583,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
/* Possibly the TSC deadline timer is not enabled yet */
- if (!apic->lapic_timer.hv_timer_in_use)
- return;
-
- cancel_hv_timer(apic);
+ if (apic->lapic_timer.hv_timer_in_use)
+ start_sw_timer(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
- if (atomic_read(&apic->lapic_timer.pending))
- return;
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
- if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
- start_sw_period(apic);
- else if (apic_lvtt_tscdeadline(apic))
- start_sw_tscdeadline(apic);
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ restart_apic_timer(apic);
}
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
static void start_apic_timer(struct kvm_lapic *apic)
{
atomic_set(&apic->lapic_timer.pending, 0);
- if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
- if (set_target_expiration(apic) &&
- !(kvm_x86_ops->set_hv_timer && start_hv_timer(apic)))
- start_sw_period(apic);
- } else if (apic_lvtt_tscdeadline(apic)) {
- if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic)))
- start_sw_tscdeadline(apic);
- }
+ if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+ && !set_target_expiration(apic))
+ return;
+
+ restart_apic_timer(apic);
}
static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -1813,16 +1835,6 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
* LAPIC interface
*----------------------------------------------------------------------
*/
-u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- if (!lapic_in_kernel(vcpu))
- return 0;
-
- return apic->lapic_timer.tscdeadline;
-}
-
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
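
The reworked lapic.c funnels every timer (re)start through restart_apic_timer(): try the hardware (hv) timer first and fall back to the software hrtimer path when start_hv_timer() declines. A minimal sketch of that try-then-fallback shape, with both back ends reduced to stubs:

#include <stdbool.h>
#include <stdio.h>

static bool hv_timer_available;     /* e.g. whether a set_hv_timer hook exists */

static bool start_hv_timer(void)
{
    if (!hv_timer_available)
        return false;               /* hardware timer cannot be used */
    puts("armed hardware (hv) timer");
    return true;
}

static void start_sw_timer(void)
{
    puts("armed software (hrtimer) fallback");
}

static void restart_apic_timer(void)
{
    if (!start_hv_timer())
        start_sw_timer();
}

int main(void)
{
    restart_apic_timer();           /* falls back to the software timer */
    hv_timer_available = true;
    restart_apic_timer();           /* uses the hardware timer */
    return 0;
}
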
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index bcbe811f3b97..29caa2c3dff9 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -87,7 +87,6 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
-u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu);
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
@@ -216,4 +215,5 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cb8225969255..aafd399cf8c6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -183,13 +183,13 @@ static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
+static u64 __read_mostly shadow_mmio_value;
static u64 __read_mostly shadow_present_mask;
/*
- * The mask/value to distinguish a PTE that has been marked not-present for
- * access tracking purposes.
- * The mask would be either 0 if access tracking is disabled, or
- * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled.
+ * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
+ * Non-present SPTEs with shadow_acc_track_value set are in place for access
+ * tracking.
*/
static u64 __read_mostly shadow_acc_track_mask;
static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
@@ -207,16 +207,40 @@ static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIF
static void mmu_spte_set(u64 *sptep, u64 spte);
static void mmu_free_roots(struct kvm_vcpu *vcpu);
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
{
+ BUG_ON((mmio_mask & mmio_value) != mmio_value);
+ shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
+{
+ return sp->role.ad_disabled;
+}
+
+static inline bool spte_ad_enabled(u64 spte)
+{
+ MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+ return !(spte & shadow_acc_track_value);
+}
+
+static inline u64 spte_shadow_accessed_mask(u64 spte)
+{
+ MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+ return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
+}
+
+static inline u64 spte_shadow_dirty_mask(u64 spte)
+{
+ MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+ return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
+}
+
static inline bool is_access_track_spte(u64 spte)
{
- /* Always false if shadow_acc_track_mask is zero. */
- return (spte & shadow_acc_track_mask) == shadow_acc_track_value;
+ return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}
/*
@@ -270,7 +294,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
u64 mask = generation_mmio_spte_mask(gen);
access &= ACC_WRITE_MASK | ACC_USER_MASK;
- mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
+ mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT;
trace_mark_mmio_spte(sptep, gfn, access, gen);
mmu_spte_set(sptep, mask);
@@ -278,7 +302,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
static bool is_mmio_spte(u64 spte)
{
- return (spte & shadow_mmio_mask) == shadow_mmio_mask;
+ return (spte & shadow_mmio_mask) == shadow_mmio_value;
}
static gfn_t get_mmio_spte_gfn(u64 spte)
@@ -315,12 +339,20 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
return likely(kvm_gen == spte_gen);
}
+/*
+ * Sets the shadow PTE masks used by the MMU.
+ *
+ * Assumptions:
+ * - Setting either @accessed_mask or @dirty_mask requires setting both
+ * - At least one of @accessed_mask or @acc_track_mask must be set
+ */
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
u64 acc_track_mask)
{
- if (acc_track_mask != 0)
- acc_track_mask |= SPTE_SPECIAL_MASK;
+ BUG_ON(!dirty_mask != !accessed_mask);
+ BUG_ON(!accessed_mask && !acc_track_mask);
+ BUG_ON(acc_track_mask & shadow_acc_track_value);
shadow_user_mask = user_mask;
shadow_accessed_mask = accessed_mask;
@@ -329,7 +361,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
shadow_x_mask = x_mask;
shadow_present_mask = p_mask;
shadow_acc_track_mask = acc_track_mask;
- WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0);
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
@@ -549,7 +580,7 @@ static bool spte_has_volatile_bits(u64 spte)
is_access_track_spte(spte))
return true;
- if (shadow_accessed_mask) {
+ if (spte_ad_enabled(spte)) {
if ((spte & shadow_accessed_mask) == 0 ||
(is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
return true;
@@ -560,14 +591,17 @@ static bool spte_has_volatile_bits(u64 spte)
static bool is_accessed_spte(u64 spte)
{
- return shadow_accessed_mask ? spte & shadow_accessed_mask
- : !is_access_track_spte(spte);
+ u64 accessed_mask = spte_shadow_accessed_mask(spte);
+
+ return accessed_mask ? spte & accessed_mask
+ : !is_access_track_spte(spte);
}
static bool is_dirty_spte(u64 spte)
{
- return shadow_dirty_mask ? spte & shadow_dirty_mask
- : spte & PT_WRITABLE_MASK;
+ u64 dirty_mask = spte_shadow_dirty_mask(spte);
+
+ return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
}
/* Rules for using mmu_spte_set:
@@ -707,10 +741,10 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
static u64 mark_spte_for_access_track(u64 spte)
{
- if (shadow_accessed_mask != 0)
+ if (spte_ad_enabled(spte))
return spte & ~shadow_accessed_mask;
- if (shadow_acc_track_mask == 0 || is_access_track_spte(spte))
+ if (is_access_track_spte(spte))
return spte;
/*
@@ -729,7 +763,6 @@ static u64 mark_spte_for_access_track(u64 spte)
spte |= (spte & shadow_acc_track_saved_bits_mask) <<
shadow_acc_track_saved_bits_shift;
spte &= ~shadow_acc_track_mask;
- spte |= shadow_acc_track_value;
return spte;
}
@@ -741,6 +774,7 @@ static u64 restore_acc_track_spte(u64 spte)
u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
& shadow_acc_track_saved_bits_mask;
+ WARN_ON_ONCE(spte_ad_enabled(spte));
WARN_ON_ONCE(!is_access_track_spte(spte));
new_spte &= ~shadow_acc_track_mask;
@@ -759,7 +793,7 @@ static bool mmu_spte_age(u64 *sptep)
if (!is_accessed_spte(spte))
return false;
- if (shadow_accessed_mask) {
+ if (spte_ad_enabled(spte)) {
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsigned long *)sptep);
} else {
@@ -1390,6 +1424,22 @@ static bool spte_clear_dirty(u64 *sptep)
return mmu_spte_update(sptep, spte);
}
+static bool wrprot_ad_disabled_spte(u64 *sptep)
+{
+ bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
+ (unsigned long *)sptep);
+ if (was_writable)
+ kvm_set_pfn_dirty(spte_to_pfn(*sptep));
+
+ return was_writable;
+}
+
+/*
+ * Gets the GFN ready for another round of dirty logging by clearing the
+ * - D bit on ad-enabled SPTEs, and
+ * - W bit on ad-disabled SPTEs.
+ * Returns true iff any D or W bits were cleared.
+ */
static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
@@ -1397,7 +1447,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
bool flush = false;
for_each_rmap_spte(rmap_head, &iter, sptep)
- flush |= spte_clear_dirty(sptep);
+ if (spte_ad_enabled(*sptep))
+ flush |= spte_clear_dirty(sptep);
+ else
+ flush |= wrprot_ad_disabled_spte(sptep);
return flush;
}
@@ -1420,7 +1473,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
bool flush = false;
for_each_rmap_spte(rmap_head, &iter, sptep)
- flush |= spte_set_dirty(sptep);
+ if (spte_ad_enabled(*sptep))
+ flush |= spte_set_dirty(sptep);
return flush;
}
@@ -1452,7 +1506,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
}
/**
- * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages
+ * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
+ * protect the page if the D-bit isn't supported.
* @kvm: kvm instance
* @slot: slot to clear D-bit
* @gfn_offset: start of the BITS_PER_LONG pages we care about
@@ -1766,18 +1821,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
u64 *sptep;
struct rmap_iterator iter;
- /*
- * If there's no access bit in the secondary pte set by the hardware and
- * fast access tracking is also not enabled, it's up to gup-fast/gup to
- * set the access bit in the primary pte or in the page structure.
- */
- if (!shadow_accessed_mask && !shadow_acc_track_mask)
- goto out;
-
for_each_rmap_spte(rmap_head, &iter, sptep)
if (is_accessed_spte(*sptep))
return 1;
-out:
return 0;
}
@@ -1798,18 +1844,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
- /*
- * In case of absence of EPT Access and Dirty Bits supports,
- * emulate the accessed bit for EPT, by checking if this page has
- * an EPT mapping, and clearing it if it does. On the next access,
- * a new EPT mapping will be established.
- * This has some overhead, but not as much as the cost of swapping
- * out actively used pages or breaking up actively used hugepages.
- */
- if (!shadow_accessed_mask && !shadow_acc_track_mask)
- return kvm_handle_hva_range(kvm, start, end, 0,
- kvm_unmap_rmapp);
-
return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
}
@@ -2398,7 +2432,12 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+ shadow_user_mask | shadow_x_mask;
+
+ if (sp_ad_disabled(sp))
+ spte |= shadow_acc_track_value;
+ else
+ spte |= shadow_accessed_mask;
mmu_spte_set(sptep, spte);
@@ -2666,10 +2705,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
{
u64 spte = 0;
int ret = 0;
+ struct kvm_mmu_page *sp;
if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
return 0;
+ sp = page_header(__pa(sptep));
+ if (sp_ad_disabled(sp))
+ spte |= shadow_acc_track_value;
+
/*
* For the EPT case, shadow_present_mask is 0 if hardware
* supports exec-only page table entries. In that case,
@@ -2678,7 +2722,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
*/
spte |= shadow_present_mask;
if (!speculative)
- spte |= shadow_accessed_mask;
+ spte |= spte_shadow_accessed_mask(spte);
if (pte_access & ACC_EXEC_MASK)
spte |= shadow_x_mask;
@@ -2735,7 +2779,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (pte_access & ACC_WRITE_MASK) {
kvm_vcpu_mark_page_dirty(vcpu, gfn);
- spte |= shadow_dirty_mask;
+ spte |= spte_shadow_dirty_mask(spte);
}
if (speculative)
@@ -2877,16 +2921,16 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
{
struct kvm_mmu_page *sp;
+ sp = page_header(__pa(sptep));
+
/*
- * Since it's no accessed bit on EPT, it's no way to
- * distinguish between actually accessed translations
- * and prefetched, so disable pte prefetch if EPT is
- * enabled.
+ * Without accessed bits, there's no way to distinguish between
+ * actually accessed translations and prefetched, so disable pte
+ * prefetch if accessed bits aren't available.
*/
- if (!shadow_accessed_mask)
+ if (sp_ad_disabled(sp))
return;
- sp = page_header(__pa(sptep));
if (sp->role.level > PT_PAGE_TABLE_LEVEL)
return;
@@ -4290,6 +4334,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->base_role.word = 0;
context->base_role.smm = is_smm(vcpu);
+ context->base_role.ad_disabled = (shadow_accessed_mask == 0);
context->page_fault = tdp_page_fault;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
@@ -4377,6 +4422,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->root_level = context->shadow_root_level;
context->root_hpa = INVALID_PAGE;
context->direct_map = false;
+ context->base_role.ad_disabled = !accessed_dirty;
update_permission_bitmask(vcpu, context, true);
update_pkru_bitmask(vcpu, context, true);
@@ -4636,6 +4682,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
mask.smep_andnot_wp = 1;
mask.smap_andnot_wp = 1;
mask.smm = 1;
+ mask.ad_disabled = 1;
/*
* If we don't have indirect shadow pages, it means no page is
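
Splitting the MMIO SPTE mask from its value means an MMIO SPTE is now recognized by (spte & mask) == value rather than by all mask bits being set, which is what lets EPT use the write+execute misconfiguration encoding. A sketch of the classification; the bit values below are illustrative stand-ins for the real masks:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t shadow_mmio_mask;
static uint64_t shadow_mmio_value;

static void set_mmio_spte_mask(uint64_t mask, uint64_t value)
{
    assert((mask & value) == value);    /* the value must lie inside the mask */
    shadow_mmio_mask = mask;
    shadow_mmio_value = value;
}

static int is_mmio_spte(uint64_t spte)
{
    return (spte & shadow_mmio_mask) == shadow_mmio_value;
}

int main(void)
{
    /* illustrative bits: mask = RWX (bits 0-2), value = write+execute (110b) */
    set_mmio_spte_mask(0x7, 0x6);

    printf("spte 0x6   -> %d (MMIO misconfiguration)\n", is_mmio_spte(0x6));
    printf("spte 0x7   -> %d (all of RWX set, not MMIO)\n", is_mmio_spte(0x7));
    printf("spte 0x106 -> %d (unrelated bits ignored)\n", is_mmio_spte(0x106));
    return 0;
}
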
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 330bf3a811fb..a276834950c1 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -51,7 +51,7 @@ static inline u64 rsvd_bits(int s, int e)
return ((1ULL << (e - s + 1)) - 1) << s;
}
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value);
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 5a24b846a1cb..8b97a6cba8d1 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -30,8 +30,9 @@
\
role.word = __entry->role; \
\
- trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \
- " %snxe root %u %s%c", __entry->mmu_valid_gen, \
+ trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \
+ " %snxe %sad root %u %s%c", \
+ __entry->mmu_valid_gen, \
__entry->gfn, role.level, \
role.cr4_pae ? " pae" : "", \
role.quadrant, \
@@ -39,6 +40,7 @@
access_str[role.access], \
role.invalid ? " invalid" : "", \
role.nxe ? "" : "!", \
+ role.ad_disabled ? "!" : "", \
__entry->root_count, \
__entry->unsync ? "unsync" : "sync", 0); \
saved_ptr; \
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 33460fcdeef9..905ea6052517 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -190,6 +190,7 @@ struct vcpu_svm {
struct nested_state nested;
bool nmi_singlestep;
+ u64 nmi_singlestep_guest_rflags;
unsigned int3_injected;
unsigned long int3_rip;
@@ -964,6 +965,18 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}
+static void disable_nmi_singlestep(struct vcpu_svm *svm)
+{
+ svm->nmi_singlestep = false;
+ if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
+ /* Clear our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
+ }
+}
+
/* Note:
* This hash table is used to map VM_ID to a struct kvm_arch,
* when handling AMD IOMMU GALOG notification to schedule in
@@ -1713,11 +1726,24 @@ static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
- return to_svm(vcpu)->vmcb->save.rflags;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long rflags = svm->vmcb->save.rflags;
+
+ if (svm->nmi_singlestep) {
+ /* Hide our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ rflags &= ~X86_EFLAGS_RF;
+ }
+ return rflags;
}
static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
+ if (to_svm(vcpu)->nmi_singlestep)
+ rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+
/*
* Any change of EFLAGS.VM is accompanied by a reload of SS
* (caused by either a task switch or an inter-privilege IRET),
@@ -2112,10 +2138,7 @@ static int db_interception(struct vcpu_svm *svm)
}
if (svm->nmi_singlestep) {
- svm->nmi_singlestep = false;
- if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
- svm->vmcb->save.rflags &=
- ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+ disable_nmi_singlestep(svm);
}
if (svm->vcpu.guest_debug &
@@ -2370,8 +2393,8 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
static int nested_svm_check_permissions(struct vcpu_svm *svm)
{
- if (!(svm->vcpu.arch.efer & EFER_SVME)
- || !is_paging(&svm->vcpu)) {
+ if (!(svm->vcpu.arch.efer & EFER_SVME) ||
+ !is_paging(&svm->vcpu)) {
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
}
@@ -2381,7 +2404,7 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
return 1;
}
- return 0;
+ return 0;
}
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
@@ -2534,6 +2557,31 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}
+/* DB exceptions for our internal use must not cause vmexit */
+static int nested_svm_intercept_db(struct vcpu_svm *svm)
+{
+ unsigned long dr6;
+
+ /* if we're not singlestepping, it's not ours */
+ if (!svm->nmi_singlestep)
+ return NESTED_EXIT_DONE;
+
+ /* if it's not a singlestep exception, it's not ours */
+ if (kvm_get_dr(&svm->vcpu, 6, &dr6))
+ return NESTED_EXIT_DONE;
+ if (!(dr6 & DR6_BS))
+ return NESTED_EXIT_DONE;
+
+ /* if the guest is singlestepping, it should get the vmexit */
+ if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) {
+ disable_nmi_singlestep(svm);
+ return NESTED_EXIT_DONE;
+ }
+
+ /* it's ours, the nested hypervisor must not see this one */
+ return NESTED_EXIT_HOST;
+}
+
static int nested_svm_exit_special(struct vcpu_svm *svm)
{
u32 exit_code = svm->vmcb->control.exit_code;
@@ -2589,8 +2637,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
}
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
- if (svm->nested.intercept_exceptions & excp_bits)
- vmexit = NESTED_EXIT_DONE;
+ if (svm->nested.intercept_exceptions & excp_bits) {
+ if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR)
+ vmexit = nested_svm_intercept_db(svm);
+ else
+ vmexit = NESTED_EXIT_DONE;
+ }
/* async page fault always cause vmexit */
else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
svm->apf_reason != 0)
@@ -4627,10 +4679,17 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
== HF_NMI_MASK)
return; /* IRET will cause a vm exit */
+ if ((svm->vcpu.arch.hflags & HF_GIF_MASK) == 0)
+ return; /* STGI will cause a vm exit */
+
+ if (svm->nested.exit_required)
+ return; /* we're not going to run the guest yet */
+
/*
* Something prevents NMI from being injected. Single step over the possible
* problem (IRET or exception injection or interrupt shadow)
*/
+ svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
svm->nmi_singlestep = true;
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
}
@@ -4771,6 +4830,22 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(svm->nested.exit_required))
return;
+ /*
+ * Disable singlestep if we're injecting an interrupt/exception.
+ * We don't want our modified rflags to be pushed on the stack where
+ * we might not be able to easily reset them when NMI singlestep is
+ * disabled later.
+ */
+ if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
+ /*
+ * Event injection happens before external interrupts cause a
+ * vmexit and interrupts are disabled here, so smp_send_reschedule
+ * is enough to force an immediate vmexit.
+ */
+ disable_nmi_singlestep(svm);
+ smp_send_reschedule(vcpu->cpu);
+ }
+
pre_svm_run(svm);
sync_lapic_to_cr8(vcpu);
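
The SVM change snapshots the guest's RFLAGS before forcing TF|RF for NMI single-stepping, then filters those bits out of the guest-visible view while the single-step is active, so the guest never observes flags it did not set. A reduced sketch of that filtering with plain variables standing in for the VMCB fields:

#include <stdbool.h>
#include <stdio.h>

#define FLAG_TF (1ul << 8)      /* x86 EFLAGS.TF */
#define FLAG_RF (1ul << 16)     /* x86 EFLAGS.RF */

static unsigned long vmcb_rflags;               /* what the hardware sees */
static unsigned long singlestep_guest_rflags;   /* snapshot of the guest's flags */
static bool nmi_singlestep;

static void enable_nmi_singlestep(void)
{
    singlestep_guest_rflags = vmcb_rflags;
    nmi_singlestep = true;
    vmcb_rflags |= FLAG_TF | FLAG_RF;
}

static unsigned long get_rflags(void)           /* guest-visible view */
{
    unsigned long rflags = vmcb_rflags;

    if (nmi_singlestep) {
        if (!(singlestep_guest_rflags & FLAG_TF))
            rflags &= ~FLAG_TF;
        if (!(singlestep_guest_rflags & FLAG_RF))
            rflags &= ~FLAG_RF;
    }
    return rflags;
}

int main(void)
{
    vmcb_rflags = 0x2;                          /* guest did not set TF/RF */
    enable_nmi_singlestep();
    printf("hardware rflags %#lx, guest sees %#lx\n",
           vmcb_rflags, get_rflags());
    return 0;
}
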
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6dcc4873e435..f76efad248ab 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -913,8 +913,9 @@ static void nested_release_page_clean(struct page *page)
kvm_release_page_clean(page);
}
+static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
-static u64 construct_eptp(unsigned long root_hpa);
+static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
static bool vmx_xsaves_supported(void);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
@@ -2772,7 +2773,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
if (enable_ept_ad_bits) {
vmx->nested.nested_vmx_secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_PML;
- vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
+ vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
}
} else
vmx->nested.nested_vmx_ept_caps = 0;
@@ -3198,7 +3199,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
case MSR_IA32_BNDCFGS:
- if (!kvm_mpx_supported())
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
@@ -3280,7 +3282,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
case MSR_IA32_BNDCFGS:
- if (!kvm_mpx_supported())
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
+ return 1;
+ if (is_noncanonical_address(data & PAGE_MASK) ||
+ (data & MSR_IA32_BNDCFGS_RSVD))
return 1;
vmcs_write64(GUEST_BNDCFGS, data);
break;
@@ -4013,7 +4019,7 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
if (enable_ept) {
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
- ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
+ ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
} else {
vpid_sync_context(vpid);
}
@@ -4188,14 +4194,15 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmx->emulation_required = emulation_required(vcpu);
}
-static u64 construct_eptp(unsigned long root_hpa)
+static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
{
u64 eptp;
/* TODO write the value reading from MSR */
eptp = VMX_EPT_DEFAULT_MT |
VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
- if (enable_ept_ad_bits)
+ if (enable_ept_ad_bits &&
+ (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
eptp |= VMX_EPT_AD_ENABLE_BIT;
eptp |= (root_hpa & PAGE_MASK);
@@ -4209,7 +4216,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
guest_cr3 = cr3;
if (enable_ept) {
- eptp = construct_eptp(cr3);
+ eptp = construct_eptp(vcpu, cr3);
vmcs_write64(EPT_POINTER, eptp);
if (is_paging(vcpu) || is_guest_mode(vcpu))
guest_cr3 = kvm_read_cr3(vcpu);
@@ -5170,7 +5177,8 @@ static void ept_set_mmio_spte_mask(void)
* EPT Misconfigurations can be generated if the value of bits 2:0
* of an EPT paging-structure entry is 110b (write/execute).
*/
- kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE);
+ kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
+ VMX_EPT_MISCONFIG_WX_VALUE);
}
#define VMX_XSS_EXIT_BITMAP 0
@@ -6220,17 +6228,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- if (is_guest_mode(vcpu)
- && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) {
- /*
- * Fix up exit_qualification according to whether guest
- * page table accesses are reads or writes.
- */
- u64 eptp = nested_ept_get_cr3(vcpu);
- if (!(eptp & VMX_EPT_AD_ENABLE_BIT))
- exit_qualification &= ~EPT_VIOLATION_ACC_WRITE;
- }
-
/*
* EPT violation happened while executing iret from NMI,
* "blocked by NMI" bit has to be set before next VM entry.
@@ -6453,7 +6450,7 @@ void vmx_enable_tdp(void)
enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
0ull, VMX_EPT_EXECUTABLE_MASK,
cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
- enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK);
+ VMX_EPT_RWX_MASK);
ept_set_mmio_spte_mask();
kvm_enable_tdp();
@@ -6557,7 +6554,6 @@ static __init int hardware_setup(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
- vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
vmx_msr_bitmap_legacy, PAGE_SIZE);
@@ -7661,7 +7657,10 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
unsigned long type, types;
gva_t gva;
struct x86_exception e;
- int vpid;
+ struct {
+ u64 vpid;
+ u64 gla;
+ } operand;
if (!(vmx->nested.nested_vmx_secondary_ctls_high &
SECONDARY_EXEC_ENABLE_VPID) ||
@@ -7691,17 +7690,28 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
vmx_instruction_info, false, &gva))
return 1;
- if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid,
- sizeof(u32), &e)) {
+ if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+ sizeof(operand), &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
+ if (operand.vpid >> 16) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
switch (type) {
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
+ if (is_noncanonical_address(operand.gla)) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ /* fall through */
case VMX_VPID_EXTENT_SINGLE_CONTEXT:
case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
- if (!vpid) {
+ if (!operand.vpid) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
return kvm_skip_emulated_instruction(vcpu);
@@ -9394,6 +9404,11 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
vmcs12->guest_physical_address = fault->address;
}
+static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
+{
+ return nested_ept_get_cr3(vcpu) & VMX_EPT_AD_ENABLE_BIT;
+}
+
/* Callbacks for nested_ept_init_mmu_context: */
static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
@@ -9404,18 +9419,18 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
- u64 eptp;
+ bool wants_ad;
WARN_ON(mmu_is_nested(vcpu));
- eptp = nested_ept_get_cr3(vcpu);
- if ((eptp & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits)
+ wants_ad = nested_ept_ad_enabled(vcpu);
+ if (wants_ad && !enable_ept_ad_bits)
return 1;
kvm_mmu_unload(vcpu);
kvm_init_shadow_ept_mmu(vcpu,
to_vmx(vcpu)->nested.nested_vmx_ept_caps &
VMX_EPT_EXECUTE_ONLY_BIT,
- eptp & VMX_EPT_AD_ENABLE_BIT);
+ wants_ad);
vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@ -10728,8 +10743,7 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
}
- if (nested_cpu_has_ept(vmcs12))
- vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+ vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
if (nested_cpu_has_vid(vmcs12))
vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
@@ -10754,8 +10768,6 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
if (kvm_mpx_supported())
vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
- if (nested_cpu_has_xsaves(vmcs12))
- vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
}
/*
@@ -11152,7 +11164,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
vmx->hv_deadline_tsc = tscl + delta_tsc;
vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
PIN_BASED_VMX_PREEMPTION_TIMER);
- return 0;
+
+ return delta_tsc == 0;
}
static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
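
handle_invvpid() now reads the full 128-bit operand (a 64-bit VPID followed by a linear address) and rejects VPIDs wider than 16 bits or, for the individual-address type, a non-canonical address. A sketch of that validation, assuming a 48-bit virtual address width (the kernel derives the actual width from the CPU):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct invvpid_operand {
    uint64_t vpid;
    uint64_t gla;
};

/* Canonical for a 48-bit virtual address space: bits 47..63 must all match. */
static bool is_canonical(uint64_t addr)
{
    uint64_t top = addr >> 47;
    return top == 0 || top == 0x1ffff;
}

static int validate_invvpid(const struct invvpid_operand *op, bool individual_addr)
{
    if (op->vpid >> 16)
        return -1;              /* VPIDs are only 16 bits wide */
    if (individual_addr && !is_canonical(op->gla))
        return -1;
    return 0;
}

int main(void)
{
    struct invvpid_operand ok  = { .vpid = 1,       .gla = 0x00007f0000001000ull };
    struct invvpid_operand bad = { .vpid = 0x10000, .gla = 0x1234000000000000ull };

    printf("ok:  %d\n", validate_invvpid(&ok, true));
    printf("bad: %d\n", validate_invvpid(&bad, true));
    return 0;
}
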
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0e846f0cb83b..6c7266f7766d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2841,10 +2841,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_vcpu_write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
}
- if (kvm_lapic_hv_timer_in_use(vcpu) &&
- kvm_x86_ops->set_hv_timer(vcpu,
- kvm_get_lapic_target_expiration_tsc(vcpu)))
- kvm_lapic_switch_to_sw_timer(vcpu);
+
+ if (kvm_lapic_hv_timer_in_use(vcpu))
+ kvm_lapic_restart_hv_timer(vcpu);
+
/*
* On a host with synchronized TSC, there is no need to update
* kvmclock on vcpu->cpu migration
@@ -6011,7 +6011,7 @@ static void kvm_set_mmio_spte_mask(void)
mask &= ~1ull;
#endif
- kvm_mmu_set_mmio_spte_mask(mask);
+ kvm_mmu_set_mmio_spte_mask(mask, mask);
}
#ifdef CONFIG_X86_64
@@ -6733,7 +6733,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
bool req_immediate_exit = false;
- if (vcpu->requests) {
+ if (kvm_request_pending(vcpu)) {
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
kvm_mmu_unload(vcpu);
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
@@ -6897,7 +6897,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_x86_ops->sync_pir_to_irr(vcpu);
}
- if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
+ if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
|| need_resched() || signal_pending(current)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 3b7c40a2e3e1..75d3776123cc 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -7,6 +7,7 @@
*/
#include <linux/export.h>
#include <linux/uaccess.h>
+#include <linux/highmem.h>
/*
* Zero Userspace
@@ -73,3 +74,136 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
clac();
return len;
}
+
+#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
+/**
+ * clean_cache_range - write back a cache range with CLWB
+ * @addr: virtual start address
+ * @size: number of bytes to write back
+ *
+ * Write back a cache range using the CLWB (cache line write back)
+ * instruction. Note that @size is internally rounded up to be cache
+ * line size aligned.
+ */
+static void clean_cache_range(void *addr, size_t size)
+{
+ u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
+ unsigned long clflush_mask = x86_clflush_size - 1;
+ void *vend = addr + size;
+ void *p;
+
+ for (p = (void *)((unsigned long)addr & ~clflush_mask);
+ p < vend; p += x86_clflush_size)
+ clwb(p);
+}
+
+void arch_wb_cache_pmem(void *addr, size_t size)
+{
+ clean_cache_range(addr, size);
+}
+EXPORT_SYMBOL_GPL(arch_wb_cache_pmem);
+
+long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
+{
+ unsigned long flushed, dest = (unsigned long) dst;
+ long rc = __copy_user_nocache(dst, src, size, 0);
+
+ /*
+ * __copy_user_nocache() uses non-temporal stores for the bulk
+ * of the transfer, but we need to manually flush if the
+ * transfer is unaligned. A cached memory copy is used when
+ * destination or size is not naturally aligned. That is:
+ * - Require 8-byte alignment when size is 8 bytes or larger.
+ * - Require 4-byte alignment when size is 4 bytes.
+ */
+ if (size < 8) {
+ if (!IS_ALIGNED(dest, 4) || size != 4)
+ clean_cache_range(dst, 1);
+ } else {
+ if (!IS_ALIGNED(dest, 8)) {
+ dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
+ clean_cache_range(dst, 1);
+ }
+
+ flushed = dest - (unsigned long) dst;
+ if (size > flushed && !IS_ALIGNED(size - flushed, 8))
+ clean_cache_range(dst + size - 1, 1);
+ }
+
+ return rc;
+}
+
+void memcpy_flushcache(void *_dst, const void *_src, size_t size)
+{
+ unsigned long dest = (unsigned long) _dst;
+ unsigned long source = (unsigned long) _src;
+
+ /* cache copy and flush to align dest */
+ if (!IS_ALIGNED(dest, 8)) {
+ unsigned len = min_t(unsigned, size, ALIGN(dest, 8) - dest);
+
+ memcpy((void *) dest, (void *) source, len);
+ clean_cache_range((void *) dest, len);
+ dest += len;
+ source += len;
+ size -= len;
+ if (!size)
+ return;
+ }
+
+ /* 4x8 movnti loop */
+ while (size >= 32) {
+ asm("movq (%0), %%r8\n"
+ "movq 8(%0), %%r9\n"
+ "movq 16(%0), %%r10\n"
+ "movq 24(%0), %%r11\n"
+ "movnti %%r8, (%1)\n"
+ "movnti %%r9, 8(%1)\n"
+ "movnti %%r10, 16(%1)\n"
+ "movnti %%r11, 24(%1)\n"
+ :: "r" (source), "r" (dest)
+ : "memory", "r8", "r9", "r10", "r11");
+ dest += 32;
+ source += 32;
+ size -= 32;
+ }
+
+ /* 1x8 movnti loop */
+ while (size >= 8) {
+ asm("movq (%0), %%r8\n"
+ "movnti %%r8, (%1)\n"
+ :: "r" (source), "r" (dest)
+ : "memory", "r8");
+ dest += 8;
+ source += 8;
+ size -= 8;
+ }
+
+ /* 1x4 movnti loop */
+ while (size >= 4) {
+ asm("movl (%0), %%r8d\n"
+ "movnti %%r8d, (%1)\n"
+ :: "r" (source), "r" (dest)
+ : "memory", "r8");
+ dest += 4;
+ source += 4;
+ size -= 4;
+ }
+
+ /* cache copy for remaining bytes */
+ if (size) {
+ memcpy((void *) dest, (void *) source, size);
+ clean_cache_range((void *) dest, size);
+ }
+}
+EXPORT_SYMBOL_GPL(memcpy_flushcache);
+
+void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
+ size_t len)
+{
+ char *from = kmap_atomic(page);
+
+ memcpy_flushcache(to, from + offset, len);
+ kunmap_atomic(from);
+}
+#endif
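
clean_cache_range() rounds the start down to a cache-line boundary and writes back one line at a time up to the end of the range, which is why __copy_user_flushcache() only needs to flush the unaligned head and tail of a copy by hand. A sketch of the line-walking arithmetic alone, with the CLWB instruction replaced by a print and a 64-byte line size assumed:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define CLFLUSH_SIZE 64u        /* typical x86 cache-line size (assumed) */

static void clean_cache_range(uintptr_t addr, size_t size)
{
    uintptr_t mask = CLFLUSH_SIZE - 1;
    uintptr_t end = addr + size;
    uintptr_t p;

    /* round down to a line boundary, then write back line by line */
    for (p = addr & ~mask; p < end; p += CLFLUSH_SIZE)
        printf("clwb line at %#lx\n", (unsigned long)p);
}

int main(void)
{
    /* 100 bytes starting 8 bytes into a line touch two cache lines */
    clean_cache_range(0x1008, 100);
    return 0;
}
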
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index adad702b39cd..2824607df108 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -33,7 +33,7 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
if (!vma || !is_vm_hugetlb_page(vma))
return ERR_PTR(-EINVAL);
- pte = huge_pte_offset(mm, address);
+ pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
/* hugetlb should be locked, and hence, prefaulted */
WARN_ON(!pte || pte_none(*pte));
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 99fb83819a5f..8a64a6f2848d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -823,15 +823,12 @@ void __init mem_init(void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
+int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
{
- struct pglist_data *pgdata = NODE_DATA(nid);
- struct zone *zone = pgdata->node_zones +
- zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device);
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- return __add_pages(nid, zone, start_pfn, nr_pages);
+ return __add_pages(nid, start_pfn, nr_pages, want_memblock);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index dae6a5e5ad4a..136422d7d539 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -772,22 +772,15 @@ static void update_end_of_memory_vars(u64 start, u64 size)
}
}
-/*
- * Memory is added always to NORMAL zone. This means you will never get
- * additional DMA/DMA32 memory.
- */
-int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
+int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
{
- struct pglist_data *pgdat = NODE_DATA(nid);
- struct zone *zone = pgdat->node_zones +
- zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
init_memory_mapping(start, start + size);
- ret = __add_pages(nid, zone, start_pfn, nr_pages);
+ ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
WARN_ON_ONCE(ret);
/* update max_pfn, max_low_pfn and high_memory */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index c8520b2c62d2..757b0bcdf712 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -150,6 +150,12 @@ void clflush_cache_range(void *vaddr, unsigned int size)
}
EXPORT_SYMBOL_GPL(clflush_cache_range);
+void arch_invalidate_pmem(void *addr, size_t size)
+{
+ clflush_cache_range(addr, size);
+}
+EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
+
static void __cpa_flush_all(void *arg)
{
unsigned long cache = (unsigned long)arg;
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
index f2a7faf4706e..b33093f84528 100644
--- a/arch/x86/net/bpf_jit.S
+++ b/arch/x86/net/bpf_jit.S
@@ -19,9 +19,6 @@
*/
#define SKBDATA %r10
#define SKF_MAX_NEG_OFF $(-0x200000) /* SKF_LL_OFF from filter.h */
-#define MAX_BPF_STACK (512 /* from filter.h */ + \
- 32 /* space for rbx,r13,r14,r15 */ + \
- 8 /* space for skb_copy_bits */)
#define FUNC(name) \
.globl name; \
@@ -66,7 +63,7 @@ FUNC(sk_load_byte_positive_offset)
/* rsi contains offset and can be scratched */
#define bpf_slow_path_common(LEN) \
- lea -MAX_BPF_STACK + 32(%rbp), %rdx;\
+ lea 32(%rbp), %rdx;\
FRAME_BEGIN; \
mov %rbx, %rdi; /* arg1 == skb */ \
push %r9; \
@@ -83,14 +80,14 @@ FUNC(sk_load_byte_positive_offset)
bpf_slow_path_word:
bpf_slow_path_common(4)
js bpf_error
- mov - MAX_BPF_STACK + 32(%rbp),%eax
+ mov 32(%rbp),%eax
bswap %eax
ret
bpf_slow_path_half:
bpf_slow_path_common(2)
js bpf_error
- mov - MAX_BPF_STACK + 32(%rbp),%ax
+ mov 32(%rbp),%ax
rol $8,%ax
movzwl %ax,%eax
ret
@@ -98,7 +95,7 @@ bpf_slow_path_half:
bpf_slow_path_byte:
bpf_slow_path_common(1)
js bpf_error
- movzbl - MAX_BPF_STACK + 32(%rbp),%eax
+ movzbl 32(%rbp),%eax
ret
#define sk_negative_common(SIZE) \
@@ -148,9 +145,10 @@ FUNC(sk_load_byte_negative_offset)
bpf_error:
# force a return 0 from jit handler
xor %eax,%eax
- mov - MAX_BPF_STACK(%rbp),%rbx
- mov - MAX_BPF_STACK + 8(%rbp),%r13
- mov - MAX_BPF_STACK + 16(%rbp),%r14
- mov - MAX_BPF_STACK + 24(%rbp),%r15
+ mov (%rbp),%rbx
+ mov 8(%rbp),%r13
+ mov 16(%rbp),%r14
+ mov 24(%rbp),%r15
+ add $40, %rbp
leaveq
ret
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index f58939393eef..e1324f280e06 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -197,17 +197,16 @@ struct jit_context {
#define BPF_MAX_INSN_SIZE 128
#define BPF_INSN_SAFETY 64
-#define STACKSIZE \
- (MAX_BPF_STACK + \
- 32 /* space for rbx, r13, r14, r15 */ + \
+#define AUX_STACK_SPACE \
+ (32 /* space for rbx, r13, r14, r15 */ + \
8 /* space for skb_copy_bits() buffer */)
-#define PROLOGUE_SIZE 48
+#define PROLOGUE_SIZE 37
/* emit x64 prologue code for BPF program and check it's size.
* bpf_tail_call helper will skip it while jumping into another program
*/
-static void emit_prologue(u8 **pprog)
+static void emit_prologue(u8 **pprog, u32 stack_depth)
{
u8 *prog = *pprog;
int cnt = 0;
@@ -215,13 +214,17 @@ static void emit_prologue(u8 **pprog)
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */
- /* sub rsp, STACKSIZE */
- EMIT3_off32(0x48, 0x81, 0xEC, STACKSIZE);
+ /* sub rsp, rounded_stack_depth + AUX_STACK_SPACE */
+ EMIT3_off32(0x48, 0x81, 0xEC,
+ round_up(stack_depth, 8) + AUX_STACK_SPACE);
+
+ /* sub rbp, AUX_STACK_SPACE */
+ EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE);
/* all classic BPF filters use R6(rbx) save it */
- /* mov qword ptr [rbp-X],rbx */
- EMIT3_off32(0x48, 0x89, 0x9D, -STACKSIZE);
+ /* mov qword ptr [rbp+0],rbx */
+ EMIT4(0x48, 0x89, 0x5D, 0);
/* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
* as temporary, so all tcpdump filters need to spill/fill R7(r13) and
@@ -231,12 +234,12 @@ static void emit_prologue(u8 **pprog)
* than synthetic ones. Therefore not worth adding complexity.
*/
- /* mov qword ptr [rbp-X],r13 */
- EMIT3_off32(0x4C, 0x89, 0xAD, -STACKSIZE + 8);
- /* mov qword ptr [rbp-X],r14 */
- EMIT3_off32(0x4C, 0x89, 0xB5, -STACKSIZE + 16);
- /* mov qword ptr [rbp-X],r15 */
- EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24);
+ /* mov qword ptr [rbp+8],r13 */
+ EMIT4(0x4C, 0x89, 0x6D, 8);
+ /* mov qword ptr [rbp+16],r14 */
+ EMIT4(0x4C, 0x89, 0x75, 16);
+ /* mov qword ptr [rbp+24],r15 */
+ EMIT4(0x4C, 0x89, 0x7D, 24);
/* Clear the tail call counter (tail_call_cnt): for eBPF tail calls
* we need to reset the counter to 0. It's done in two instructions,
@@ -246,8 +249,8 @@ static void emit_prologue(u8 **pprog)
/* xor eax, eax */
EMIT2(0x31, 0xc0);
- /* mov qword ptr [rbp-X], rax */
- EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32);
+ /* mov qword ptr [rbp+32], rax */
+ EMIT4(0x48, 0x89, 0x45, 32);
BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
*pprog = prog;
@@ -289,13 +292,13 @@ static void emit_bpf_tail_call(u8 **pprog)
/* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
* goto out;
*/
- EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */
+ EMIT2_off32(0x8B, 0x85, 36); /* mov eax, dword ptr [rbp + 36] */
EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
#define OFFSET2 36
EMIT2(X86_JA, OFFSET2); /* ja out */
label2 = cnt;
EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
- EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */
+ EMIT2_off32(0x89, 0x85, 36); /* mov dword ptr [rbp + 36], eax */
/* prog = array->ptrs[index]; */
EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */
@@ -361,7 +364,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
int proglen = 0;
u8 *prog = temp;
- emit_prologue(&prog);
+ emit_prologue(&prog, bpf_prog->aux->stack_depth);
if (seen_ld_abs)
emit_load_skb_data_hlen(&prog);
@@ -877,7 +880,7 @@ xadd: if (is_imm8(insn->off))
}
break;
- case BPF_JMP | BPF_CALL | BPF_X:
+ case BPF_JMP | BPF_TAIL_CALL:
emit_bpf_tail_call(&prog);
break;
@@ -1036,15 +1039,17 @@ common_load:
seen_exit = true;
/* update cleanup_addr */
ctx->cleanup_addr = proglen;
- /* mov rbx, qword ptr [rbp-X] */
- EMIT3_off32(0x48, 0x8B, 0x9D, -STACKSIZE);
- /* mov r13, qword ptr [rbp-X] */
- EMIT3_off32(0x4C, 0x8B, 0xAD, -STACKSIZE + 8);
- /* mov r14, qword ptr [rbp-X] */
- EMIT3_off32(0x4C, 0x8B, 0xB5, -STACKSIZE + 16);
- /* mov r15, qword ptr [rbp-X] */
- EMIT3_off32(0x4C, 0x8B, 0xBD, -STACKSIZE + 24);
-
+ /* mov rbx, qword ptr [rbp+0] */
+ EMIT4(0x48, 0x8B, 0x5D, 0);
+ /* mov r13, qword ptr [rbp+8] */
+ EMIT4(0x4C, 0x8B, 0x6D, 8);
+ /* mov r14, qword ptr [rbp+16] */
+ EMIT4(0x4C, 0x8B, 0x75, 16);
+ /* mov r15, qword ptr [rbp+24] */
+ EMIT4(0x4C, 0x8B, 0x7D, 24);
+
+ /* add rbp, AUX_STACK_SPACE */
+ EMIT4(0x48, 0x83, 0xC5, AUX_STACK_SPACE);
EMIT1(0xC9); /* leave */
EMIT1(0xC3); /* ret */
break;
@@ -1162,6 +1167,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
bpf_jit_binary_lock_ro(header);
prog->bpf_func = (void *)image;
prog->jited = 1;
+ prog->jited_len = proglen;
} else {
prog = orig_prog;
}
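
A minimal arithmetic sketch of the new frame sizing, assuming only the AUX_STACK_SPACE definition from the hunk above; the helper name is made up for illustration and returns the value emitted as the "sub rsp, imm32" operand:

#include <stdint.h>

#define AUX_STACK_SPACE	(32 /* rbx, r13, r14, r15 */ + \
			 8  /* skb_copy_bits() buffer */)

static inline uint32_t jit_frame_size(uint32_t stack_depth)
{
	uint32_t rounded = (stack_depth + 7u) & ~7u;	/* round_up(stack_depth, 8) */

	return rounded + AUX_STACK_SPACE;	/* replaces the old fixed STACKSIZE */
}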
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index cfd1a89fd04e..dbe2132b0ed4 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -24,7 +24,6 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
unsigned int pci_early_dump_regs;
static int pci_bf_sort;
-static int smbios_type_b1_flag;
int pci_routeirq;
int noioapicquirk;
#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
@@ -197,34 +196,18 @@ static int __init set_bf_sort(const struct dmi_system_id *d)
static void __init read_dmi_type_b1(const struct dmi_header *dm,
void *private_data)
{
- u8 *d = (u8 *)dm + 4;
+ u8 *data = (u8 *)dm + 4;
if (dm->type != 0xB1)
return;
- switch (((*(u32 *)d) >> 9) & 0x03) {
- case 0x00:
- printk(KERN_INFO "dmi type 0xB1 record - unknown flag\n");
- break;
- case 0x01: /* set pci=bfsort */
- smbios_type_b1_flag = 1;
- break;
- case 0x02: /* do not set pci=bfsort */
- smbios_type_b1_flag = 2;
- break;
- default:
- break;
- }
+ if ((((*(u32 *)data) >> 9) & 0x03) == 0x01)
+ set_bf_sort((const struct dmi_system_id *)private_data);
}
static int __init find_sort_method(const struct dmi_system_id *d)
{
- dmi_walk(read_dmi_type_b1, NULL);
-
- if (smbios_type_b1_flag == 1) {
- set_bf_sort(d);
- return 0;
- }
- return -1;
+ dmi_walk(read_dmi_type_b1, (void *)d);
+ return 0;
}
/*
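
For context, a hedged sketch of the dmi_walk() pattern the simplified code relies on: the opaque private_data pointer is forwarded to the decode callback, so find_sort_method() no longer needs a file-scope flag. The context structure and names below are hypothetical:

#include <linux/dmi.h>

struct b1_scan_ctx {			/* hypothetical caller state */
	int bfsort_requested;
};

static void scan_type_b1(const struct dmi_header *dm, void *private_data)
{
	struct b1_scan_ctx *ctx = private_data;
	const u8 *data = (const u8 *)dm + 4;

	if (dm->type == 0xB1 && (((*(const u32 *)data) >> 9) & 0x03) == 0x01)
		ctx->bfsort_requested = 1;
}

/* Usage: struct b1_scan_ctx ctx = { 0 }; dmi_walk(scan_type_b1, &ctx); */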
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 6d52b94f4bb9..11e407489db0 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -571,3 +571,50 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, pci_invalid_bar);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_invalid_bar);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_invalid_bar);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_invalid_bar);
+
+/*
+ * Device [1022:7808]
+ * 23. USB Wake on Connect/Disconnect with Low Speed Devices
+ * https://support.amd.com/TechDocs/46837.pdf
+ * Appendix A2
+ * https://support.amd.com/TechDocs/42413.pdf
+ */
+static void pci_fixup_amd_ehci_pme(struct pci_dev *dev)
+{
+ dev_info(&dev->dev, "PME# does not work under D3, disabling it\n");
+ dev->pme_support &= ~((PCI_PM_CAP_PME_D3 | PCI_PM_CAP_PME_D3cold)
+ >> PCI_PM_CAP_PME_SHIFT);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x7808, pci_fixup_amd_ehci_pme);
+
+/*
+ * Apple MacBook Pro: Avoid [mem 0x7fa00000-0x7fbfffff]
+ *
+ * Using the [mem 0x7fa00000-0x7fbfffff] region, e.g., by assigning it to
+ * the 00:1c.0 Root Port, causes a conflict with [io 0x1804], which is used
+ * for soft poweroff and suspend-to-RAM.
+ *
+ * As far as we know, this is related to the address space, not to the Root
+ * Port itself. Attaching the quirk to the Root Port is a convenience, but
+ * it could probably also be a standalone DMI quirk.
+ *
+ * https://bugzilla.kernel.org/show_bug.cgi?id=103211
+ */
+static void quirk_apple_mbp_poweroff(struct pci_dev *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct resource *res;
+
+ if ((!dmi_match(DMI_PRODUCT_NAME, "MacBookPro11,4") &&
+ !dmi_match(DMI_PRODUCT_NAME, "MacBookPro11,5")) ||
+ pdev->bus->number != 0 || pdev->devfn != PCI_DEVFN(0x1c, 0))
+ return;
+
+ res = request_mem_region(0x7fa00000, 0x200000,
+ "MacBook Pro poweroff workaround");
+ if (res)
+ dev_info(dev, "claimed %s %pR\n", res->name, res);
+ else
+ dev_info(dev, "can't work around MacBook Pro poweroff issue\n");
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, quirk_apple_mbp_poweroff);
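
An aside on pci_fixup_amd_ehci_pme() above: dev->pme_support holds the capability's PME_Support field already shifted down by PCI_PM_CAP_PME_SHIFT, so disabling PME# for D3 is a small mask operation. A standalone rendering of that arithmetic, assuming the usual pci_regs.h encodings (constants restated so the sketch is self-contained):

#include <stdint.h>

#define PCI_PM_CAP_PME_D3	0x4000	/* PME# from D3hot */
#define PCI_PM_CAP_PME_D3cold	0x8000	/* PME# from D3cold */
#define PCI_PM_CAP_PME_SHIFT	11

static inline uint8_t drop_d3_pme(uint8_t pme_support)
{
	/* clears bits 3 and 4 of the shifted-down PME_Support field */
	return pme_support & ~((PCI_PM_CAP_PME_D3 | PCI_PM_CAP_PME_D3cold)
			       >> PCI_PM_CAP_PME_SHIFT);
}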
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index c1bdb9edcae7..76595408ff53 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -46,7 +46,7 @@ static inline void set_bios_x(void)
pcibios_enabled = 1;
set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
if (__supported_pte_mask & _PAGE_NX)
- printk(KERN_INFO "PCI : PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n");
+ printk(KERN_INFO "PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n");
}
/*
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index ec008e800b45..53d600217973 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -26,6 +26,7 @@
#include <linux/pci_ids.h>
#include <linux/export.h>
#include <linux/list.h>
+#include <asm/iommu.h>
#define STA2X11_SWIOTLB_SIZE (4*1024*1024)
extern int swiotlb_late_init_with_default_size(size_t default_size);
@@ -191,7 +192,7 @@ static const struct dma_map_ops sta2x11_dma_ops = {
.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
.sync_sg_for_device = swiotlb_sync_sg_for_device,
.mapping_error = swiotlb_dma_mapping_error,
- .dma_supported = NULL, /* FIXME: we should use this instead! */
+ .dma_supported = x86_dma_supported,
};
/* At setup time, we use our own ops if the device is a ConneXt one */
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index e3e62c8a8e70..f2598d81cd55 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -147,7 +147,7 @@ static int relocate_restore_code(void)
if (!relocated_restore_code)
return -ENOMEM;
- memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
+ memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE);
/* Make the page containing the relocated code executable */
pgd = (pgd_t *)__va(read_cr3_pa()) +
@@ -293,8 +293,8 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size)
if (max_size < sizeof(struct restore_data_record))
return -EOVERFLOW;
- rdr->jump_address = (unsigned long)&restore_registers;
- rdr->jump_address_phys = __pa_symbol(&restore_registers);
+ rdr->jump_address = (unsigned long)restore_registers;
+ rdr->jump_address_phys = __pa_symbol(restore_registers);
rdr->cr3 = restore_cr3;
rdr->magic = RESTORE_MAGIC;
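
The '&' removals above are a cleanup rather than a behaviour change: for a function (or array) designator, "name" and "&name" evaluate to the same address, only the type differs. A tiny user-space sketch of that equivalence:

#include <assert.h>
#include <stdio.h>

static void restore_stub(void) { }	/* stand-in for restore_registers */

int main(void)
{
	assert((void *)restore_stub == (void *)&restore_stub);
	printf("designator and &designator compare equal\n");
	return 0;
}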
diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile
index d72dec406ccb..329406224330 100644
--- a/arch/x86/um/vdso/Makefile
+++ b/arch/x86/um/vdso/Makefile
@@ -53,7 +53,7 @@ CFLAGS_REMOVE_vdso-note.o = -pg -fprofile-arcs -ftest-coverage
CFLAGS_REMOVE_um_vdso.o = -pg -fprofile-arcs -ftest-coverage
targets += vdso-syms.lds
-obj-$(VDSO64-y) += vdso-syms.lds
+extra-$(VDSO64-y) += vdso-syms.lds
#
# Match symbols in the DSO that look like VDSO*; produce a file of constants.
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index a5ffcbb20cc0..0e7ef69e8531 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -106,15 +106,83 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int),
return rc >= 0 ? 0 : rc;
}
-static void clamp_max_cpus(void)
+static int xen_vcpu_setup_restore(int cpu)
{
-#ifdef CONFIG_SMP
- if (setup_max_cpus > MAX_VIRT_CPUS)
- setup_max_cpus = MAX_VIRT_CPUS;
-#endif
+ int rc = 0;
+
+ /* Any per_cpu(xen_vcpu) is stale, so reset it */
+ xen_vcpu_info_reset(cpu);
+
+ /*
+ * For PVH and PVHVM, setup online VCPUs only. The rest will
+ * be handled by hotplug.
+ */
+ if (xen_pv_domain() ||
+ (xen_hvm_domain() && cpu_online(cpu))) {
+ rc = xen_vcpu_setup(cpu);
+ }
+
+ return rc;
+}
+
+/*
+ * On restore, set the vcpu placement up again.
+ * If it fails, then we're in a bad state, since
+ * we can't back out from using it...
+ */
+void xen_vcpu_restore(void)
+{
+ int cpu, rc;
+
+ for_each_possible_cpu(cpu) {
+ bool other_cpu = (cpu != smp_processor_id());
+ bool is_up;
+
+ if (xen_vcpu_nr(cpu) == XEN_VCPU_ID_INVALID)
+ continue;
+
+ /* Only Xen 4.5 and higher support this. */
+ is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up,
+ xen_vcpu_nr(cpu), NULL) > 0;
+
+ if (other_cpu && is_up &&
+ HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL))
+ BUG();
+
+ if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock))
+ xen_setup_runstate_info(cpu);
+
+ rc = xen_vcpu_setup_restore(cpu);
+ if (rc)
+ pr_emerg_once("vcpu restore failed for cpu=%d err=%d. "
+ "System will hang.\n", cpu, rc);
+ /*
+ * In case xen_vcpu_setup_restore() fails, do not bring up the
+ * VCPU. This helps us avoid the resulting OOPS when the VCPU
+ * accesses pvclock_vcpu_time via xen_vcpu (which is NULL.)
+ * Note that this does not improve the situation much -- now the
+ * VM hangs instead of OOPSing -- with the VCPUs that did not
+ * fail, spinning in stop_machine(), waiting for the failed
+ * VCPUs to come up.
+ */
+ if (other_cpu && is_up && (rc == 0) &&
+ HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL))
+ BUG();
+ }
}
-void xen_vcpu_setup(int cpu)
+void xen_vcpu_info_reset(int cpu)
+{
+ if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) {
+ per_cpu(xen_vcpu, cpu) =
+ &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)];
+ } else {
+ /* Set to NULL so that if somebody accesses it we get an OOPS */
+ per_cpu(xen_vcpu, cpu) = NULL;
+ }
+}
+
+int xen_vcpu_setup(int cpu)
{
struct vcpu_register_vcpu_info info;
int err;
@@ -123,11 +191,11 @@ void xen_vcpu_setup(int cpu)
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
/*
- * This path is called twice on PVHVM - first during bootup via
- * smp_init -> xen_hvm_cpu_notify, and then if the VCPU is being
- * hotplugged: cpu_up -> xen_hvm_cpu_notify.
- * As we can only do the VCPUOP_register_vcpu_info once lets
- * not over-write its result.
+ * This path is called on PVHVM at bootup (xen_hvm_smp_prepare_boot_cpu)
+ * and at restore (xen_vcpu_restore). Also called for hotplugged
+ * VCPUs (cpu_init -> xen_hvm_cpu_prepare_hvm).
+ * However, the hypercall can only be done once (see below) so if a VCPU
+ * is offlined and comes back online then let's not redo the hypercall.
*
* For PV it is called during restore (xen_vcpu_restore) and bootup
* (xen_setup_vcpu_info_placement). The hotplug mechanism does not
@@ -135,42 +203,44 @@ void xen_vcpu_setup(int cpu)
*/
if (xen_hvm_domain()) {
if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu))
- return;
+ return 0;
}
- if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS)
- per_cpu(xen_vcpu, cpu) =
- &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)];
- if (!xen_have_vcpu_info_placement) {
- if (cpu >= MAX_VIRT_CPUS)
- clamp_max_cpus();
- return;
+ if (xen_have_vcpu_info_placement) {
+ vcpup = &per_cpu(xen_vcpu_info, cpu);
+ info.mfn = arbitrary_virt_to_mfn(vcpup);
+ info.offset = offset_in_page(vcpup);
+
+ /*
+ * Check to see if the hypervisor will put the vcpu_info
+ * structure where we want it, which allows direct access via
+ * a percpu-variable.
+ * N.B. This hypercall can _only_ be called once per CPU.
+ * Subsequent calls will error out with -EINVAL. This is due to
+ * the fact that hypervisor has no unregister variant and this
+ * hypercall does not allow to over-write info.mfn and
+ * info.offset.
+ */
+ err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info,
+ xen_vcpu_nr(cpu), &info);
+
+ if (err) {
+ pr_warn_once("register_vcpu_info failed: cpu=%d err=%d\n",
+ cpu, err);
+ xen_have_vcpu_info_placement = 0;
+ } else {
+ /*
+ * This cpu is using the registered vcpu info, even if
+ * later ones fail to.
+ */
+ per_cpu(xen_vcpu, cpu) = vcpup;
+ }
}
- vcpup = &per_cpu(xen_vcpu_info, cpu);
- info.mfn = arbitrary_virt_to_mfn(vcpup);
- info.offset = offset_in_page(vcpup);
-
- /* Check to see if the hypervisor will put the vcpu_info
- structure where we want it, which allows direct access via
- a percpu-variable.
- N.B. This hypercall can _only_ be called once per CPU. Subsequent
- calls will error out with -EINVAL. This is due to the fact that
- hypervisor has no unregister variant and this hypercall does not
- allow to over-write info.mfn and info.offset.
- */
- err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, xen_vcpu_nr(cpu),
- &info);
+ if (!xen_have_vcpu_info_placement)
+ xen_vcpu_info_reset(cpu);
- if (err) {
- printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
- xen_have_vcpu_info_placement = 0;
- clamp_max_cpus();
- } else {
- /* This cpu is using the registered vcpu info, even if
- later ones fail to. */
- per_cpu(xen_vcpu, cpu) = vcpup;
- }
+ return ((per_cpu(xen_vcpu, cpu) == NULL) ? -ENODEV : 0);
}
void xen_reboot(int reason)
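
A hedged usage sketch of the now fallible xen_vcpu_setup(): callers (see the HVM and PV hunks below) check the return value and refuse to bring up a vCPU whose per-CPU xen_vcpu pointer could not be established. The wrapper name is hypothetical:

/* Hypothetical caller; mirrors what xen_cpu_up_prepare_hvm() does below. */
static int example_prepare_vcpu(unsigned int cpu)
{
	int rc = xen_vcpu_setup(cpu);

	if (rc)		/* -ENODEV: per_cpu(xen_vcpu, cpu) stayed NULL */
		return rc;

	/* timer and IPI setup would follow here */
	return 0;
}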
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index a6d014f47e52..87d791356ea9 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -1,5 +1,6 @@
#include <linux/cpu.h>
#include <linux/kexec.h>
+#include <linux/memblock.h>
#include <xen/features.h>
#include <xen/events.h>
@@ -10,9 +11,11 @@
#include <asm/reboot.h>
#include <asm/setup.h>
#include <asm/hypervisor.h>
+#include <asm/e820/api.h>
#include <asm/xen/cpuid.h>
#include <asm/xen/hypervisor.h>
+#include <asm/xen/page.h>
#include "xen-ops.h"
#include "mmu.h"
@@ -20,37 +23,34 @@
void __ref xen_hvm_init_shared_info(void)
{
- int cpu;
struct xen_add_to_physmap xatp;
- static struct shared_info *shared_info_page;
+ u64 pa;
+
+ if (HYPERVISOR_shared_info == &xen_dummy_shared_info) {
+ /*
+ * Search for a free page starting at 4kB physical address.
+ * Low memory is preferred to avoid an EPT large page split up
+ * by the mapping.
+ * Starting below X86_RESERVE_LOW (usually 64kB) is fine as
+ * the BIOS used for HVM guests is well behaved and won't
+ * clobber memory other than the first 4kB.
+ */
+ for (pa = PAGE_SIZE;
+ !e820__mapped_all(pa, pa + PAGE_SIZE, E820_TYPE_RAM) ||
+ memblock_is_reserved(pa);
+ pa += PAGE_SIZE)
+ ;
+
+ memblock_reserve(pa, PAGE_SIZE);
+ HYPERVISOR_shared_info = __va(pa);
+ }
- if (!shared_info_page)
- shared_info_page = (struct shared_info *)
- extend_brk(PAGE_SIZE, PAGE_SIZE);
xatp.domid = DOMID_SELF;
xatp.idx = 0;
xatp.space = XENMAPSPACE_shared_info;
- xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+ xatp.gpfn = virt_to_pfn(HYPERVISOR_shared_info);
if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
BUG();
-
- HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
-
- /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
- * page, we use it in the event channel upcall and in some pvclock
- * related functions. We don't need the vcpu_info placement
- * optimizations because we don't use any pv_mmu or pv_irq op on
- * HVM.
- * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
- * online but xen_hvm_init_shared_info is run at resume time too and
- * in that case multiple vcpus might be online. */
- for_each_online_cpu(cpu) {
- /* Leave it to be NULL. */
- if (xen_vcpu_nr(cpu) >= MAX_VIRT_CPUS)
- continue;
- per_cpu(xen_vcpu, cpu) =
- &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)];
- }
}
static void __init init_hvm_pv_info(void)
@@ -106,7 +106,7 @@ static void xen_hvm_crash_shutdown(struct pt_regs *regs)
static int xen_cpu_up_prepare_hvm(unsigned int cpu)
{
- int rc;
+ int rc = 0;
/*
* This can happen if CPU was offlined earlier and
@@ -121,7 +121,9 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu)
per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu);
else
per_cpu(xen_vcpu_id, cpu) = cpu;
- xen_vcpu_setup(cpu);
+ rc = xen_vcpu_setup(cpu);
+ if (rc)
+ return rc;
if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock))
xen_setup_timer(cpu);
@@ -130,9 +132,8 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu)
if (rc) {
WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
cpu, rc);
- return rc;
}
- return 0;
+ return rc;
}
static int xen_cpu_dead_hvm(unsigned int cpu)
@@ -154,6 +155,13 @@ static void __init xen_hvm_guest_init(void)
xen_hvm_init_shared_info();
+ /*
+ * xen_vcpu is a pointer to the vcpu_info struct in the shared_info
+ * page, we use it in the event channel upcall and in some pvclock
+ * related functions.
+ */
+ xen_vcpu_info_reset(0);
+
xen_panic_handler_init();
if (xen_feature(XENFEAT_hvm_callback_vector))
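
A small aside on the gpfn computation above: the shared_info page is now taken from the direct map, so its guest frame number can be derived with virt_to_pfn(). A hedged sketch of what that helper amounts to for direct-mapped addresses (not the kernel's literal definition):

/* Equivalent to the old "__pa(shared_info_page) >> PAGE_SHIFT" assignment. */
#define example_virt_to_pfn(kaddr)	(__pa(kaddr) >> PAGE_SHIFT)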
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index f33eef4ebd12..811e4ddb3f37 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -89,8 +89,6 @@
void *xen_initial_gdt;
-RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
-
static int xen_cpu_up_prepare_pv(unsigned int cpu);
static int xen_cpu_dead_pv(unsigned int cpu);
@@ -107,35 +105,6 @@ struct tls_descs {
*/
static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
-/*
- * On restore, set the vcpu placement up again.
- * If it fails, then we're in a bad state, since
- * we can't back out from using it...
- */
-void xen_vcpu_restore(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- bool other_cpu = (cpu != smp_processor_id());
- bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu),
- NULL);
-
- if (other_cpu && is_up &&
- HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL))
- BUG();
-
- xen_setup_runstate_info(cpu);
-
- if (xen_have_vcpu_info_placement)
- xen_vcpu_setup(cpu);
-
- if (other_cpu && is_up &&
- HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL))
- BUG();
- }
-}
-
static void __init xen_banner(void)
{
unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
@@ -960,30 +929,43 @@ void xen_setup_shared_info(void)
HYPERVISOR_shared_info =
(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
-#ifndef CONFIG_SMP
- /* In UP this is as good a place as any to set up shared info */
- xen_setup_vcpu_info_placement();
-#endif
-
xen_setup_mfn_list_list();
- /*
- * Now that shared info is set up we can start using routines that
- * point to pvclock area.
- */
- if (system_state == SYSTEM_BOOTING)
+ if (system_state == SYSTEM_BOOTING) {
+#ifndef CONFIG_SMP
+ /*
+ * In UP this is as good a place as any to set up shared info.
+ * Limit this to boot only, at restore vcpu setup is done via
+ * xen_vcpu_restore().
+ */
+ xen_setup_vcpu_info_placement();
+#endif
+ /*
+ * Now that shared info is set up we can start using routines
+ * that point to pvclock area.
+ */
xen_init_time_ops();
+ }
}
/* This is called once we have the cpu_possible_mask */
-void xen_setup_vcpu_info_placement(void)
+void __ref xen_setup_vcpu_info_placement(void)
{
int cpu;
for_each_possible_cpu(cpu) {
/* Set up direct vCPU id mapping for PV guests. */
per_cpu(xen_vcpu_id, cpu) = cpu;
- xen_vcpu_setup(cpu);
+
+ /*
+ * xen_vcpu_setup(cpu) can fail -- in which case it
+ * falls back to the shared_info version for cpus
+ * where xen_vcpu_nr(cpu) < MAX_VIRT_CPUS.
+ *
+ * xen_cpu_up_prepare_pv() handles the rest by failing
+ * them in hotplug.
+ */
+ (void) xen_vcpu_setup(cpu);
}
/*
@@ -1332,9 +1314,17 @@ asmlinkage __visible void __init xen_start_kernel(void)
*/
acpi_numa = -1;
#endif
- /* Don't do the full vcpu_info placement stuff until we have a
- possible map and a non-dummy shared_info. */
- per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
+ /* Let's presume PV guests always boot on vCPU with id 0. */
+ per_cpu(xen_vcpu_id, 0) = 0;
+
+ /*
+ * Setup xen_vcpu early because start_kernel needs it for
+ * local_irq_disable(), irqs_disabled().
+ *
+ * Don't do the full vcpu_info placement stuff until we have
+ * the cpu_possible_mask and a non-dummy shared_info.
+ */
+ xen_vcpu_info_reset(0);
WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));
@@ -1431,9 +1421,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
#endif
xen_raw_console_write("about to get started...\n");
- /* Let's presume PV guests always boot on vCPU with id 0. */
- per_cpu(xen_vcpu_id, 0) = 0;
-
+ /* We need this for printk timestamps */
xen_setup_runstate_info(0);
xen_efi_init();
@@ -1451,6 +1439,9 @@ static int xen_cpu_up_prepare_pv(unsigned int cpu)
{
int rc;
+ if (per_cpu(xen_vcpu, cpu) == NULL)
+ return -ENODEV;
+
xen_setup_timer(cpu);
rc = xen_smp_intr_init(cpu);
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index 42b08f8fc2ca..37c6056a7bba 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -18,20 +18,6 @@
int xen_swiotlb __read_mostly;
-static const struct dma_map_ops xen_swiotlb_dma_ops = {
- .alloc = xen_swiotlb_alloc_coherent,
- .free = xen_swiotlb_free_coherent,
- .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
- .sync_single_for_device = xen_swiotlb_sync_single_for_device,
- .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
- .sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
- .map_sg = xen_swiotlb_map_sg_attrs,
- .unmap_sg = xen_swiotlb_unmap_sg_attrs,
- .map_page = xen_swiotlb_map_page,
- .unmap_page = xen_swiotlb_unmap_page,
- .dma_supported = xen_swiotlb_dma_supported,
-};
-
/*
* pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary
*
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index a5bf7c451435..c81046323ebc 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -499,7 +499,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages,
void __init xen_remap_memory(void)
{
unsigned long buf = (unsigned long)&xen_remap_buf;
- unsigned long mfn_save, mfn, pfn;
+ unsigned long mfn_save, pfn;
unsigned long remapped = 0;
unsigned int i;
unsigned long pfn_s = ~0UL;
@@ -515,8 +515,7 @@ void __init xen_remap_memory(void)
pfn = xen_remap_buf.target_pfn;
for (i = 0; i < xen_remap_buf.size; i++) {
- mfn = xen_remap_buf.mfns[i];
- xen_update_mem_tables(pfn, mfn);
+ xen_update_mem_tables(pfn, xen_remap_buf.mfns[i]);
remapped++;
pfn++;
}
@@ -530,8 +529,6 @@ void __init xen_remap_memory(void)
pfn_s = xen_remap_buf.target_pfn;
len = xen_remap_buf.size;
}
-
- mfn = xen_remap_mfn;
xen_remap_mfn = xen_remap_buf.next_area_mfn;
}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 82ac611f2fc1..e7f02eb73727 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -1,4 +1,5 @@
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
@@ -114,6 +115,36 @@ int xen_smp_intr_init(unsigned int cpu)
return rc;
}
+void __init xen_smp_cpus_done(unsigned int max_cpus)
+{
+ int cpu, rc, count = 0;
+
+ if (xen_hvm_domain())
+ native_smp_cpus_done(max_cpus);
+
+ if (xen_have_vcpu_info_placement)
+ return;
+
+ for_each_online_cpu(cpu) {
+ if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS)
+ continue;
+
+ rc = cpu_down(cpu);
+
+ if (rc == 0) {
+ /*
+ * Reset vcpu_info so this cpu cannot be onlined again.
+ */
+ xen_vcpu_info_reset(cpu);
+ count++;
+ } else {
+ pr_warn("%s: failed to bring CPU %d down, error %d\n",
+ __func__, cpu, rc);
+ }
+ }
+ WARN(count, "%s: brought %d CPUs offline\n", __func__, count);
+}
+
void xen_smp_send_reschedule(int cpu)
{
xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
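
The new xen_smp_cpus_done() above exists because, without vcpu_info placement, a vCPU can only keep running if its Xen vCPU number fits into the fixed vcpu_info array inside shared_info. A hedged sketch of the predicate it effectively enforces (helper name is illustrative):

#include <linux/types.h>

static bool fits_in_shared_info(int cpu)
{
	/* CPUs failing this are taken down and get xen_vcpu_info_reset() */
	return xen_vcpu_nr(cpu) < MAX_VIRT_CPUS;
}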
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
index 8ebb6acca64a..87d3c76cba37 100644
--- a/arch/x86/xen/smp.h
+++ b/arch/x86/xen/smp.h
@@ -14,6 +14,8 @@ extern void xen_smp_intr_free(unsigned int cpu);
int xen_smp_intr_init_pv(unsigned int cpu);
void xen_smp_intr_free_pv(unsigned int cpu);
+void xen_smp_cpus_done(unsigned int max_cpus);
+
void xen_smp_send_reschedule(int cpu);
void xen_smp_send_call_function_ipi(const struct cpumask *mask);
void xen_smp_send_call_function_single_ipi(int cpu);
diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c
index f18561bbf5c9..fd60abedf658 100644
--- a/arch/x86/xen/smp_hvm.c
+++ b/arch/x86/xen/smp_hvm.c
@@ -12,7 +12,8 @@ static void __init xen_hvm_smp_prepare_boot_cpu(void)
native_smp_prepare_boot_cpu();
/*
- * Setup vcpu_info for boot CPU.
+ * Setup vcpu_info for boot CPU. Secondary CPUs get their vcpu_info
+ * in xen_cpu_up_prepare_hvm().
*/
xen_vcpu_setup(0);
@@ -27,10 +28,20 @@ static void __init xen_hvm_smp_prepare_boot_cpu(void)
static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
{
+ int cpu;
+
native_smp_prepare_cpus(max_cpus);
WARN_ON(xen_smp_intr_init(0));
xen_init_lock_cpu(0);
+
+ for_each_possible_cpu(cpu) {
+ if (cpu == 0)
+ continue;
+
+ /* Set default vcpu_id to make sure that we don't use cpu-0's */
+ per_cpu(xen_vcpu_id, cpu) = XEN_VCPU_ID_INVALID;
+ }
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -60,4 +71,5 @@ void __init xen_hvm_smp_init(void)
smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu;
+ smp_ops.smp_cpus_done = xen_smp_cpus_done;
}
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index aae32535f4ec..1ea598e5f030 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -371,10 +371,6 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle)
return 0;
}
-static void xen_pv_smp_cpus_done(unsigned int max_cpus)
-{
-}
-
#ifdef CONFIG_HOTPLUG_CPU
static int xen_pv_cpu_disable(void)
{
@@ -469,7 +465,7 @@ static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
static const struct smp_ops xen_smp_ops __initconst = {
.smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu,
.smp_prepare_cpus = xen_pv_smp_prepare_cpus,
- .smp_cpus_done = xen_pv_smp_cpus_done,
+ .smp_cpus_done = xen_smp_cpus_done,
.cpu_up = xen_pv_cpu_up,
.cpu_die = xen_pv_cpu_die,
diff --git a/arch/x86/xen/suspend_hvm.c b/arch/x86/xen/suspend_hvm.c
index 01afcadde50a..484999416d8b 100644
--- a/arch/x86/xen/suspend_hvm.c
+++ b/arch/x86/xen/suspend_hvm.c
@@ -8,15 +8,10 @@
void xen_hvm_post_suspend(int suspend_cancelled)
{
- int cpu;
-
- if (!suspend_cancelled)
+ if (!suspend_cancelled) {
xen_hvm_init_shared_info();
+ xen_vcpu_restore();
+ }
xen_callback_vector();
xen_unplug_emulated_devices();
- if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
- for_each_online_cpu(cpu) {
- xen_setup_runstate_info(cpu);
- }
- }
}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9a440a42c618..0d5004477db6 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -78,7 +78,8 @@ bool xen_vcpu_stolen(int vcpu);
extern int xen_have_vcpu_info_placement;
-void xen_vcpu_setup(int cpu);
+int xen_vcpu_setup(int cpu);
+void xen_vcpu_info_reset(int cpu);
void xen_setup_vcpu_info_placement(void);
#ifdef CONFIG_SMP