diff options
Diffstat (limited to 'arch/x86')
52 files changed, 686 insertions, 518 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f73071742975..25d2c6f7325e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -43,6 +43,7 @@ config X86 select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS if !SWIOTLB select HAVE_KRETPROBES + select GENERIC_EARLY_IOREMAP select HAVE_OPTPROBES select HAVE_KPROBES_ON_FTRACE select HAVE_FTRACE_MCOUNT_RECORD @@ -128,6 +129,7 @@ config X86 select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 select HAVE_CC_STACKPROTECTOR select GENERIC_CPU_AUTOPROBE + select HAVE_ARCH_AUDITSYSCALL config INSTRUCTION_DECODER def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 3b9348a0c1a4..d1b7c377a234 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -108,7 +108,7 @@ else # this works around some issues with generating unwind tables in older gccs # newer gccs do it by default - KBUILD_CFLAGS += -maccumulate-outgoing-args + KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args) endif # Make sure compiler does not have buggy stack-protector support. @@ -250,8 +250,8 @@ archclean: PHONY += kvmconfig kvmconfig: $(if $(wildcard $(objtree)/.config),, $(error You need an existing .config for this target)) - $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m -O $(objtree) $(objtree)/.config arch/x86/configs/kvm_guest.config - $(Q)yes "" | $(MAKE) oldconfig + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m -O $(objtree) $(objtree)/.config $(srctree)/arch/x86/configs/kvm_guest.config + $(Q)yes "" | $(MAKE) -f $(srctree)/Makefile oldconfig define archhelp echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 1e6146137f8e..4703a6c4b8e3 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -112,7 +112,7 @@ __file_size64(void *__fh, efi_char16_t *filename_16, efi_file_info_t *info; efi_status_t status; efi_guid_t info_guid = EFI_FILE_INFO_ID; - u32 info_sz; + u64 info_sz; status = efi_early->call((unsigned long)fh->open, fh, &h, filename_16, EFI_FILE_MODE_READ, (u64)0); @@ -167,31 +167,31 @@ efi_file_size(efi_system_table_t *sys_table, void *__fh, } static inline efi_status_t -efi_file_read(void *__fh, void *handle, unsigned long *size, void *addr) +efi_file_read(void *handle, unsigned long *size, void *addr) { unsigned long func; if (efi_early->is64) { - efi_file_handle_64_t *fh = __fh; + efi_file_handle_64_t *fh = handle; func = (unsigned long)fh->read; return efi_early->call(func, handle, size, addr); } else { - efi_file_handle_32_t *fh = __fh; + efi_file_handle_32_t *fh = handle; func = (unsigned long)fh->read; return efi_early->call(func, handle, size, addr); } } -static inline efi_status_t efi_file_close(void *__fh, void *handle) +static inline efi_status_t efi_file_close(void *handle) { if (efi_early->is64) { - efi_file_handle_64_t *fh = __fh; + efi_file_handle_64_t *fh = handle; return efi_early->call((unsigned long)fh->close, handle); } else { - efi_file_handle_32_t *fh = __fh; + efi_file_handle_32_t *fh = handle; return efi_early->call((unsigned long)fh->close, handle); } @@ -1016,6 +1016,9 @@ void setup_graphics(struct boot_params *boot_params) * Because the x86 boot code expects to be passed a boot_params we * need to create one ourselves (usually the bootloader would create * one for us). + * + * The caller is responsible for filling out ->code32_start in the + * returned boot_params. */ struct boot_params *make_boot_params(struct efi_config *c) { @@ -1081,8 +1084,6 @@ struct boot_params *make_boot_params(struct efi_config *c) hdr->vid_mode = 0xffff; hdr->boot_flag = 0xAA55; - hdr->code32_start = (__u64)(unsigned long)image->image_base; - hdr->type_of_loader = 0x21; /* Convert unicode cmdline to ascii */ diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index de9d4200d305..cbed1407a5cd 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -59,6 +59,7 @@ ENTRY(efi_pe_entry) call make_boot_params cmpl $0, %eax je fail + movl %esi, BP_code32_start(%eax) popl %ecx pushl %eax pushl %ecx @@ -90,12 +91,7 @@ fail: hlt jmp fail 2: - call 3f -3: - popl %eax - subl $3b, %eax - subl BP_pref_address(%esi), %eax - add BP_code32_start(%esi), %eax + movl BP_code32_start(%esi), %eax leal preferred_addr(%eax), %eax jmp *%eax diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 57e58a5fa210..0d558ee899ae 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -261,6 +261,8 @@ ENTRY(efi_pe_entry) cmpq $0,%rax je fail mov %rax, %rsi + leaq startup_32(%rip), %rax + movl %eax, BP_code32_start(%rsi) jmp 2f /* Skip the relocation */ handover_entry: @@ -284,12 +286,7 @@ fail: hlt jmp fail 2: - call 3f -3: - popq %rax - subq $3b, %rax - subq BP_pref_address(%rsi), %rax - add BP_code32_start(%esi), %eax + movl BP_code32_start(%esi), %eax leaq preferred_addr(%rax), %rax jmp *%rax diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 4acddc43ee0c..3ca9762e1649 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -5,5 +5,6 @@ genhdr-y += unistd_64.h genhdr-y += unistd_x32.h generic-y += clkdev.h +generic-y += early_ioremap.h generic-y += cputime.h generic-y += mcs_spinlock.h diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index e6a92455740e..69f1366f1aa3 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -1,7 +1,7 @@ /* * This file is part of the Linux kernel. * - * Copyright (c) 2011, Intel Corporation + * Copyright (c) 2011-2014, Intel Corporation * Authors: Fenghua Yu <fenghua.yu@intel.com>, * H. Peter Anvin <hpa@linux.intel.com> * @@ -31,10 +31,13 @@ #define RDRAND_RETRY_LOOPS 10 #define RDRAND_INT ".byte 0x0f,0xc7,0xf0" +#define RDSEED_INT ".byte 0x0f,0xc7,0xf8" #ifdef CONFIG_X86_64 # define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0" +# define RDSEED_LONG ".byte 0x48,0x0f,0xc7,0xf8" #else # define RDRAND_LONG RDRAND_INT +# define RDSEED_LONG RDSEED_INT #endif #ifdef CONFIG_ARCH_RANDOM @@ -53,6 +56,16 @@ static inline int rdrand_long(unsigned long *v) return ok; } +/* A single attempt at RDSEED */ +static inline bool rdseed_long(unsigned long *v) +{ + unsigned char ok; + asm volatile(RDSEED_LONG "\n\t" + "setc %0" + : "=qm" (ok), "=a" (*v)); + return ok; +} + #define GET_RANDOM(name, type, rdrand, nop) \ static inline int name(type *v) \ { \ @@ -70,18 +83,40 @@ static inline int name(type *v) \ return ok; \ } +#define GET_SEED(name, type, rdseed, nop) \ +static inline int name(type *v) \ +{ \ + unsigned char ok; \ + alternative_io("movb $0, %0\n\t" \ + nop, \ + rdseed "\n\t" \ + "setc %0", \ + X86_FEATURE_RDSEED, \ + ASM_OUTPUT2("=q" (ok), "=a" (*v))); \ + return ok; \ +} + #ifdef CONFIG_X86_64 GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP5); GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP4); +GET_SEED(arch_get_random_seed_long, unsigned long, RDSEED_LONG, ASM_NOP5); +GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4); + #else GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP3); GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3); +GET_SEED(arch_get_random_seed_long, unsigned long, RDSEED_LONG, ASM_NOP4); +GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4); + #endif /* CONFIG_X86_64 */ +#define arch_has_random() static_cpu_has(X86_FEATURE_RDRAND) +#define arch_has_random_seed() static_cpu_has(X86_FEATURE_RDSEED) + #else static inline int rdrand_long(unsigned long *v) @@ -89,6 +124,11 @@ static inline int rdrand_long(unsigned long *v) return 0; } +static inline bool rdseed_long(unsigned long *v) +{ + return 0; +} + #endif /* CONFIG_ARCH_RANDOM */ extern void x86_init_rdrand(struct cpuinfo_x86 *c); diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 2f03ff018d36..ba38ebbaced3 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_BUG_H #define _ASM_X86_BUG_H -#ifdef CONFIG_BUG #define HAVE_ARCH_BUG #ifdef CONFIG_DEBUG_BUGVERBOSE @@ -33,8 +32,6 @@ do { \ } while (0) #endif -#endif /* !CONFIG_BUG */ - #include <asm-generic/bug.h> #endif /* _ASM_X86_BUG_H */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 8dcd35c4c787..43f482a0db37 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -163,5 +163,11 @@ static inline void __set_fixmap(enum fixed_addresses idx, #include <asm-generic/fixmap.h> +#define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) +#define __late_clear_fixmap(idx) __set_fixmap(idx, 0, __pgprot(0)) + +void __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_FIXMAP_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 91d9c69a629e..b8237d8a1e0c 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -39,6 +39,7 @@ #include <linux/string.h> #include <linux/compiler.h> #include <asm/page.h> +#include <asm/early_ioremap.h> #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -316,19 +317,6 @@ extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, unsigned long prot_val); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); -/* - * early_ioremap() and early_iounmap() are for temporary early boot-time - * mappings, before the real ioremap() is functional. - * A boot-time mapping is currently limited to at most 16 pages. - */ -extern void early_ioremap_init(void); -extern void early_ioremap_reset(void); -extern void __iomem *early_ioremap(resource_size_t phys_addr, - unsigned long size); -extern void __iomem *early_memremap(resource_size_t phys_addr, - unsigned long size); -extern void early_iounmap(void __iomem *addr, unsigned long size); -extern void fixup_early_ioremap(void); extern bool is_early_ioremap_ptep(pte_t *ptep); #ifdef CONFIG_XEN diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fcaf9c961265..7de069afb382 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -60,7 +60,7 @@ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 94220d14d5cc..851bcdc5db04 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -52,7 +52,7 @@ * Compared to the generic __my_cpu_offset version, the following * saves one instruction and avoids clobbering a temp register. */ -#define __this_cpu_ptr(ptr) \ +#define raw_cpu_ptr(ptr) \ ({ \ unsigned long tcp_ptr__; \ __verify_pcpu_ptr(ptr); \ @@ -362,25 +362,25 @@ do { \ */ #define this_cpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) -#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define __this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) - -#define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_add_1(pcp, val) percpu_add_op((pcp), val) -#define __this_cpu_add_2(pcp, val) percpu_add_op((pcp), val) -#define __this_cpu_add_4(pcp, val) percpu_add_op((pcp), val) -#define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) -#define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) -#define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) -#define __this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) -#define __this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) -#define __this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) -#define __this_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val) -#define __this_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val) -#define __this_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val) +#define raw_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) + +#define raw_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) +#define raw_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) +#define raw_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) +#define raw_cpu_add_1(pcp, val) percpu_add_op((pcp), val) +#define raw_cpu_add_2(pcp, val) percpu_add_op((pcp), val) +#define raw_cpu_add_4(pcp, val) percpu_add_op((pcp), val) +#define raw_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) +#define raw_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) +#define raw_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) +#define raw_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) +#define raw_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) +#define raw_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) +#define raw_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val) #define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) @@ -401,16 +401,16 @@ do { \ #define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) #define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) -#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) -#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) -#define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) -#define __this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define __this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define __this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) +#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) +#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) +#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) #define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) @@ -427,7 +427,7 @@ do { \ __ret; \ }) -#define __this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double +#define raw_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double #define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double #endif /* CONFIG_X86_CMPXCHG64 */ @@ -436,22 +436,22 @@ do { \ * 32 bit must fall back to generic operations. */ #ifdef CONFIG_X86_64 -#define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) -#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) -#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) -#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) -#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) -#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) - -#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) -#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) -#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) -#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) +#define raw_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) +#define raw_cpu_add_8(pcp, val) percpu_add_op((pcp), val) +#define raw_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) +#define raw_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) +#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) +#define raw_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) +#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) + +#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) +#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) +#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) +#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) +#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) #define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) /* @@ -474,7 +474,7 @@ do { \ __ret; \ }) -#define __this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double +#define raw_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double #define this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double #endif @@ -495,9 +495,9 @@ static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG; #ifdef CONFIG_X86_64 - return ((1UL << (nr % BITS_PER_LONG)) & __this_cpu_read_8(*a)) != 0; + return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0; #else - return ((1UL << (nr % BITS_PER_LONG)) & __this_cpu_read_4(*a)) != 0; + return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_4(*a)) != 0; #endif } diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index c8b051933b1b..7024c12f7bfe 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -19,12 +19,12 @@ DECLARE_PER_CPU(int, __preempt_count); */ static __always_inline int preempt_count(void) { - return __this_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; + return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; } static __always_inline void preempt_count_set(int pc) { - __this_cpu_write_4(__preempt_count, pc); + raw_cpu_write_4(__preempt_count, pc); } /* @@ -53,17 +53,17 @@ static __always_inline void preempt_count_set(int pc) static __always_inline void set_preempt_need_resched(void) { - __this_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); + raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); } static __always_inline void clear_preempt_need_resched(void) { - __this_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); + raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); } static __always_inline bool test_preempt_need_resched(void) { - return !(__this_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); + return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); } /* @@ -72,12 +72,12 @@ static __always_inline bool test_preempt_need_resched(void) static __always_inline void __preempt_count_add(int val) { - __this_cpu_add_4(__preempt_count, val); + raw_cpu_add_4(__preempt_count, val); } static __always_inline void __preempt_count_sub(int val) { - __this_cpu_add_4(__preempt_count, -val); + raw_cpu_add_4(__preempt_count, -val); } /* @@ -95,7 +95,7 @@ static __always_inline bool __preempt_count_dec_and_test(void) */ static __always_inline bool should_resched(void) { - return unlikely(!__this_cpu_read_4(__preempt_count)); + return unlikely(!raw_cpu_read_4(__preempt_count)); } #ifdef CONFIG_PREEMPT diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index aea284b41312..d6a756ae04c8 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -13,7 +13,7 @@ #ifndef _ASM_X86_SYSCALL_H #define _ASM_X86_SYSCALL_H -#include <linux/audit.h> +#include <uapi/linux/audit.h> #include <linux/sched.h> #include <linux/err.h> #include <asm/asm-offsets.h> /* For NR_syscalls */ @@ -91,8 +91,7 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(®s->bx + i, args, n * sizeof(args[0])); } -static inline int syscall_get_arch(struct task_struct *task, - struct pt_regs *regs) +static inline int syscall_get_arch(void) { return AUDIT_ARCH_I386; } @@ -221,8 +220,7 @@ static inline void syscall_set_arguments(struct task_struct *task, } } -static inline int syscall_get_arch(struct task_struct *task, - struct pt_regs *regs) +static inline int syscall_get_arch(void) { #ifdef CONFIG_IA32_EMULATION /* @@ -234,7 +232,7 @@ static inline int syscall_get_arch(struct task_struct *task, * * x32 tasks should be considered AUDIT_ARCH_X86_64. */ - if (task_thread_info(task)->status & TS_COMPAT) + if (task_thread_info(current)->status & TS_COMPAT) return AUDIT_ARCH_I386; #endif /* Both x32 and x86_64 are considered "64-bit". */ diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index e69182fd01cf..4b28159e0421 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -87,7 +87,9 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK; retval = 0; - if (num_cstate_subtype < (cx->address & MWAIT_SUBSTATE_MASK)) { + /* If the HW does not support any sub-states in this C-state */ + if (num_cstate_subtype == 0) { + pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", cx->address, edx_part); retval = -1; goto out; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 481ae38f6a44..ad28db7e6bde 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1996,7 +1996,8 @@ static inline void __smp_error_interrupt(struct pt_regs *regs) }; /* First tickle the hardware, only then report what went on. -- REW */ - apic_write(APIC_ESR, 0); + if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); v = apic_read(APIC_ESR); ack_APIC_irq(); atomic_inc(&irq_err_count); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 0641113e2965..a952e9c85b6f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -1225,21 +1225,24 @@ static struct notifier_block cacheinfo_cpu_notifier = { static int __init cache_sysfs_init(void) { - int i; + int i, err = 0; if (num_cache_leaves == 0) return 0; + cpu_notifier_register_begin(); for_each_online_cpu(i) { - int err; struct device *dev = get_cpu_device(i); err = cache_add_dev(dev); if (err) - return err; + goto out; } - register_hotcpu_notifier(&cacheinfo_cpu_notifier); - return 0; + __register_hotcpu_notifier(&cacheinfo_cpu_notifier); + +out: + cpu_notifier_register_done(); + return err; } device_initcall(cache_sysfs_init); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4d5419b249da..68317c80de7f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -89,6 +89,9 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; +/* CMCI storm detection filter */ +static DEFINE_PER_CPU(unsigned long, mce_polled_error); + /* * MCA banks polled by the period polling timer for corrected events. * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). @@ -614,6 +617,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(m.status & MCI_STATUS_VAL)) continue; + this_cpu_write(mce_polled_error, 1); /* * Uncorrected or signalled events are handled by the exception * handler when it is enabled, so don't process those here. @@ -1278,10 +1282,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval) static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; +static int cmc_error_seen(void) +{ + unsigned long *v = &__get_cpu_var(mce_polled_error); + + return test_and_clear_bit(0, v); +} + static void mce_timer_fn(unsigned long data) { struct timer_list *t = &__get_cpu_var(mce_timer); unsigned long iv; + int notify; WARN_ON(smp_processor_id() != data); @@ -1296,7 +1308,9 @@ static void mce_timer_fn(unsigned long data) * polling interval, otherwise increase the polling interval. */ iv = __this_cpu_read(mce_next_interval); - if (mce_notify_irq()) { + notify = mce_notify_irq(); + notify |= cmc_error_seen(); + if (notify) { iv = max(iv / 2, (unsigned long) HZ/100); } else { iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); @@ -2434,14 +2448,18 @@ static __init int mcheck_init_device(void) if (err) return err; + cpu_notifier_register_begin(); for_each_online_cpu(i) { err = mce_device_create(i); - if (err) + if (err) { + cpu_notifier_register_done(); return err; + } } register_syscore_ops(&mce_syscore_ops); - register_hotcpu_notifier(&mce_cpu_notifier); + __register_hotcpu_notifier(&mce_cpu_notifier); + cpu_notifier_register_done(); /* register character device /dev/mcelog */ misc_register(&mce_chrdev_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index fb6156fee6f7..9a316b21df8b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -9,6 +9,7 @@ #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/sched.h> +#include <linux/cpumask.h> #include <asm/apic.h> #include <asm/processor.h> #include <asm/msr.h> @@ -41,7 +42,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); * cmci_discover_lock protects against parallel discovery attempts * which could race against each other. */ -static DEFINE_RAW_SPINLOCK(cmci_discover_lock); +static DEFINE_SPINLOCK(cmci_discover_lock); #define CMCI_THRESHOLD 1 #define CMCI_POLL_INTERVAL (30 * HZ) @@ -137,6 +138,22 @@ unsigned long mce_intel_adjust_timer(unsigned long interval) } } +static void cmci_storm_disable_banks(void) +{ + unsigned long flags, *owned; + int bank; + u64 val; + + spin_lock_irqsave(&cmci_discover_lock, flags); + owned = __get_cpu_var(mce_banks_owned); + for_each_set_bit(bank, owned, MAX_NR_BANKS) { + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + } + spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + static bool cmci_storm_detect(void) { unsigned int cnt = __this_cpu_read(cmci_storm_cnt); @@ -158,7 +175,7 @@ static bool cmci_storm_detect(void) if (cnt <= CMCI_STORM_THRESHOLD) return false; - cmci_clear(); + cmci_storm_disable_banks(); __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); r = atomic_add_return(1, &cmci_storm_on_cpus); mce_timer_kick(CMCI_POLL_INTERVAL); @@ -194,7 +211,7 @@ static void cmci_discover(int banks) int i; int bios_wrong_thresh = 0; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; int bios_zero_thresh = 0; @@ -249,7 +266,7 @@ static void cmci_discover(int banks) WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); } } - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { pr_info_once( "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); @@ -299,10 +316,10 @@ void cmci_clear(void) if (!cmci_supported(&banks)) return; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) __cmci_disable_bank(i); - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void cmci_rediscover_work_func(void *arg) @@ -343,9 +360,9 @@ void cmci_disable_bank(int bank) if (!cmci_supported(&banks)) return; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); __cmci_disable_bank(bank); - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void intel_init_cmci(void) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 3eec7de76efb..d921b7ee6595 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -271,9 +271,6 @@ static void thermal_throttle_remove_dev(struct device *dev) sysfs_remove_group(&dev->kobj, &thermal_attr_group); } -/* Mutex protecting device creation against CPU hotplug: */ -static DEFINE_MUTEX(therm_cpu_lock); - /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int thermal_throttle_cpu_callback(struct notifier_block *nfb, @@ -289,18 +286,14 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - mutex_lock(&therm_cpu_lock); err = thermal_throttle_add_dev(dev, cpu); - mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - mutex_lock(&therm_cpu_lock); thermal_throttle_remove_dev(dev); - mutex_unlock(&therm_cpu_lock); break; } return notifier_from_errno(err); @@ -319,19 +312,16 @@ static __init int thermal_throttle_init_device(void) if (!atomic_read(&therm_throt_en)) return 0; - register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + cpu_notifier_register_begin(); -#ifdef CONFIG_HOTPLUG_CPU - mutex_lock(&therm_cpu_lock); -#endif /* connect live CPUs to sysfs */ for_each_online_cpu(cpu) { err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu); WARN_ON(err); } -#ifdef CONFIG_HOTPLUG_CPU - mutex_unlock(&therm_cpu_lock); -#endif + + __register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + cpu_notifier_register_done(); return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 4b8e4d3cd6ea..4c36bbe3173a 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -926,13 +926,13 @@ static __init int amd_ibs_init(void) goto out; perf_ibs_pm_init(); - get_online_cpus(); + cpu_notifier_register_begin(); ibs_caps = caps; /* make ibs_caps visible to other cpus: */ smp_mb(); - perf_cpu_notifier(perf_ibs_cpu_notifier); smp_call_function(setup_APIC_ibs, NULL, 1); - put_online_cpus(); + __perf_cpu_notifier(perf_ibs_cpu_notifier); + cpu_notifier_register_done(); ret = perf_event_ibs_init(); out: diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index 754291adec33..3bbdf4cd38b9 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -531,15 +531,16 @@ static int __init amd_uncore_init(void) if (ret) return -ENODEV; - get_online_cpus(); + cpu_notifier_register_begin(); + /* init cpus already online before registering for hotplug notifier */ for_each_online_cpu(cpu) { amd_uncore_cpu_up_prepare(cpu); smp_call_function_single(cpu, init_cpu_already_online, NULL, 1); } - register_cpu_notifier(&amd_uncore_cpu_notifier_block); - put_online_cpus(); + __register_cpu_notifier(&amd_uncore_cpu_notifier_block); + cpu_notifier_register_done(); return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 5ad35ad94d0f..7c87424d4140 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -59,7 +59,7 @@ #define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ #define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ #define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ -#define RAPL_IDX_PP1_NRG_STAT 3 /* DRAM */ +#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ #define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ /* Clients have PP0, PKG */ @@ -72,6 +72,12 @@ 1<<RAPL_IDX_PKG_NRG_STAT|\ 1<<RAPL_IDX_RAM_NRG_STAT) +/* Servers have PP0, PKG, RAM, PP1 */ +#define RAPL_IDX_HSW (1<<RAPL_IDX_PP0_NRG_STAT|\ + 1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_RAM_NRG_STAT|\ + 1<<RAPL_IDX_PP1_NRG_STAT) + /* * event code: LSB 8 bits, passed in attr->config * any other bit is reserved @@ -425,6 +431,24 @@ static struct attribute *rapl_events_cln_attr[] = { NULL, }; +static struct attribute *rapl_events_hsw_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_gpu), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_gpu_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_gpu_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + static struct attribute_group rapl_pmu_events_group = { .name = "events", .attrs = NULL, /* patched at runtime */ @@ -511,6 +535,7 @@ static int rapl_cpu_prepare(int cpu) struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); int phys_id = topology_physical_package_id(cpu); u64 ms; + u64 msr_rapl_power_unit_bits; if (pmu) return 0; @@ -518,6 +543,9 @@ static int rapl_cpu_prepare(int cpu) if (phys_id < 0) return -1; + if (!rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + return -1; + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); if (!pmu) return -1; @@ -531,8 +559,7 @@ static int rapl_cpu_prepare(int cpu) * * we cache in local PMU instance */ - rdmsrl(MSR_RAPL_POWER_UNIT, pmu->hw_unit); - pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL; + pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; pmu->pmu = &rapl_pmu_class; /* @@ -631,11 +658,14 @@ static int __init rapl_pmu_init(void) switch (boot_cpu_data.x86_model) { case 42: /* Sandy Bridge */ case 58: /* Ivy Bridge */ - case 60: /* Haswell */ - case 69: /* Haswell-Celeron */ rapl_cntr_mask = RAPL_IDX_CLN; rapl_pmu_events_group.attrs = rapl_events_cln_attr; break; + case 60: /* Haswell */ + case 69: /* Haswell-Celeron */ + rapl_cntr_mask = RAPL_IDX_HSW; + rapl_pmu_events_group.attrs = rapl_events_hsw_attr; + break; case 45: /* Sandy Bridge-EP */ case 62: /* IvyTown */ rapl_cntr_mask = RAPL_IDX_SRV; @@ -646,19 +676,22 @@ static int __init rapl_pmu_init(void) /* unsupported */ return 0; } - get_online_cpus(); + + cpu_notifier_register_begin(); for_each_online_cpu(cpu) { - rapl_cpu_prepare(cpu); + ret = rapl_cpu_prepare(cpu); + if (ret) + goto out; rapl_cpu_init(cpu); } - perf_cpu_notifier(rapl_cpu_notifier); + __perf_cpu_notifier(rapl_cpu_notifier); ret = perf_pmu_register(&rapl_pmu_class, "power", -1); if (WARN_ON(ret)) { pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); - put_online_cpus(); + cpu_notifier_register_done(); return -1; } @@ -672,7 +705,8 @@ static int __init rapl_pmu_init(void) hweight32(rapl_cntr_mask), ktime_to_ms(pmu->timer_interval)); - put_online_cpus(); +out: + cpu_notifier_register_done(); return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index bd2253d40cff..65bbbea38b9c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -4244,7 +4244,7 @@ static void __init uncore_cpumask_init(void) if (!cpumask_empty(&uncore_cpu_mask)) return; - get_online_cpus(); + cpu_notifier_register_begin(); for_each_online_cpu(cpu) { int i, phys_id = topology_physical_package_id(cpu); @@ -4263,9 +4263,9 @@ static void __init uncore_cpumask_init(void) } on_each_cpu(uncore_cpu_setup, NULL, 1); - register_cpu_notifier(&uncore_cpu_nb); + __register_cpu_notifier(&uncore_cpu_nb); - put_online_cpus(); + cpu_notifier_register_done(); } diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 7d9481c743f8..3225ae6c5180 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -198,14 +198,15 @@ static int __init cpuid_init(void) goto out_chrdev; } cpuid_class->devnode = cpuid_devnode; - get_online_cpus(); + + cpu_notifier_register_begin(); for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) goto out_class; } - register_hotcpu_notifier(&cpuid_class_cpu_notifier); - put_online_cpus(); + __register_hotcpu_notifier(&cpuid_class_cpu_notifier); + cpu_notifier_register_done(); err = 0; goto out; @@ -215,7 +216,7 @@ out_class: for_each_online_cpu(i) { cpuid_device_destroy(i); } - put_online_cpus(); + cpu_notifier_register_done(); class_destroy(cpuid_class); out_chrdev: __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); @@ -227,13 +228,13 @@ static void __exit cpuid_exit(void) { int cpu = 0; - get_online_cpus(); + cpu_notifier_register_begin(); for_each_online_cpu(cpu) cpuid_device_destroy(cpu); class_destroy(cpuid_class); __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); - unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); - put_online_cpus(); + __unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); + cpu_notifier_register_done(); } module_init(cpuid_init); diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 6d7d5a1260a6..6e2537c32190 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -225,7 +225,7 @@ static void __init intel_remapping_check(int num, int slot, int func) * * And yes, so far on current devices the base addr is always under 4G. */ -static u32 __init intel_stolen_base(int num, int slot, int func) +static u32 __init intel_stolen_base(int num, int slot, int func, size_t stolen_size) { u32 base; @@ -240,10 +240,118 @@ static u32 __init intel_stolen_base(int num, int slot, int func) return base; } -#define KB(x) ((x) * 1024) +#define KB(x) ((x) * 1024UL) #define MB(x) (KB (KB (x))) #define GB(x) (MB (KB (x))) +static size_t __init i830_tseg_size(void) +{ + u8 tmp = read_pci_config_byte(0, 0, 0, I830_ESMRAMC); + + if (!(tmp & TSEG_ENABLE)) + return 0; + + if (tmp & I830_TSEG_SIZE_1M) + return MB(1); + else + return KB(512); +} + +static size_t __init i845_tseg_size(void) +{ + u8 tmp = read_pci_config_byte(0, 0, 0, I845_ESMRAMC); + + if (!(tmp & TSEG_ENABLE)) + return 0; + + switch (tmp & I845_TSEG_SIZE_MASK) { + case I845_TSEG_SIZE_512K: + return KB(512); + case I845_TSEG_SIZE_1M: + return MB(1); + default: + WARN_ON(1); + return 0; + } +} + +static size_t __init i85x_tseg_size(void) +{ + u8 tmp = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC); + + if (!(tmp & TSEG_ENABLE)) + return 0; + + return MB(1); +} + +static size_t __init i830_mem_size(void) +{ + return read_pci_config_byte(0, 0, 0, I830_DRB3) * MB(32); +} + +static size_t __init i85x_mem_size(void) +{ + return read_pci_config_byte(0, 0, 1, I85X_DRB3) * MB(32); +} + +/* + * On 830/845/85x the stolen memory base isn't available in any + * register. We need to calculate it as TOM-TSEG_SIZE-stolen_size. + */ +static u32 __init i830_stolen_base(int num, int slot, int func, size_t stolen_size) +{ + return i830_mem_size() - i830_tseg_size() - stolen_size; +} + +static u32 __init i845_stolen_base(int num, int slot, int func, size_t stolen_size) +{ + return i830_mem_size() - i845_tseg_size() - stolen_size; +} + +static u32 __init i85x_stolen_base(int num, int slot, int func, size_t stolen_size) +{ + return i85x_mem_size() - i85x_tseg_size() - stolen_size; +} + +static u32 __init i865_stolen_base(int num, int slot, int func, size_t stolen_size) +{ + /* + * FIXME is the graphics stolen memory region + * always at TOUD? Ie. is it always the last + * one to be allocated by the BIOS? + */ + return read_pci_config_16(0, 0, 0, I865_TOUD) << 16; +} + +static size_t __init i830_stolen_size(int num, int slot, int func) +{ + size_t stolen_size; + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); + + switch (gmch_ctrl & I830_GMCH_GMS_MASK) { + case I830_GMCH_GMS_STOLEN_512: + stolen_size = KB(512); + break; + case I830_GMCH_GMS_STOLEN_1024: + stolen_size = MB(1); + break; + case I830_GMCH_GMS_STOLEN_8192: + stolen_size = MB(8); + break; + case I830_GMCH_GMS_LOCAL: + /* local memory isn't part of the normal address space */ + stolen_size = 0; + break; + default: + return 0; + } + + return stolen_size; +} + static size_t __init gen3_stolen_size(int num, int slot, int func) { size_t stolen_size; @@ -310,7 +418,7 @@ static size_t __init gen6_stolen_size(int num, int slot, int func) return gmch_ctrl << 25; /* 32 MB units */ } -static inline size_t gen8_stolen_size(int num, int slot, int func) +static size_t gen8_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; @@ -320,31 +428,74 @@ static inline size_t gen8_stolen_size(int num, int slot, int func) return gmch_ctrl << 25; /* 32 MB units */ } -typedef size_t (*stolen_size_fn)(int num, int slot, int func); + +struct intel_stolen_funcs { + size_t (*size)(int num, int slot, int func); + u32 (*base)(int num, int slot, int func, size_t size); +}; + +static const struct intel_stolen_funcs i830_stolen_funcs = { + .base = i830_stolen_base, + .size = i830_stolen_size, +}; + +static const struct intel_stolen_funcs i845_stolen_funcs = { + .base = i845_stolen_base, + .size = i830_stolen_size, +}; + +static const struct intel_stolen_funcs i85x_stolen_funcs = { + .base = i85x_stolen_base, + .size = gen3_stolen_size, +}; + +static const struct intel_stolen_funcs i865_stolen_funcs = { + .base = i865_stolen_base, + .size = gen3_stolen_size, +}; + +static const struct intel_stolen_funcs gen3_stolen_funcs = { + .base = intel_stolen_base, + .size = gen3_stolen_size, +}; + +static const struct intel_stolen_funcs gen6_stolen_funcs = { + .base = intel_stolen_base, + .size = gen6_stolen_size, +}; + +static const struct intel_stolen_funcs gen8_stolen_funcs = { + .base = intel_stolen_base, + .size = gen8_stolen_size, +}; static struct pci_device_id intel_stolen_ids[] __initdata = { - INTEL_I915G_IDS(gen3_stolen_size), - INTEL_I915GM_IDS(gen3_stolen_size), - INTEL_I945G_IDS(gen3_stolen_size), - INTEL_I945GM_IDS(gen3_stolen_size), - INTEL_VLV_M_IDS(gen6_stolen_size), - INTEL_VLV_D_IDS(gen6_stolen_size), - INTEL_PINEVIEW_IDS(gen3_stolen_size), - INTEL_I965G_IDS(gen3_stolen_size), - INTEL_G33_IDS(gen3_stolen_size), - INTEL_I965GM_IDS(gen3_stolen_size), - INTEL_GM45_IDS(gen3_stolen_size), - INTEL_G45_IDS(gen3_stolen_size), - INTEL_IRONLAKE_D_IDS(gen3_stolen_size), - INTEL_IRONLAKE_M_IDS(gen3_stolen_size), - INTEL_SNB_D_IDS(gen6_stolen_size), - INTEL_SNB_M_IDS(gen6_stolen_size), - INTEL_IVB_M_IDS(gen6_stolen_size), - INTEL_IVB_D_IDS(gen6_stolen_size), - INTEL_HSW_D_IDS(gen6_stolen_size), - INTEL_HSW_M_IDS(gen6_stolen_size), - INTEL_BDW_M_IDS(gen8_stolen_size), - INTEL_BDW_D_IDS(gen8_stolen_size) + INTEL_I830_IDS(&i830_stolen_funcs), + INTEL_I845G_IDS(&i845_stolen_funcs), + INTEL_I85X_IDS(&i85x_stolen_funcs), + INTEL_I865G_IDS(&i865_stolen_funcs), + INTEL_I915G_IDS(&gen3_stolen_funcs), + INTEL_I915GM_IDS(&gen3_stolen_funcs), + INTEL_I945G_IDS(&gen3_stolen_funcs), + INTEL_I945GM_IDS(&gen3_stolen_funcs), + INTEL_VLV_M_IDS(&gen6_stolen_funcs), + INTEL_VLV_D_IDS(&gen6_stolen_funcs), + INTEL_PINEVIEW_IDS(&gen3_stolen_funcs), + INTEL_I965G_IDS(&gen3_stolen_funcs), + INTEL_G33_IDS(&gen3_stolen_funcs), + INTEL_I965GM_IDS(&gen3_stolen_funcs), + INTEL_GM45_IDS(&gen3_stolen_funcs), + INTEL_G45_IDS(&gen3_stolen_funcs), + INTEL_IRONLAKE_D_IDS(&gen3_stolen_funcs), + INTEL_IRONLAKE_M_IDS(&gen3_stolen_funcs), + INTEL_SNB_D_IDS(&gen6_stolen_funcs), + INTEL_SNB_M_IDS(&gen6_stolen_funcs), + INTEL_IVB_M_IDS(&gen6_stolen_funcs), + INTEL_IVB_D_IDS(&gen6_stolen_funcs), + INTEL_HSW_D_IDS(&gen6_stolen_funcs), + INTEL_HSW_M_IDS(&gen6_stolen_funcs), + INTEL_BDW_M_IDS(&gen8_stolen_funcs), + INTEL_BDW_D_IDS(&gen8_stolen_funcs) }; static void __init intel_graphics_stolen(int num, int slot, int func) @@ -361,11 +512,13 @@ static void __init intel_graphics_stolen(int num, int slot, int func) for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) { if (intel_stolen_ids[i].device == device) { - stolen_size_fn stolen_size = - (stolen_size_fn)intel_stolen_ids[i].driver_data; - size = stolen_size(num, slot, func); - start = intel_stolen_base(num, slot, func); + const struct intel_stolen_funcs *stolen_funcs = + (const struct intel_stolen_funcs *)intel_stolen_ids[i].driver_data; + size = stolen_funcs->size(num, slot, func); + start = stolen_funcs->base(num, slot, func, size); if (size && start) { + printk(KERN_INFO "Reserving Intel graphics stolen memory at 0x%x-0x%x\n", + start, start + (u32)size - 1); /* Mark this space as reserved */ e820_add_region(start, size, E820_RESERVED); sanitize_e820_map(e820.map, diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 93eed15a8fd4..8d80ae011603 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -941,12 +941,14 @@ static __init int hpet_late_init(void) if (boot_cpu_has(X86_FEATURE_ARAT)) return 0; + cpu_notifier_register_begin(); for_each_online_cpu(cpu) { hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); } /* This notifier should be called after workqueue is ready */ - hotcpu_notifier(hpet_cpuhp_notify, -20); + __hotcpu_notifier(hpet_cpuhp_notify, -20); + cpu_notifier_register_done(); return 0; } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 42805fac0092..283a76a9cc40 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -125,7 +125,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); seq_printf(p, " Machine check polls\n"); #endif -#if defined(CONFIG_HYPERV) || defined(CONFIG_XEN) +#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 79a3f9682871..61b17dc2c277 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -897,9 +897,10 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) struct kprobe *cur = kprobe_running(); struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - switch (kcb->kprobe_status) { - case KPROBE_HIT_SS: - case KPROBE_REENTER: + if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) { + /* This must happen on single-stepping */ + WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS && + kcb->kprobe_status != KPROBE_REENTER); /* * We are here because the instruction being single * stepped caused a page fault. We reset the current @@ -914,9 +915,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) else reset_current_kprobe(); preempt_enable_no_resched(); - break; - case KPROBE_HIT_ACTIVE: - case KPROBE_HIT_SSDONE: + } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE || + kcb->kprobe_status == KPROBE_HIT_SSDONE) { /* * We increment the nmissed count for accounting, * we can also use npre/npostfault count for accounting @@ -945,10 +945,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) * fixup routine could not handle it, * Let do_page_fault() fix it. */ - break; - default: - break; } + return 0; } diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ebc987398923..af1d14a9ebda 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -229,6 +229,17 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) } } + /* + * On x86-64 we do not support 16-bit segments due to + * IRET leaking the high bits of the kernel stack address. + */ +#ifdef CONFIG_X86_64 + if (!ldt_info.seg_32bit) { + error = -EINVAL; + goto out_unlock; + } +#endif + fill_ldt(&ldt, &ldt_info); if (oldmode) ldt.avl = 0; diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 05266b5aae22..c9603ac80de5 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -259,14 +259,15 @@ static int __init msr_init(void) goto out_chrdev; } msr_class->devnode = msr_devnode; - get_online_cpus(); + + cpu_notifier_register_begin(); for_each_online_cpu(i) { err = msr_device_create(i); if (err != 0) goto out_class; } - register_hotcpu_notifier(&msr_class_cpu_notifier); - put_online_cpus(); + __register_hotcpu_notifier(&msr_class_cpu_notifier); + cpu_notifier_register_done(); err = 0; goto out; @@ -275,7 +276,7 @@ out_class: i = 0; for_each_online_cpu(i) msr_device_destroy(i); - put_online_cpus(); + cpu_notifier_register_done(); class_destroy(msr_class); out_chrdev: __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); @@ -286,13 +287,14 @@ out: static void __exit msr_exit(void) { int cpu = 0; - get_online_cpus(); + + cpu_notifier_register_begin(); for_each_online_cpu(cpu) msr_device_destroy(cpu); class_destroy(msr_class); __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); - unregister_hotcpu_notifier(&msr_class_cpu_notifier); - put_online_cpus(); + __unregister_hotcpu_notifier(&msr_class_cpu_notifier); + cpu_notifier_register_done(); } module_init(msr_init); diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 299d49302e7d..0497f719977d 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1207,23 +1207,31 @@ error: return ret; } -static inline int __init determine_tce_table_size(u64 ram) +static inline int __init determine_tce_table_size(void) { int ret; if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED) return specified_table_size; - /* - * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to - * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each - * larger table size has twice as many entries, so shift the - * max ram address by 13 to divide by 8K and then look at the - * order of the result to choose between 0-7. - */ - ret = get_order(ram >> 13); - if (ret > TCE_TABLE_SIZE_8M) + if (is_kdump_kernel() && saved_max_pfn) { + /* + * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to + * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each + * larger table size has twice as many entries, so shift the + * max ram address by 13 to divide by 8K and then look at the + * order of the result to choose between 0-7. + */ + ret = get_order((saved_max_pfn * PAGE_SIZE) >> 13); + if (ret > TCE_TABLE_SIZE_8M) + ret = TCE_TABLE_SIZE_8M; + } else { + /* + * Use 8M by default (suggested by Muli) if it's not + * kdump kernel and saved_max_pfn isn't set. + */ ret = TCE_TABLE_SIZE_8M; + } return ret; } @@ -1418,8 +1426,7 @@ int __init detect_calgary(void) return -ENOMEM; } - specified_table_size = determine_tce_table_size((is_kdump_kernel() ? - saved_max_pfn : max_pfn) * PAGE_SIZE); + specified_table_size = determine_tce_table_size(); for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { struct calgary_bus_info *info = &bus_info[bus]; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 654b46574b91..3399d3a99730 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -114,8 +114,8 @@ EXPORT_SYMBOL(machine_real_restart); */ static int __init set_pci_reboot(const struct dmi_system_id *d) { - if (reboot_type != BOOT_CF9) { - reboot_type = BOOT_CF9; + if (reboot_type != BOOT_CF9_FORCE) { + reboot_type = BOOT_CF9_FORCE; pr_info("%s series board detected. Selecting %s-method for reboots.\n", d->ident, "PCI"); } @@ -458,20 +458,23 @@ void __attribute__((weak)) mach_reboot_fixups(void) } /* - * Windows compatible x86 hardware expects the following on reboot: + * To the best of our knowledge Windows compatible x86 hardware expects + * the following on reboot: * * 1) If the FADT has the ACPI reboot register flag set, try it * 2) If still alive, write to the keyboard controller * 3) If still alive, write to the ACPI reboot register again * 4) If still alive, write to the keyboard controller again * 5) If still alive, call the EFI runtime service to reboot - * 6) If still alive, write to the PCI IO port 0xCF9 to reboot - * 7) If still alive, inform BIOS to do a proper reboot + * 6) If no EFI runtime service, call the BIOS to do a reboot * - * If the machine is still alive at this stage, it gives up. We default to - * following the same pattern, except that if we're still alive after (7) we'll - * try to force a triple fault and then cycle between hitting the keyboard - * controller and doing that + * We default to following the same pattern. We also have + * two other reboot methods: 'triple fault' and 'PCI', which + * can be triggered via the reboot= kernel boot option or + * via quirks. + * + * This means that this function can never return, it can misbehave + * by not rebooting properly and hanging. */ static void native_machine_emergency_restart(void) { @@ -492,6 +495,11 @@ static void native_machine_emergency_restart(void) for (;;) { /* Could also try the reset bit in the Hammer NB */ switch (reboot_type) { + case BOOT_ACPI: + acpi_reboot(); + reboot_type = BOOT_KBD; + break; + case BOOT_KBD: mach_reboot_fixups(); /* For board specific fixups */ @@ -509,43 +517,29 @@ static void native_machine_emergency_restart(void) } break; - case BOOT_TRIPLE: - load_idt(&no_idt); - __asm__ __volatile__("int3"); - - /* We're probably dead after this, but... */ - reboot_type = BOOT_KBD; - break; - - case BOOT_BIOS: - machine_real_restart(MRR_BIOS); - - /* We're probably dead after this, but... */ - reboot_type = BOOT_TRIPLE; - break; - - case BOOT_ACPI: - acpi_reboot(); - reboot_type = BOOT_KBD; - break; - case BOOT_EFI: if (efi_enabled(EFI_RUNTIME_SERVICES)) efi.reset_system(reboot_mode == REBOOT_WARM ? EFI_RESET_WARM : EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); - reboot_type = BOOT_CF9_COND; + reboot_type = BOOT_BIOS; + break; + + case BOOT_BIOS: + machine_real_restart(MRR_BIOS); + + /* We're probably dead after this, but... */ + reboot_type = BOOT_CF9_SAFE; break; - case BOOT_CF9: + case BOOT_CF9_FORCE: port_cf9_safe = true; /* Fall through */ - case BOOT_CF9_COND: + case BOOT_CF9_SAFE: if (port_cf9_safe) { - u8 reboot_code = reboot_mode == REBOOT_WARM ? - 0x06 : 0x0E; + u8 reboot_code = reboot_mode == REBOOT_WARM ? 0x06 : 0x0E; u8 cf9 = inb(0xcf9) & ~reboot_code; outb(cf9|2, 0xcf9); /* Request hard reset */ udelay(50); @@ -553,7 +547,15 @@ static void native_machine_emergency_restart(void) outb(cf9|reboot_code, 0xcf9); udelay(50); } - reboot_type = BOOT_BIOS; + reboot_type = BOOT_TRIPLE; + break; + + case BOOT_TRIPLE: + load_idt(&no_idt); + __asm__ __volatile__("int3"); + + /* We're probably dead after this, but... */ + reboot_type = BOOT_KBD; break; } } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 9ea287666c65..8b3b3eb3cead 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -348,9 +348,13 @@ static int __init vsyscall_init(void) { BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)); + cpu_notifier_register_begin(); + on_each_cpu(cpu_vsyscall_init, NULL, 1); /* notifier priority > KVM */ - hotcpu_notifier(cpu_vsyscall_notifier, 30); + __hotcpu_notifier(cpu_vsyscall_notifier, 30); + + cpu_notifier_register_done(); return 0; } diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index bea60671ef8a..f47a104a749c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -308,7 +308,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | - F(ADX); + F(ADX) | F(SMAP); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a2a1bb7ed8c1..eeecbed26ac7 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -48,6 +48,14 @@ static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_SMEP)); } +static inline bool guest_cpuid_has_smap(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_SMAP)); +} + static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f5704d9e5ddc..813d31038b93 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3601,20 +3601,27 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, } } -static void update_permission_bitmask(struct kvm_vcpu *vcpu, +void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool ept) { unsigned bit, byte, pfec; u8 map; - bool fault, x, w, u, wf, uf, ff, smep; + bool fault, x, w, u, wf, uf, ff, smapf, cr4_smap, cr4_smep, smap = 0; - smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { pfec = byte << 1; map = 0; wf = pfec & PFERR_WRITE_MASK; uf = pfec & PFERR_USER_MASK; ff = pfec & PFERR_FETCH_MASK; + /* + * PFERR_RSVD_MASK bit is set in PFEC if the access is not + * subject to SMAP restrictions, and cleared otherwise. The + * bit is only meaningful if the SMAP bit is set in CR4. + */ + smapf = !(pfec & PFERR_RSVD_MASK); for (bit = 0; bit < 8; ++bit) { x = bit & ACC_EXEC_MASK; w = bit & ACC_WRITE_MASK; @@ -3626,12 +3633,33 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, /* Allow supervisor writes if !cr0.wp */ w |= !is_write_protection(vcpu) && !uf; /* Disallow supervisor fetches of user code if cr4.smep */ - x &= !(smep && u && !uf); + x &= !(cr4_smep && u && !uf); + + /* + * SMAP:kernel-mode data accesses from user-mode + * mappings should fault. A fault is considered + * as a SMAP violation if all of the following + * conditions are ture: + * - X86_CR4_SMAP is set in CR4 + * - An user page is accessed + * - Page fault in kernel mode + * - if CPL = 3 or X86_EFLAGS_AC is clear + * + * Here, we cover the first three conditions. + * The fourth is computed dynamically in + * permission_fault() and is in smapf. + * + * Also, SMAP does not affect instruction + * fetches, add the !ff check here to make it + * clearer. + */ + smap = cr4_smap && u && !uf && !ff; } else /* Not really needed: no U/S accesses on ept */ u = 1; - fault = (ff && !x) || (uf && !u) || (wf && !w); + fault = (ff && !x) || (uf && !u) || (wf && !w) || + (smapf && smap); map |= fault << bit; } mmu->permissions[byte] = map; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 292615274358..3842e70bdb7c 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,11 +44,17 @@ #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) +#define PFERR_PRESENT_BIT 0 +#define PFERR_WRITE_BIT 1 +#define PFERR_USER_BIT 2 +#define PFERR_RSVD_BIT 3 +#define PFERR_FETCH_BIT 4 + +#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) +#define PFERR_USER_MASK (1U << PFERR_USER_BIT) +#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); @@ -73,6 +79,8 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly); +void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + bool ept); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { @@ -110,10 +118,30 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) * Will a fault with a given page-fault error code (pfec) cause a permission * fault with the given access (in ACC_* format)? */ -static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access, - unsigned pfec) +static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + unsigned pte_access, unsigned pfec) { - return (mmu->permissions[pfec >> 1] >> pte_access) & 1; + int cpl = kvm_x86_ops->get_cpl(vcpu); + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + + /* + * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1. + * + * If CPL = 3, SMAP applies to all supervisor-mode data accesses + * (these are implicit supervisor accesses) regardless of the value + * of EFLAGS.AC. + * + * This computes (cpl < 3) && (rflags & X86_EFLAGS_AC), leaving + * the result in X86_EFLAGS_AC. We then insert it in place of + * the PFERR_RSVD_MASK bit; this bit will always be zero in pfec, + * but it will be one in index if SMAP checks are being overridden. + * It is important to keep this branchless. + */ + unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); + int index = (pfec >> 1) + + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + + return (mmu->permissions[index] >> pte_access) & 1; } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index b1e6c1bf68d3..123efd3ec29f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -353,7 +353,7 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); - if (unlikely(permission_fault(mmu, pte_access, access))) { + if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) { errcode |= PFERR_PRESENT_MASK; goto error; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1320e0f8e611..1f68c5831924 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3484,13 +3484,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; /* - * SMEP is disabled if CPU is in non-paging mode in - * hardware. However KVM always uses paging mode to + * SMEP/SMAP is disabled if CPU is in non-paging mode + * in hardware. However KVM always uses paging mode to * emulate guest non-paging mode with TDP. - * To emulate this behavior, SMEP needs to be manually - * disabled when guest switches to non-paging mode. + * To emulate this behavior, SMEP/SMAP needs to be + * manually disabled when guest switches to non-paging + * mode. */ - hw_cr4 &= ~X86_CR4_SMEP; + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d1c55f8722c6..8b8fc0b792ba 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -652,6 +652,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) return 1; + if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP)) + return 1; + if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE)) return 1; @@ -680,6 +683,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); + if ((cr4 ^ old_cr4) & X86_CR4_SMAP) + update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false); + if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) kvm_update_cpuid(vcpu); @@ -1117,7 +1123,6 @@ static inline u64 get_kernel_ns(void) { struct timespec ts; - WARN_ON(preemptible()); ktime_get_ts(&ts); monotonic_to_bootbased(&ts); return timespec_to_ns(&ts); @@ -4164,7 +4169,8 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, | (write ? PFERR_WRITE_MASK : 0); if (vcpu_match_mmio_gva(vcpu, gva) - && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) { + && !permission_fault(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.access, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -5422,7 +5428,8 @@ static void kvm_timer_init(void) int cpu; max_tsc_khz = tsc_khz; - register_hotcpu_notifier(&kvmclock_cpu_notifier_block); + + cpu_notifier_register_begin(); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { #ifdef CONFIG_CPU_FREQ struct cpufreq_policy policy; @@ -5439,6 +5446,10 @@ static void kvm_timer_init(void) pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); for_each_online_cpu(cpu) smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); + + __register_hotcpu_notifier(&kvmclock_cpu_notifier_block); + cpu_notifier_register_done(); + } static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 799580cabc78..597ac155c91c 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -328,17 +328,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr) return; } -static int __initdata early_ioremap_debug; - -static int __init early_ioremap_debug_setup(char *str) -{ - early_ioremap_debug = 1; - - return 0; -} -early_param("early_ioremap_debug", early_ioremap_debug_setup); - -static __initdata int after_paging_init; static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) @@ -362,18 +351,11 @@ bool __init is_early_ioremap_ptep(pte_t *ptep) return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)]; } -static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; - void __init early_ioremap_init(void) { pmd_t *pmd; - int i; - if (early_ioremap_debug) - printk(KERN_INFO "early_ioremap_init()\n"); - - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) - slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); + early_ioremap_setup(); pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); memset(bm_pte, 0, sizeof(bm_pte)); @@ -402,13 +384,8 @@ void __init early_ioremap_init(void) } } -void __init early_ioremap_reset(void) -{ - after_paging_init = 1; -} - -static void __init __early_set_fixmap(enum fixed_addresses idx, - phys_addr_t phys, pgprot_t flags) +void __init __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) { unsigned long addr = __fix_to_virt(idx); pte_t *pte; @@ -425,198 +402,3 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, pte_clear(&init_mm, addr, pte); __flush_tlb_one(addr); } - -static inline void __init early_set_fixmap(enum fixed_addresses idx, - phys_addr_t phys, pgprot_t prot) -{ - if (after_paging_init) - __set_fixmap(idx, phys, prot); - else - __early_set_fixmap(idx, phys, prot); -} - -static inline void __init early_clear_fixmap(enum fixed_addresses idx) -{ - if (after_paging_init) - clear_fixmap(idx); - else - __early_set_fixmap(idx, 0, __pgprot(0)); -} - -static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; -static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; - -void __init fixup_early_ioremap(void) -{ - int i; - - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { - if (prev_map[i]) { - WARN_ON(1); - break; - } - } - - early_ioremap_init(); -} - -static int __init check_early_ioremap_leak(void) -{ - int count = 0; - int i; - - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) - if (prev_map[i]) - count++; - - if (!count) - return 0; - WARN(1, KERN_WARNING - "Debug warning: early ioremap leak of %d areas detected.\n", - count); - printk(KERN_WARNING - "please boot with early_ioremap_debug and report the dmesg.\n"); - - return 1; -} -late_initcall(check_early_ioremap_leak); - -static void __init __iomem * -__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) -{ - unsigned long offset; - resource_size_t last_addr; - unsigned int nrpages; - enum fixed_addresses idx; - int i, slot; - - WARN_ON(system_state != SYSTEM_BOOTING); - - slot = -1; - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { - if (!prev_map[i]) { - slot = i; - break; - } - } - - if (slot < 0) { - printk(KERN_INFO "%s(%08llx, %08lx) not found slot\n", - __func__, (u64)phys_addr, size); - WARN_ON(1); - return NULL; - } - - if (early_ioremap_debug) { - printk(KERN_INFO "%s(%08llx, %08lx) [%d] => ", - __func__, (u64)phys_addr, size, slot); - dump_stack(); - } - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) { - WARN_ON(1); - return NULL; - } - - prev_size[slot] = size; - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr + 1) - phys_addr; - - /* - * Mappings have to fit in the FIX_BTMAP area. - */ - nrpages = size >> PAGE_SHIFT; - if (nrpages > NR_FIX_BTMAPS) { - WARN_ON(1); - return NULL; - } - - /* - * Ok, go for it.. - */ - idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; - while (nrpages > 0) { - early_set_fixmap(idx, phys_addr, prot); - phys_addr += PAGE_SIZE; - --idx; - --nrpages; - } - if (early_ioremap_debug) - printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]); - - prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); - return prev_map[slot]; -} - -/* Remap an IO device */ -void __init __iomem * -early_ioremap(resource_size_t phys_addr, unsigned long size) -{ - return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); -} - -/* Remap memory */ -void __init __iomem * -early_memremap(resource_size_t phys_addr, unsigned long size) -{ - return __early_ioremap(phys_addr, size, PAGE_KERNEL); -} - -void __init early_iounmap(void __iomem *addr, unsigned long size) -{ - unsigned long virt_addr; - unsigned long offset; - unsigned int nrpages; - enum fixed_addresses idx; - int i, slot; - - slot = -1; - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { - if (prev_map[i] == addr) { - slot = i; - break; - } - } - - if (slot < 0) { - printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n", - addr, size); - WARN_ON(1); - return; - } - - if (prev_size[slot] != size) { - printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", - addr, size, slot, prev_size[slot]); - WARN_ON(1); - return; - } - - if (early_ioremap_debug) { - printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, - size, slot); - dump_stack(); - } - - virt_addr = (unsigned long)addr; - if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) { - WARN_ON(1); - return; - } - offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; - - idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; - while (nrpages > 0) { - early_clear_fixmap(idx); - --idx; - --nrpages; - } - prev_map[slot] = NULL; -} diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index d87dd6d042d6..dd89a13f1051 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -78,10 +78,16 @@ early_initcall(kmemcheck_init); */ static int __init param_kmemcheck(char *str) { + int val; + int ret; + if (!str) return -EINVAL; - sscanf(str, "%d", &kmemcheck_enabled); + ret = kstrtoint(str, 0, &val); + if (ret) + return ret; + kmemcheck_enabled = val; return 0; } diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index a69bcb8c7621..4dd8cf652579 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -127,7 +127,7 @@ static int __init parse_reservetop(char *arg) address = memparse(arg, &arg); reserve_top_address(address); - fixup_early_ioremap(); + early_ioremap_init(); return 0; } early_param("reservetop", parse_reservetop); diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 6890d8498e0b..379e8bd0deea 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -494,14 +494,19 @@ static int nmi_setup(void) if (err) goto fail; + cpu_notifier_register_begin(); + + /* Use get/put_online_cpus() to protect 'nmi_enabled' */ get_online_cpus(); - register_cpu_notifier(&oprofile_cpu_nb); nmi_enabled = 1; /* make nmi_enabled visible to the nmi handler: */ smp_mb(); on_each_cpu(nmi_cpu_setup, NULL, 1); + __register_cpu_notifier(&oprofile_cpu_nb); put_online_cpus(); + cpu_notifier_register_done(); + return 0; fail: free_msrs(); @@ -512,12 +517,18 @@ static void nmi_shutdown(void) { struct op_msrs *msrs; + cpu_notifier_register_begin(); + + /* Use get/put_online_cpus() to protect 'nmi_enabled' & 'ctr_running' */ get_online_cpus(); - unregister_cpu_notifier(&oprofile_cpu_nb); on_each_cpu(nmi_cpu_shutdown, NULL, 1); nmi_enabled = 0; ctr_running = 0; + __unregister_cpu_notifier(&oprofile_cpu_nb); put_online_cpus(); + + cpu_notifier_register_done(); + /* make variables visible to the nmi handler: */ smp_mb(); unregister_nmi_handler(NMI_LOCAL, "oprofile"); diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index a313a7fb6b86..e88f4c53d7f6 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -370,10 +370,13 @@ static int __init pci_io_ecs_init(void) if (early_pci_allowed()) pci_enable_pci_io_ecs(); - register_cpu_notifier(&amd_cpu_notifier); + cpu_notifier_register_begin(); for_each_online_cpu(cpu) amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE, (void *)(long)cpu); + __register_cpu_notifier(&amd_cpu_notifier); + cpu_notifier_register_done(); + pci_probe |= PCI_HAS_IO_ECS; return 0; diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile index f325af26107c..3323c2745248 100644 --- a/arch/x86/syscalls/Makefile +++ b/arch/x86/syscalls/Makefile @@ -54,5 +54,7 @@ syshdr-$(CONFIG_X86_64) += syscalls_64.h targets += $(uapisyshdr-y) $(syshdr-y) +PHONY += all all: $(addprefix $(uapi)/,$(uapisyshdr-y)) all: $(addprefix $(out)/,$(syshdr-y)) + @: diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 96bc506ac6de..d6b867921612 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -359,3 +359,4 @@ 350 i386 finit_module sys_finit_module 351 i386 sched_setattr sys_sched_setattr 352 i386 sched_getattr sys_sched_getattr +353 i386 renameat2 sys_renameat2 diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index e8120346903b..604a37efd4d5 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -40,4 +40,6 @@ $(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/ina HOST_EXTRACFLAGS += -I$(srctree)/tools/include hostprogs-y += relocs relocs-objs := relocs_32.o relocs_64.o relocs_common.o +PHONY += relocs relocs: $(obj)/relocs + @: diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index a18eadd8bb40..7005974c3ff3 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -441,10 +441,11 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) irq_ctx_init(cpu); #else clear_tsk_thread_flag(idle, TIF_FORK); +#endif per_cpu(kernel_stack, cpu) = (unsigned long)task_stack_page(idle) - KERNEL_STACK_OFFSET + THREAD_SIZE; -#endif + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_init_lock_cpu(cpu); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 4d3acc34a998..0ba5f3b967f0 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -274,7 +274,7 @@ void __init xen_init_spinlocks(void) printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); return; } - + printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); pv_lock_ops.unlock_kick = xen_unlock_kick; } @@ -290,6 +290,9 @@ static __init int xen_init_spinlocks_jump(void) if (!xen_pvspin) return 0; + if (!xen_domain()) + return 0; + static_key_slow_inc(¶virt_ticketlocks_enabled); return 0; } diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 33ca6e42a4ca..fd92a64d748e 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -75,6 +75,17 @@ ENDPROC(xen_sysexit) * stack state in whatever form its in, we keep things simple by only * using a single register which is pushed/popped on the stack. */ + +.macro POP_FS +1: + popw %fs +.pushsection .fixup, "ax" +2: movw $0, (%esp) + jmp 1b +.popsection + _ASM_EXTABLE(1b,2b) +.endm + ENTRY(xen_iret) /* test eflags for special cases */ testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) @@ -83,15 +94,13 @@ ENTRY(xen_iret) push %eax ESP_OFFSET=4 # bytes pushed onto stack - /* - * Store vcpu_info pointer for easy access. Do it this way to - * avoid having to reload %fs - */ + /* Store vcpu_info pointer for easy access */ #ifdef CONFIG_SMP - GET_THREAD_INFO(%eax) - movl %ss:TI_cpu(%eax), %eax - movl %ss:__per_cpu_offset(,%eax,4), %eax - mov %ss:xen_vcpu(%eax), %eax + pushw %fs + movl $(__KERNEL_PERCPU), %eax + movl %eax, %fs + movl %fs:xen_vcpu, %eax + POP_FS #else movl %ss:xen_vcpu, %eax #endif |