From f0133acc7d4835cfbb86393b7d2a4fba7519585b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 8 May 2016 20:58:40 +0200 Subject: x86/cpu: Correct comments and messages in P4 erratum 037 handling code Remove the linebreak in the conditional and s/errata/erratum/ as the singular is "erratum". No functionality change. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1462733920-7224-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index f71a34944b56..5354080f76c3 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -263,15 +263,14 @@ static void intel_workarounds(struct cpuinfo_x86 *c) } /* - * P4 Xeon errata 037 workaround. + * P4 Xeon erratum 037 workaround. * Hardware prefetcher may cause stale data to be loaded into the cache. */ if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { if (msr_set_bit(MSR_IA32_MISC_ENABLE, - MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) - > 0) { + MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) { pr_info("CPU: C0 stepping P4 Xeon detected.\n"); - pr_info("CPU: Disabling hardware prefetching (Errata 037)\n"); + pr_info("CPU: Disabling hardware prefetching (Erratum 037)\n"); } } -- cgit v1.2.3 From 70b8301f6b8f7bc053377a9cbd0c4e42e29d9807 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 19 May 2016 17:09:55 -0700 Subject: x86/topology: Add topology_max_smt_threads() For SMT specific workarounds it is useful to know if SMT is active on any online CPU in the system. This currently requires a loop over all online CPUs. Add a global variable that is updated with the maximum number of smt threads on any CPU on online/offline, and use it for topology_max_smt_threads() The single call is easier to use than a loop. Not exported to user space because user space already can use the existing sibling interfaces to find this out. 
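A minimal illustrative sketch of the intended use (not part of this patch; the helper name smt_active_anywhere() is a hypothetical example, only topology_max_smt_threads() comes from the change):

#include <linux/topology.h>

/* True if any online core is currently running more than one SMT thread. */
static bool smt_active_anywhere(void)
{
	return topology_max_smt_threads() > 1;
}

This is the single call the changelog refers to, replacing a loop over all online CPUs.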
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: http://lkml.kernel.org/r/1463703002-19686-2-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/topology.h | 9 +++++++++ arch/x86/kernel/smpboot.c | 25 ++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 7f991bd5031b..e346572841a0 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -129,6 +129,14 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); extern unsigned int __max_logical_packages; #define topology_max_packages() (__max_logical_packages) + +extern int __max_smt_threads; + +static inline int topology_max_smt_threads(void) +{ + return __max_smt_threads; +} + int topology_update_package_map(unsigned int apicid, unsigned int cpu); extern int topology_phys_to_logical_pkg(unsigned int pkg); #else @@ -136,6 +144,7 @@ extern int topology_phys_to_logical_pkg(unsigned int pkg); static inline int topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; } static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } +static inline int topology_max_smt_threads(void) { return 1; } #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fafe8b923cac..2ed0ec1353f8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -105,6 +105,9 @@ static unsigned int max_physical_pkg_id __read_mostly; unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); +/* Maximum number of SMT threads on any online core */ +int __max_smt_threads __read_mostly; + static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { unsigned long flags; @@ -493,7 +496,7 @@ void set_cpu_sibling_map(int cpu) bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1; struct cpuinfo_x86 *c = &cpu_data(cpu); struct cpuinfo_x86 *o; - int i; + int i, threads; cpumask_set_cpu(cpu, cpu_sibling_setup_mask); @@ -550,6 +553,10 @@ void set_cpu_sibling_map(int cpu) if (match_die(c, o) && !topology_same_node(c, o)) primarily_use_numa_for_topology(); } + + threads = cpumask_weight(topology_sibling_cpumask(cpu)); + if (threads > __max_smt_threads) + __max_smt_threads = threads; } /* maps the cpu to the sched domain representing multi-core */ @@ -1441,6 +1448,21 @@ __init void prefill_possible_map(void) #ifdef CONFIG_HOTPLUG_CPU +/* Recompute SMT state for all CPUs on offline */ +static void recompute_smt_state(void) +{ + int max_threads, cpu; + + max_threads = 0; + for_each_online_cpu (cpu) { + int threads = cpumask_weight(topology_sibling_cpumask(cpu)); + + if (threads > max_threads) + max_threads = threads; + } + __max_smt_threads = max_threads; +} + static void remove_siblinginfo(int cpu) { int sibling; @@ -1465,6 +1487,7 @@ static void remove_siblinginfo(int cpu) c->phys_proc_id = 0; c->cpu_core_id = 0; cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); + recompute_smt_state(); } static void remove_cpu_from_maps(int cpu) -- cgit v1.2.3 From d1898b733619bd46194bd25aa6452d238ff2dc4e Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 1 Jun 2016 10:42:20 -0700 Subject: 
x86/fpu: Add tracepoints to dump FPU state at key points I've been carrying this patch around for a bit and it's helped me solve at least a couple FPU-related bugs. In addition to using it for debugging, I also drug it out because using AVX (and AVX2/AVX-512) can have serious power consequences for a modern core. It's very important to be able to figure out who is using it. It's also insanely useful to go out and see who is using a given feature, like MPX or Memory Protection Keys. If you, for instance, want to find all processes using protection keys, you can do: echo 'xfeatures & 0x200' > filter Since 0x200 is the protection keys feature bit. Note that this touches the KVM code. KVM did a CREATE_TRACE_POINTS and then included a bunch of random headers. If anyone one of those included other tracepoints, it would have defined the *OTHER* tracepoints. That's bogus, so move it to the right place. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160601174220.3CDFB90E@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 5 ++ arch/x86/include/asm/trace/fpu.h | 119 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/fpu/core.c | 18 ++++++ arch/x86/kernel/fpu/signal.c | 3 + arch/x86/kvm/x86.c | 6 +- 5 files changed, 148 insertions(+), 3 deletions(-) create mode 100644 arch/x86/include/asm/trace/fpu.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 31ac8e6d9f36..116b58347501 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -18,6 +18,7 @@ #include #include #include +#include /* * High level FPU state handling functions: @@ -524,6 +525,7 @@ static inline void __fpregs_deactivate(struct fpu *fpu) fpu->fpregs_active = 0; this_cpu_write(fpu_fpregs_owner_ctx, NULL); + trace_x86_fpu_regs_deactivated(fpu); } /* Must be paired with a 'clts' (fpregs_activate_hw()) before! */ @@ -533,6 +535,7 @@ static inline void __fpregs_activate(struct fpu *fpu) fpu->fpregs_active = 1; this_cpu_write(fpu_fpregs_owner_ctx, fpu); + trace_x86_fpu_regs_activated(fpu); } /* @@ -604,11 +607,13 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* But leave fpu_fpregs_owner_ctx! */ old_fpu->fpregs_active = 0; + trace_x86_fpu_regs_deactivated(old_fpu); /* Don't change CR0.TS if we just switch! 
*/ if (fpu.preload) { new_fpu->counter++; __fpregs_activate(new_fpu); + trace_x86_fpu_regs_activated(new_fpu); prefetch(&new_fpu->state); } else { __fpregs_deactivate_hw(); diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h new file mode 100644 index 000000000000..9217ab1f5bf6 --- /dev/null +++ b/arch/x86/include/asm/trace/fpu.h @@ -0,0 +1,119 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM x86_fpu + +#if !defined(_TRACE_FPU_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FPU_H + +#include + +DECLARE_EVENT_CLASS(x86_fpu, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu), + + TP_STRUCT__entry( + __field(struct fpu *, fpu) + __field(bool, fpregs_active) + __field(bool, fpstate_active) + __field(int, counter) + __field(u64, xfeatures) + __field(u64, xcomp_bv) + ), + + TP_fast_assign( + __entry->fpu = fpu; + __entry->fpregs_active = fpu->fpregs_active; + __entry->fpstate_active = fpu->fpstate_active; + __entry->counter = fpu->counter; + if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { + __entry->xfeatures = fpu->state.xsave.header.xfeatures; + __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; + } + ), + TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d counter: %d xfeatures: %llx xcomp_bv: %llx", + __entry->fpu, + __entry->fpregs_active, + __entry->fpstate_active, + __entry->counter, + __entry->xfeatures, + __entry->xcomp_bv + ) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_state, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_before_save, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_after_save, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_before_restore, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_after_restore, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_activate_state, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_deactivate_state, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_init_state, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_dropped, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_copy_src, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +); + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH asm/trace/ +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE fpu +#endif /* _TRACE_FPU_H */ + +/* This part must be outside protection */ +#include diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 97027545a72d..7d564742e499 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -12,6 +12,9 @@ #include +#define CREATE_TRACE_POINTS +#include + /* * Represents the initial FPU state. 
It's mostly (but not completely) zeroes, * depending on the FPU hardware format: @@ -192,6 +195,7 @@ void fpu__save(struct fpu *fpu) WARN_ON_FPU(fpu != ¤t->thread.fpu); preempt_disable(); + trace_x86_fpu_before_save(fpu); if (fpu->fpregs_active) { if (!copy_fpregs_to_fpstate(fpu)) { if (use_eager_fpu()) @@ -200,6 +204,7 @@ void fpu__save(struct fpu *fpu) fpregs_deactivate(fpu); } } + trace_x86_fpu_after_save(fpu); preempt_enable(); } EXPORT_SYMBOL_GPL(fpu__save); @@ -275,6 +280,9 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) } preempt_enable(); + trace_x86_fpu_copy_src(src_fpu); + trace_x86_fpu_copy_dst(dst_fpu); + return 0; } @@ -288,7 +296,9 @@ void fpu__activate_curr(struct fpu *fpu) if (!fpu->fpstate_active) { fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); + trace_x86_fpu_activate_state(fpu); /* Safe to do for the current task: */ fpu->fpstate_active = 1; } @@ -314,7 +324,9 @@ void fpu__activate_fpstate_read(struct fpu *fpu) } else { if (!fpu->fpstate_active) { fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); + trace_x86_fpu_activate_state(fpu); /* Safe to do for current and for stopped child tasks: */ fpu->fpstate_active = 1; } @@ -347,7 +359,9 @@ void fpu__activate_fpstate_write(struct fpu *fpu) fpu->last_cpu = -1; } else { fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); + trace_x86_fpu_activate_state(fpu); /* Safe to do for stopped child tasks: */ fpu->fpstate_active = 1; } @@ -432,9 +446,11 @@ void fpu__restore(struct fpu *fpu) /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ kernel_fpu_disable(); + trace_x86_fpu_before_restore(fpu); fpregs_activate(fpu); copy_kernel_to_fpregs(&fpu->state); fpu->counter++; + trace_x86_fpu_after_restore(fpu); kernel_fpu_enable(); } EXPORT_SYMBOL_GPL(fpu__restore); @@ -463,6 +479,8 @@ void fpu__drop(struct fpu *fpu) fpu->fpstate_active = 0; + trace_x86_fpu_dropped(fpu); + preempt_enable(); } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 31c6a60505e6..c6f2a3cee2c2 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -10,6 +10,7 @@ #include #include +#include static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; @@ -282,6 +283,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ state_size = sizeof(struct fxregs_state); fx_only = 1; + trace_x86_fpu_xstate_check_failed(fpu); } else { state_size = fx_sw_user.xstate_size; xfeatures = fx_sw_user.xfeatures; @@ -311,6 +313,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) || __copy_from_user(&env, buf, sizeof(env))) { fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); err = -1; } else { sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 902d9da12392..1ba3b7d3cae9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -55,9 +55,6 @@ #include #include -#define CREATE_TRACE_POINTS -#include "trace.h" - #include #include #include @@ -68,6 +65,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include "trace.h" + #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) -- cgit v1.2.3 From f5967101e9de12addcda4510dfbac66d7c5779c3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 30 May 2016 12:56:27 +0200 Subject: x86/hweight: Get rid of the special calling convention People complained about ARCH_HWEIGHT_CFLAGS and how 
it throws a wrench into kcov, lto, etc, experimentations. Add asm versions for __sw_hweight{32,64}() and do explicit saving and restoring of clobbered registers. This gets rid of the special calling convention. We get to call those functions on !X86_FEATURE_POPCNT CPUs. We still need to hardcode POPCNT and register operands as some old gas versions which we support, do not know about POPCNT. Btw, remove redundant REX prefix from 32-bit POPCNT because alternatives can do padding now. Suggested-by: H. Peter Anvin Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1464605787-20603-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 5 --- arch/x86/include/asm/arch_hweight.h | 24 +++++------- arch/x86/kernel/i386_ksyms_32.c | 2 + arch/x86/kernel/x8664_ksyms_64.c | 3 ++ arch/x86/lib/Makefile | 2 +- arch/x86/lib/hweight.S | 77 +++++++++++++++++++++++++++++++++++++ lib/Makefile | 5 --- lib/hweight.c | 4 ++ 8 files changed, 97 insertions(+), 25 deletions(-) create mode 100644 arch/x86/lib/hweight.S (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0a7b885964ba..729d41d9ced3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -294,11 +294,6 @@ config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR -config ARCH_HWEIGHT_CFLAGS - string - default "-fcall-saved-ecx -fcall-saved-edx" if X86_32 - default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64 - config ARCH_SUPPORTS_UPROBES def_bool y diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index 02e799fa43d1..e7cd63175de4 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -4,8 +4,8 @@ #include #ifdef CONFIG_64BIT -/* popcnt %edi, %eax -- redundant REX prefix for alignment */ -#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7" +/* popcnt %edi, %eax */ +#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc7" /* popcnt %rdi, %rax */ #define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7" #define REG_IN "D" @@ -17,19 +17,15 @@ #define REG_OUT "a" #endif -/* - * __sw_hweightXX are called from within the alternatives below - * and callee-clobbered registers need to be taken care of. See - * ARCH_HWEIGHT_CFLAGS in for the respective - * compiler switches. 
- */ +#define __HAVE_ARCH_SW_HWEIGHT + static __always_inline unsigned int __arch_hweight32(unsigned int w) { - unsigned int res = 0; + unsigned int res; asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT) - : "="REG_OUT (res) - : REG_IN (w)); + : "="REG_OUT (res) + : REG_IN (w)); return res; } @@ -53,11 +49,11 @@ static inline unsigned long __arch_hweight64(__u64 w) #else static __always_inline unsigned long __arch_hweight64(__u64 w) { - unsigned long res = 0; + unsigned long res; asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT) - : "="REG_OUT (res) - : REG_IN (w)); + : "="REG_OUT (res) + : REG_IN (w)); return res; } diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 64341aa485ae..d40ee8a38fed 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -42,3 +42,5 @@ EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(___preempt_schedule); EXPORT_SYMBOL(___preempt_schedule_notrace); #endif + +EXPORT_SYMBOL(__sw_hweight32); diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index cd05942bc918..f1aebfb49c36 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -44,6 +44,9 @@ EXPORT_SYMBOL(clear_page); EXPORT_SYMBOL(csum_partial); +EXPORT_SYMBOL(__sw_hweight32); +EXPORT_SYMBOL(__sw_hweight64); + /* * Export string functions. We normally rely on gcc builtin for most of these, * but gcc sometimes decides not to inline them. diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 72a576752a7e..ec969cc3eb20 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -25,7 +25,7 @@ lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o -obj-y += msr.o msr-reg.o msr-reg-export.o +obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S new file mode 100644 index 000000000000..02de3d74d2c5 --- /dev/null +++ b/arch/x86/lib/hweight.S @@ -0,0 +1,77 @@ +#include + +#include + +/* + * unsigned int __sw_hweight32(unsigned int w) + * %rdi: w + */ +ENTRY(__sw_hweight32) + +#ifdef CONFIG_X86_64 + movl %edi, %eax # w +#endif + __ASM_SIZE(push,) %__ASM_REG(dx) + movl %eax, %edx # w -> t + shrl %edx # t >>= 1 + andl $0x55555555, %edx # t &= 0x55555555 + subl %edx, %eax # w -= t + + movl %eax, %edx # w -> t + shrl $2, %eax # w_tmp >>= 2 + andl $0x33333333, %edx # t &= 0x33333333 + andl $0x33333333, %eax # w_tmp &= 0x33333333 + addl %edx, %eax # w = w_tmp + t + + movl %eax, %edx # w -> t + shrl $4, %edx # t >>= 4 + addl %edx, %eax # w_tmp += t + andl $0x0f0f0f0f, %eax # w_tmp &= 0x0f0f0f0f + imull $0x01010101, %eax, %eax # w_tmp *= 0x01010101 + shrl $24, %eax # w = w_tmp >> 24 + __ASM_SIZE(pop,) %__ASM_REG(dx) + ret +ENDPROC(__sw_hweight32) + +ENTRY(__sw_hweight64) +#ifdef CONFIG_X86_64 + pushq %rdx + + movq %rdi, %rdx # w -> t + movabsq $0x5555555555555555, %rax + shrq %rdx # t >>= 1 + andq %rdx, %rax # t &= 0x5555555555555555 + movabsq $0x3333333333333333, %rdx + subq %rax, %rdi # w -= t + + movq %rdi, %rax # w -> t + shrq $2, %rdi # w_tmp >>= 2 + andq %rdx, %rax # t &= 0x3333333333333333 + andq %rdi, %rdx # w_tmp &= 0x3333333333333333 + addq %rdx, %rax # w = w_tmp + t + + movq %rax, %rdx # w -> t + shrq $4, %rdx # t >>= 4 + addq %rdx, %rax # w_tmp += t + movabsq $0x0f0f0f0f0f0f0f0f, %rdx + andq %rdx, %rax # w_tmp &= 0x0f0f0f0f0f0f0f0f + movabsq $0x0101010101010101, 
%rdx + imulq %rdx, %rax # w_tmp *= 0x0101010101010101 + shrq $56, %rax # w = w_tmp >> 56 + + popq %rdx + ret +#else /* CONFIG_X86_32 */ + /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */ + pushl %ecx + + call __sw_hweight32 + movl %eax, %ecx # stash away result + movl %edx, %eax # second part of input + call __sw_hweight32 + addl %ecx, %eax # result + + popl %ecx + ret +#endif +ENDPROC(__sw_hweight64) diff --git a/lib/Makefile b/lib/Makefile index ff6a7a6c6395..07d06a8b9788 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -15,9 +15,6 @@ KCOV_INSTRUMENT_rbtree.o := n KCOV_INSTRUMENT_list_debug.o := n KCOV_INSTRUMENT_debugobjects.o := n KCOV_INSTRUMENT_dynamic_debug.o := n -# Kernel does not boot if we instrument this file as it uses custom calling -# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS). -KCOV_INSTRUMENT_hweight.o := n lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ @@ -74,8 +71,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o -GCOV_PROFILE_hweight.o := n -CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o obj-$(CONFIG_BTREE) += btree.o diff --git a/lib/hweight.c b/lib/hweight.c index 9a5c1f221558..43273a7d83cf 100644 --- a/lib/hweight.c +++ b/lib/hweight.c @@ -9,6 +9,7 @@ * The Hamming Weight of a number is the total number of bits set in it. */ +#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned int __sw_hweight32(unsigned int w) { #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER @@ -25,6 +26,7 @@ unsigned int __sw_hweight32(unsigned int w) #endif } EXPORT_SYMBOL(__sw_hweight32); +#endif unsigned int __sw_hweight16(unsigned int w) { @@ -43,6 +45,7 @@ unsigned int __sw_hweight8(unsigned int w) } EXPORT_SYMBOL(__sw_hweight8); +#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned long __sw_hweight64(__u64 w) { #if BITS_PER_LONG == 32 @@ -65,3 +68,4 @@ unsigned long __sw_hweight64(__u64 w) #endif } EXPORT_SYMBOL(__sw_hweight64); +#endif -- cgit v1.2.3 From 2823d4da5d8a0c222747b24eceb65f5b30717d02 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 8 Jun 2016 12:38:37 -0700 Subject: x86, bitops: remove use of "sbb" to return CF Use SETC instead of SBB to return the value of CF from assembly. Using SETcc enables uniformity with other flags-returning pieces of assembly code. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1465414726-197858-2-git-send-email-hpa@linux.intel.com Reviewed-by: Andy Lutomirski Reviewed-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) --- arch/x86/include/asm/bitops.h | 24 ++++++++++++------------ arch/x86/include/asm/percpu.h | 12 ++++++------ arch/x86/include/asm/signal.h | 6 +++--- arch/x86/include/asm/sync_bitops.h | 18 +++++++++--------- arch/x86/kernel/vm86_32.c | 5 +---- 5 files changed, 31 insertions(+), 34 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 7766d1cf096e..b2b797d1f49a 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -230,11 +230,11 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr) */ static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr) { - int oldbit; + unsigned char oldbit; asm("bts %2,%1\n\t" - "sbb %0,%0" - : "=r" (oldbit), ADDR + "setc %0" + : "=qm" (oldbit), ADDR : "Ir" (nr)); return oldbit; } @@ -270,11 +270,11 @@ static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *a */ static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) { - int oldbit; + unsigned char oldbit; asm volatile("btr %2,%1\n\t" - "sbb %0,%0" - : "=r" (oldbit), ADDR + "setc %0" + : "=qm" (oldbit), ADDR : "Ir" (nr)); return oldbit; } @@ -282,11 +282,11 @@ static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long /* WARNING: non atomic and it can be reordered! */ static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr) { - int oldbit; + unsigned char oldbit; asm volatile("btc %2,%1\n\t" - "sbb %0,%0" - : "=r" (oldbit), ADDR + "setc %0" + : "=qm" (oldbit), ADDR : "Ir" (nr) : "memory"); return oldbit; @@ -313,11 +313,11 @@ static __always_inline int constant_test_bit(long nr, const volatile unsigned lo static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr) { - int oldbit; + unsigned char oldbit; asm volatile("bt %2,%1\n\t" - "sbb %0,%0" - : "=r" (oldbit) + "setc %0" + : "=qm" (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr)); return oldbit; diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index e0ba66ca68c6..65039e9571db 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -510,9 +510,9 @@ do { \ /* This is not atomic against other CPUs -- CPU preemption needs to be off */ #define x86_test_and_clear_bit_percpu(bit, var) \ ({ \ - int old__; \ - asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \ - : "=r" (old__), "+m" (var) \ + unsigned char old__; \ + asm volatile("btr %2,"__percpu_arg(1)"\n\tsetc %0" \ + : "=qm" (old__), "+m" (var) \ : "dIr" (bit)); \ old__; \ }) @@ -532,11 +532,11 @@ static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, static inline int x86_this_cpu_variable_test_bit(int nr, const unsigned long __percpu *addr) { - int oldbit; + unsigned char oldbit; asm volatile("bt "__percpu_arg(2)",%1\n\t" - "sbb %0,%0" - : "=r" (oldbit) + "setc %0" + : "=qm" (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr)); return oldbit; diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 2138c9ae19ee..dd1e7d6387ab 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -81,9 +81,9 @@ static inline int __const_sigismember(sigset_t *set, int _sig) static inline int __gen_sigismember(sigset_t *set, int _sig) { - int ret; - 
asm("btl %2,%1\n\tsbbl %0,%0" - : "=r"(ret) : "m"(*set), "Ir"(_sig-1) : "cc"); + unsigned char ret; + asm("btl %2,%1\n\tsetc %0" + : "=qm"(ret) : "m"(*set), "Ir"(_sig-1) : "cc"); return ret; } diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index f28a24b51dc7..cbf8847d02a0 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -79,10 +79,10 @@ static inline void sync_change_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr) { - int oldbit; + unsigned char oldbit; - asm volatile("lock; bts %2,%1\n\tsbbl %0,%0" - : "=r" (oldbit), "+m" (ADDR) + asm volatile("lock; bts %2,%1\n\tsetc %0" + : "=qm" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; } @@ -97,10 +97,10 @@ static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) { - int oldbit; + unsigned char oldbit; - asm volatile("lock; btr %2,%1\n\tsbbl %0,%0" - : "=r" (oldbit), "+m" (ADDR) + asm volatile("lock; btr %2,%1\n\tsetc %0" + : "=qm" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; } @@ -115,10 +115,10 @@ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr) { - int oldbit; + unsigned char oldbit; - asm volatile("lock; btc %2,%1\n\tsbbl %0,%0" - : "=r" (oldbit), "+m" (ADDR) + asm volatile("lock; btc %2,%1\n\tsetc %0" + : "=qm" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; } diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 3dce1ca0a653..01f30e56f99e 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -440,10 +440,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs) static inline int is_revectored(int nr, struct revectored_struct *bitmap) { - __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0" - :"=r" (nr) - :"m" (*bitmap), "r" (nr)); - return nr; + return test_bit(nr, bitmap->__map); } #define val_byte(val, n) (((__u8 *)&val)[n]) -- cgit v1.2.3 From 3b290398638ee4e57f1fb2e35c02005cba9a737f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 8 Jun 2016 12:38:46 -0700 Subject: x86, asm: Use CC_SET()/CC_OUT() and static_cpu_has() in archrandom.h Use CC_SET()/CC_OUT() and static_cpu_has(). This produces code good enough to eliminate ad hoc use of alternatives in , greatly simplifying the code. While we are at it, make x86_init_rdrand() compile out completely if we don't need it. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1465414726-197858-11-git-send-email-hpa@linux.intel.com v2: fix a conflict between and discovered by Ingo Molnar. There are a few places in x86-specific code where we need all of even when CONFIG_ARCH_RANDOM is disabled, so does not suffice. 
--- arch/x86/include/asm/archrandom.h | 128 ++++++++++++++++++-------------------- arch/x86/kernel/cpu/rdrand.c | 4 +- 2 files changed, 62 insertions(+), 70 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index ab6f599ce2fd..5b0579abb398 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -25,8 +25,6 @@ #include #include -#include -#include #define RDRAND_RETRY_LOOPS 10 @@ -40,97 +38,91 @@ # define RDSEED_LONG RDSEED_INT #endif -#ifdef CONFIG_ARCH_RANDOM +/* Unconditional execution of RDRAND and RDSEED */ -/* Instead of arch_get_random_long() when alternatives haven't run. */ static inline bool rdrand_long(unsigned long *v) { - int ok; - asm volatile("1: " RDRAND_LONG "\n\t" - "jc 2f\n\t" - "decl %0\n\t" - "jnz 1b\n\t" - "2:" - : "=r" (ok), "=a" (*v) - : "0" (RDRAND_RETRY_LOOPS)); - return !!ok; + bool ok; + unsigned int retry = RDRAND_RETRY_LOOPS; + do { + asm volatile(RDRAND_LONG "\n\t" + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); + if (ok) + return true; + } while (--retry); + return false; +} + +static inline bool rdrand_int(unsigned int *v) +{ + bool ok; + unsigned int retry = RDRAND_RETRY_LOOPS; + do { + asm volatile(RDRAND_INT "\n\t" + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); + if (ok) + return true; + } while (--retry); + return false; } -/* A single attempt at RDSEED */ static inline bool rdseed_long(unsigned long *v) { bool ok; asm volatile(RDSEED_LONG "\n\t" - "setc %0" - : "=qm" (ok), "=a" (*v)); + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); return ok; } -#define GET_RANDOM(name, type, rdrand, nop) \ -static inline bool name(type *v) \ -{ \ - int ok; \ - alternative_io("movl $0, %0\n\t" \ - nop, \ - "\n1: " rdrand "\n\t" \ - "jc 2f\n\t" \ - "decl %0\n\t" \ - "jnz 1b\n\t" \ - "2:", \ - X86_FEATURE_RDRAND, \ - ASM_OUTPUT2("=r" (ok), "=a" (*v)), \ - "0" (RDRAND_RETRY_LOOPS)); \ - return !!ok; \ -} - -#define GET_SEED(name, type, rdseed, nop) \ -static inline bool name(type *v) \ -{ \ - bool ok; \ - alternative_io("movb $0, %0\n\t" \ - nop, \ - rdseed "\n\t" \ - "setc %0", \ - X86_FEATURE_RDSEED, \ - ASM_OUTPUT2("=q" (ok), "=a" (*v))); \ - return ok; \ +static inline bool rdseed_int(unsigned int *v) +{ + bool ok; + asm volatile(RDSEED_INT "\n\t" + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); + return ok; } -#ifdef CONFIG_X86_64 - -GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP5); -GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP4); - -GET_SEED(arch_get_random_seed_long, unsigned long, RDSEED_LONG, ASM_NOP5); -GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4); - -#else - -GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP3); -GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3); - -GET_SEED(arch_get_random_seed_long, unsigned long, RDSEED_LONG, ASM_NOP4); -GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4); - -#endif /* CONFIG_X86_64 */ - +/* Conditional execution based on CPU type */ #define arch_has_random() static_cpu_has(X86_FEATURE_RDRAND) #define arch_has_random_seed() static_cpu_has(X86_FEATURE_RDSEED) -#else +/* + * These are the generic interfaces; they must not be declared if the + * stubs in are to be invoked, + * i.e. CONFIG_ARCH_RANDOM is not defined. + */ +#ifdef CONFIG_ARCH_RANDOM -static inline bool rdrand_long(unsigned long *v) +static inline bool arch_get_random_long(unsigned long *v) { - return 0; + return arch_has_random() ? 
rdrand_long(v) : false; } -static inline bool rdseed_long(unsigned long *v) +static inline bool arch_get_random_int(unsigned int *v) { - return 0; + return arch_has_random() ? rdrand_int(v) : false; } -#endif /* CONFIG_ARCH_RANDOM */ +static inline bool arch_get_random_seed_long(unsigned long *v) +{ + return arch_has_random_seed() ? rdseed_long(v) : false; +} + +static inline bool arch_get_random_seed_int(unsigned int *v) +{ + return arch_has_random_seed() ? rdseed_int(v) : false; +} extern void x86_init_rdrand(struct cpuinfo_x86 *c); +#else /* !CONFIG_ARCH_RANDOM */ + +static inline void x86_init_rdrand(struct cpuinfo_x86 *c) { } + +#endif /* !CONFIG_ARCH_RANDOM */ + #endif /* ASM_X86_ARCHRANDOM_H */ diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index f6f50c4ceaec..cfa97ff67bda 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -39,9 +39,9 @@ __setup("nordrand", x86_rdrand_setup); */ #define SANITY_CHECK_LOOPS 8 +#ifdef CONFIG_ARCH_RANDOM void x86_init_rdrand(struct cpuinfo_x86 *c) { -#ifdef CONFIG_ARCH_RANDOM unsigned long tmp; int i; @@ -55,5 +55,5 @@ void x86_init_rdrand(struct cpuinfo_x86 *c) return; } } -#endif } +#endif -- cgit v1.2.3 From 4855531eb8582a98cb905d2baf86021254d7a675 Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Wed, 8 Jun 2016 14:59:53 +0800 Subject: x86/ioapic: Simplify ioapic_setup_resources() Optimize the function by removing the variable 'num'. Signed-off-by: Rui Wang Signed-off-by: Thomas Gleixner Cc: tony.luck@intel.com Cc: linux-pci@vger.kernel.org Cc: rjw@rjwysocki.net Cc: linux-acpi@vger.kernel.org Cc: bhelgaas@google.com Link: http://lkml.kernel.org/r/1465369193-4816-4-git-send-email-rui.y.wang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 446702ed99dc..e58729597a7a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2567,29 +2567,25 @@ static struct resource * __init ioapic_setup_resources(void) unsigned long n; struct resource *res; char *mem; - int i, num = 0; + int i; - for_each_ioapic(i) - num++; - if (num == 0) + if (nr_ioapics == 0) return NULL; n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); - n *= num; + n *= nr_ioapics; mem = alloc_bootmem(n); res = (void *)mem; - mem += sizeof(struct resource) * num; + mem += sizeof(struct resource) * nr_ioapics; - num = 0; for_each_ioapic(i) { - res[num].name = mem; - res[num].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); mem += IOAPIC_RESOURCE_NAME_SIZE; - ioapics[i].iomem_res = &res[num]; - num++; + ioapics[i].iomem_res = &res[i]; } ioapic_resources = res; -- cgit v1.2.3 From 3c8fad9183ab7b3b3471fd2bb3d604104dd447cb Mon Sep 17 00:00:00 2001 From: Claudio Fontana Date: Thu, 9 Jun 2016 12:31:58 +0200 Subject: x86/apic: Fix misspelled APIC Signed-off-by: Claudio Fontana Signed-off-by: Thomas Gleixner Cc: trivial@kernel.org Link: http://lkml.kernel.org/r/1465468318-19867-1-git-send-email-hw.claudio@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 60078a67d7e3..f943d2f453a4 100644 --- 
a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2045,7 +2045,7 @@ int generic_processor_info(int apicid, int version) int thiscpu = max + disabled_cpus - 1; pr_warning( - "ACPI: NR_CPUS/possible_cpus limit of %i almost" + "APIC: NR_CPUS/possible_cpus limit of %i almost" " reached. Keeping one slot for boot cpu." " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); @@ -2057,7 +2057,7 @@ int generic_processor_info(int apicid, int version) int thiscpu = max + disabled_cpus; pr_warning( - "ACPI: NR_CPUS/possible_cpus limit of %i reached." + "APIC: NR_CPUS/possible_cpus limit of %i reached." " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); disabled_cpus++; @@ -2085,7 +2085,7 @@ int generic_processor_info(int apicid, int version) if (topology_update_package_map(apicid, cpu) < 0) { int thiscpu = max + disabled_cpus; - pr_warning("ACPI: Package limit reached. Processor %d/0x%x ignored.\n", + pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n", thiscpu, apicid); disabled_cpus++; return -ENOSPC; -- cgit v1.2.3 From b2de43605410d1970dc9e0f349e399f1d561be13 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 27 May 2016 14:11:06 -0700 Subject: x86/mce: Do not use bank 1 for APEI generated error logs BIOS can report a memory error to Linux using ACPI/APEI mechanism. When it does this, we create a fictitious machine check error record and feed it into the standard mce_log() function. The error record needs a machine check bank number, and for some reason we chose "1" for this. But "1" is a valid bank number, and this causes confusion and heartburn among h/w folks who are concerned that a memory error signature was somehow logged in bank 1. Change to use "-1" (field is a "u8" so will typically print as 255). This should make it clearer that this error did not originate in a machine check bank. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Aristeu Rozanski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-edac Link: http://lkml.kernel.org/r/b7fffb2b326bc1dd150ffceb9919a803f9496e0e.1464805958.git.tony.luck@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce-apei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 34c89a3e8260..83f1a98d37db 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -46,7 +46,7 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) return; mce_setup(&m); - m.bank = 1; + m.bank = -1; /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; -- cgit v1.2.3 From 2348140d58f4f4245e9635ea8f1a77e940a4d877 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 13 Jun 2016 18:32:44 +0800 Subject: KVM: Fix steal clock warp during guest CPU hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sometimes, after CPU hotplug you can observe a spike in stolen time (100%) followed by the CPU being marked as 100% idle when it's actually busy with a CPU hog task. 
The trace looks like the following: cpuhp/1-12 [001] d.h1 167.461657: account_process_tick: steal = 1291385514, prev_steal_time = 0 cpuhp/1-12 [001] d.h1 167.461659: account_process_tick: steal_jiffies = 1291 -0 [001] d.h1 167.462663: account_process_tick: steal = 18732255, prev_steal_time = 1291000000 -0 [001] d.h1 167.462664: account_process_tick: steal_jiffies = 18446744072437 The sudden decrease of "steal" causes steal_jiffies to underflow. The root cause is kvm_steal_time being reset to 0 after hot-plugging back in a CPU. Instead, the preexisting value can be used, which is what the core scheduler code expects. John Stultz also reported a similar issue after guest S3. Suggested-by: Paolo Bonzini Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paolo Bonzini Cc: Frederic Weisbecker Cc: John Stultz Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1465813966-3116-2-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/kvm.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index eea2a6f72b31..1ef5e48b3a36 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -301,8 +301,6 @@ static void kvm_register_steal_time(void) if (!has_steal_clock) return; - memset(st, 0, sizeof(*st)); - wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); pr_info("kvm-stealtime: cpu %d, msr %llx\n", cpu, (unsigned long long) slow_virt_to_phys(st)); -- cgit v1.2.3 From 2c95afc1e83d93fac3be6923465e1753c2c53b0a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jun 2016 06:14:38 -0700 Subject: perf/x86/intel, watchdog: Switch NMI watchdog to ref cycles on x86 The NMI watchdog uses either the fixed cycles or a generic cycles counter. This causes a lot of conflicts with users of the PMU who want to run a full group including the cycles fixed counter, for example the --topdown support recently added to perf stat. The code needs to fall back to not use groups, which can cause measurement inaccuracy due to multiplexing errors. This patch switches the NMI watchdog to use reference cycles on Intel systems. This is actually more accurate than cycles, because cycles can tick faster than the measured CPU Frequency due to Turbo mode. The ref cycles always tick at their frequency, or slower when the system is idling. That means the NMI watchdog can never expire too early, unlike with cycles. The reference cycles tick roughly at the frequency of the TSC, so the same period computation can be used. 
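As a rough worked example of why reference cycles cannot expire early (illustrative numbers, not taken from the patch): with cpu_khz = 2,000,000 (a 2 GHz base clock) and watchdog_thresh = 10, hw_nmi_get_sample_period() returns 2,000,000 * 1000 * 10 = 2e10 events. Reference cycles advance at roughly the TSC rate, about 2e9 per second here, so the counter overflows after about 10 seconds whether or not the core is in Turbo. Counting plain cycles on the same machine at a 3 GHz turbo clock, the identical period would be consumed in roughly 6.7 seconds, which is exactly the early-expiry problem described above.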
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: http://lkml.kernel.org/r/1465478079-19993-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 8 ++++++++ include/linux/nmi.h | 1 + kernel/watchdog.c | 7 +++++++ 3 files changed, 16 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 7788ce643bf4..016f4263fad4 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -18,8 +18,16 @@ #include #include #include +#include #ifdef CONFIG_HARDLOCKUP_DETECTOR +int hw_nmi_get_event(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return PERF_COUNT_HW_REF_CPU_CYCLES; + return PERF_COUNT_HW_CPU_CYCLES; +} + u64 hw_nmi_get_sample_period(int watchdog_thresh) { return (u64)(cpu_khz) * 1000 * watchdog_thresh; diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 4630eeae18e0..79858af27209 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -66,6 +66,7 @@ static inline bool trigger_allbutself_cpu_backtrace(void) #ifdef CONFIG_LOCKUP_DETECTOR u64 hw_nmi_get_sample_period(int watchdog_thresh); +int hw_nmi_get_event(void); extern int nmi_watchdog_enabled; extern int soft_watchdog_enabled; extern int watchdog_user_enabled; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9acb29f280ec..8dd30fcd91be 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -315,6 +315,12 @@ static int is_softlockup(unsigned long touch_ts) #ifdef CONFIG_HARDLOCKUP_DETECTOR +/* Can be overriden by architecture */ +__weak int hw_nmi_get_event(void) +{ + return PERF_COUNT_HW_CPU_CYCLES; +} + static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -604,6 +610,7 @@ static int watchdog_nmi_enable(unsigned int cpu) wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + wd_attr->config = hw_nmi_get_event(); /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); -- cgit v1.2.3 From a4455082dc6f0b5d51a23523f77600e8ede47c79 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 8 Jun 2016 10:25:33 -0700 Subject: x86/signals: Add missing signal_compat code for x86 features The 32-bit siginfo is a different binary format than the 64-bit one. So, when running 32-bit binaries on 64-bit kernels, we have to convert the kernel's 64-bit version to a 32-bit version that userspace can grok. We've added a few features to siginfo over the past few years and neglected to add them to arch/x86/kernel/signal_compat.c: 1. The si_addr_lsb used in SIGBUS's sent for machine checks 2. The upper/lower bounds for MPX SIGSEGV faults 3. The protection key for pkey faults I caught this with some protection keys unit tests and realized it affected a few more features. This was tested only with my protection keys patch that looks for a proper value in si_pkey. I didn't actually test the machine check or MPX code. Signed-off-by: Dave Hansen Cc: Al Viro Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac@vger.kernel.org Link: http://lkml.kernel.org/r/20160608172533.F8F05637@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/compat.h | 11 +++++++++++ arch/x86/kernel/signal_compat.c | 15 +++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 5a3b2c119ed0..a18806165fe4 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -40,6 +40,7 @@ typedef s32 compat_long_t; typedef s64 __attribute__((aligned(4))) compat_s64; typedef u32 compat_uint_t; typedef u32 compat_ulong_t; +typedef u32 compat_u32; typedef u64 __attribute__((aligned(4))) compat_u64; typedef u32 compat_uptr_t; @@ -181,6 +182,16 @@ typedef struct compat_siginfo { /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ struct { unsigned int _addr; /* faulting insn/memory ref. */ + short int _addr_lsb; /* Valid LSB of the reported address. */ + union { + /* used when si_code=SEGV_BNDERR */ + struct { + compat_uptr_t _lower; + compat_uptr_t _upper; + } _addr_bnd; + /* used when si_code=SEGV_PKUERR */ + compat_u32 _pkey; + }; } _sigfault; /* SIGPOLL */ diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index dc3c0b1c816f..5335ad96a290 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -32,6 +32,21 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) &to->_sifields._pad[0]); switch (from->si_code >> 16) { case __SI_FAULT >> 16: + if (from->si_signo == SIGBUS && + (from->si_code == BUS_MCEERR_AR || + from->si_code == BUS_MCEERR_AO)) + put_user_ex(from->si_addr_lsb, &to->si_addr_lsb); + + if (from->si_signo == SIGSEGV) { + if (from->si_code == SEGV_BNDERR) { + compat_uptr_t lower = (unsigned long)&to->si_lower; + compat_uptr_t upper = (unsigned long)&to->si_upper; + put_user_ex(lower, &to->si_lower); + put_user_ex(upper, &to->si_upper); + } + if (from->si_code == SEGV_PKUERR) + put_user_ex(from->si_pkey, &to->si_pkey); + } break; case __SI_SYS >> 16: put_user_ex(from->si_syscall, &to->si_syscall); -- cgit v1.2.3 From 02e8fda2cc00419a11cf38199afea4c0d7172be8 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 8 Jun 2016 10:25:34 -0700 Subject: x86/signals: Add build-time checks to the siginfo compat code There were at least 3 features added to the __SI_FAULT area of the siginfo struct that did not make it to the compat siginfo: 1. The si_addr_lsb used in SIGBUS's sent for machine checks 2. The upper/lower bounds for MPX SIGSEGV faults 3. The protection key for pkey faults There was also some turmoil when I was attempting to add the pkey field because it needs to be a fixed size on 32 and 64-bit and not have any alignment constraints. This patch adds some compile-time checks to the compat code to make it harder to screw this up. Basically, the checks are supposed to trip any time someone changes the siginfo structure. That sounds bad, but it's what we want. If someone changes siginfo, we want them to also be _forced_ to go look at the compat code. The details are in the comments. Signed-off-by: Dave Hansen Cc: Al Viro Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac@vger.kernel.org Link: http://lkml.kernel.org/r/20160608172534.C73DAFC3@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_compat.c | 93 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 5335ad96a290..b44564bf86a8 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -1,11 +1,104 @@ #include #include +/* + * The compat_siginfo_t structure and handing code is very easy + * to break in several ways. It must always be updated when new + * updates are made to the main siginfo_t, and + * copy_siginfo_to_user32() must be updated when the + * (arch-independent) copy_siginfo_to_user() is updated. + * + * It is also easy to put a new member in the compat_siginfo_t + * which has implicit alignment which can move internal structure + * alignment around breaking the ABI. This can happen if you, + * for instance, put a plain 64-bit value in there. + */ +static inline void signal_compat_build_tests(void) +{ + int _sifields_offset = offsetof(compat_siginfo_t, _sifields); + + /* + * If adding a new si_code, there is probably new data in + * the siginfo. Make sure folks bumping the si_code + * limits also have to look at this code. Make sure any + * new fields are handled in copy_siginfo_to_user32()! + */ + BUILD_BUG_ON(NSIGILL != 8); + BUILD_BUG_ON(NSIGFPE != 8); + BUILD_BUG_ON(NSIGSEGV != 4); + BUILD_BUG_ON(NSIGBUS != 5); + BUILD_BUG_ON(NSIGTRAP != 4); + BUILD_BUG_ON(NSIGCHLD != 6); + BUILD_BUG_ON(NSIGSYS != 1); + + /* This is part of the ABI and can never change in size: */ + BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128); + /* + * The offsets of all the (unioned) si_fields are fixed + * in the ABI, of course. Make sure none of them ever + * move and are always at the beginning: + */ + BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields) != 3 * sizeof(int)); +#define CHECK_CSI_OFFSET(name) BUILD_BUG_ON(_sifields_offset != offsetof(compat_siginfo_t, _sifields.name)) + + /* + * Ensure that the size of each si_field never changes. + * If it does, it is a sign that the + * copy_siginfo_to_user32() code below needs to updated + * along with the size in the CHECK_SI_SIZE(). + * + * We repeat this check for both the generic and compat + * siginfos. + * + * Note: it is OK for these to grow as long as the whole + * structure stays within the padding size (checked + * above). 
+ */ +#define CHECK_CSI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((compat_siginfo_t *)0)->_sifields.name)) +#define CHECK_SI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((siginfo_t *)0)->_sifields.name)) + + CHECK_CSI_OFFSET(_kill); + CHECK_CSI_SIZE (_kill, 2*sizeof(int)); + CHECK_SI_SIZE (_kill, 2*sizeof(int)); + + CHECK_CSI_OFFSET(_timer); + CHECK_CSI_SIZE (_timer, 5*sizeof(int)); + CHECK_SI_SIZE (_timer, 6*sizeof(int)); + + CHECK_CSI_OFFSET(_rt); + CHECK_CSI_SIZE (_rt, 3*sizeof(int)); + CHECK_SI_SIZE (_rt, 4*sizeof(int)); + + CHECK_CSI_OFFSET(_sigchld); + CHECK_CSI_SIZE (_sigchld, 5*sizeof(int)); + CHECK_SI_SIZE (_sigchld, 8*sizeof(int)); + + CHECK_CSI_OFFSET(_sigchld_x32); + CHECK_CSI_SIZE (_sigchld_x32, 7*sizeof(int)); + /* no _sigchld_x32 in the generic siginfo_t */ + + CHECK_CSI_OFFSET(_sigfault); + CHECK_CSI_SIZE (_sigfault, 4*sizeof(int)); + CHECK_SI_SIZE (_sigfault, 8*sizeof(int)); + + CHECK_CSI_OFFSET(_sigpoll); + CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); + CHECK_SI_SIZE (_sigpoll, 4*sizeof(int)); + + CHECK_CSI_OFFSET(_sigsys); + CHECK_CSI_SIZE (_sigsys, 3*sizeof(int)); + CHECK_SI_SIZE (_sigsys, 4*sizeof(int)); + + /* any new si_fields should be added here */ +} + int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) { int err = 0; bool ia32 = test_thread_flag(TIF_IA32); + signal_compat_build_tests(); + if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) return -EFAULT; -- cgit v1.2.3 From a1141e0b5ca6ee3e5e35d5f1a310a5ecb9c96ce5 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 20 May 2016 10:47:05 -0700 Subject: x86/fpu/xstate: Define and use 'fpu_user_xstate_size' The kernel xstate area can be in standard or compacted format; it is always in standard format for user mode. When XSAVES is enabled, the kernel uses the compacted format and it is necessary to use a separate fpu_user_xstate_size for signal/ptrace frames. Signed-off-by: Fenghua Yu [ Rebased the patch and cleaned up the naming. ] Signed-off-by: Yu-cheng Yu Reviewed-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Ravi V. 
Shankar Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/8756ec34dabddfc727cda5743195eb81e8caf91c.1463760376.git.yu-cheng.yu@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 1 - arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/fpu/init.c | 5 ++- arch/x86/kernel/fpu/signal.c | 27 ++++++++++---- arch/x86/kernel/fpu/xstate.c | 76 ++++++++++++++++++++++++--------------- 5 files changed, 73 insertions(+), 37 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 38951b0fcc5a..16df2c44ac66 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -39,7 +39,6 @@ #define REX_PREFIX #endif -extern unsigned int xstate_size; extern u64 xfeatures_mask; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 62c6cc3cc5d3..0a16a16284f5 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -368,6 +368,7 @@ DECLARE_PER_CPU(struct irq_stack *, softirq_stack); #endif /* X86_64 */ extern unsigned int xstate_size; +extern unsigned int fpu_user_xstate_size; struct perf_event; diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index aacfd7a82cec..5b1928c0aad4 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -195,7 +195,7 @@ static void __init fpu__init_task_struct_size(void) } /* - * Set up the xstate_size based on the legacy FPU context size. + * Set up the user and kernel xstate_size based on the legacy FPU context size. * * We set this up first, and later it will be overwritten by * fpu__init_system_xstate() if the CPU knows about xstates. @@ -226,6 +226,9 @@ static void __init fpu__init_system_xstate_size_legacy(void) else xstate_size = sizeof(struct fregs_state); } + + fpu_user_xstate_size = xstate_size; + /* * Quirk: we don't yet handle the XSAVES* instructions * correctly, as we don't correctly convert between diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index c6f2a3cee2c2..0d29d4de4209 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -32,7 +32,7 @@ static inline int check_for_xstate(struct fxregs_state __user *buf, /* Check for the first magic field and other error scenarios. */ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > xstate_size || + fx_sw->xstate_size > fpu_user_xstate_size || fx_sw->xstate_size > fx_sw->extended_size) return -1; @@ -89,7 +89,8 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) if (!use_xsave()) return err; - err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); + err |= __put_user(FP_XSTATE_MAGIC2, + (__u32 *)(buf + fpu_user_xstate_size)); /* * Read the xfeatures which we copied (directly from the cpu or @@ -126,7 +127,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) else err = copy_fregs_to_user((struct fregs_state __user *) buf); - if (unlikely(err) && __clear_user(buf, xstate_size)) + if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size)) err = -EFAULT; return err; } @@ -176,8 +177,19 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if (ia32_fxstate) copy_fxregs_to_kernel(&tsk->thread.fpu); } else { + /* + * It is a *bug* if kernel uses compacted-format for xsave + * area and we copy it out directly to a signal frame. 
It + * should have been handled above by saving the registers + * directly. + */ + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + WARN_ONCE(1, "x86/fpu: saving compacted-format xsave area to a signal frame!\n"); + return -1; + } + fpstate_sanitize_xstate(&tsk->thread.fpu); - if (__copy_to_user(buf_fx, xsave, xstate_size)) + if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) return -1; } @@ -344,7 +356,8 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) static inline int xstate_sigframe_size(void) { - return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; + return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE : + fpu_user_xstate_size; } /* @@ -388,12 +401,12 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, */ void fpu__init_prepare_fx_sw_frame(void) { - int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; + int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE; fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; fx_sw_reserved.xfeatures = xfeatures_mask; - fx_sw_reserved.xstate_size = xstate_size; + fx_sw_reserved.xstate_size = fpu_user_xstate_size; if (config_enabled(CONFIG_IA32_EMULATION) || config_enabled(CONFIG_X86_32)) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 4ea2a59483c7..9c4da358ebb9 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -43,6 +43,13 @@ static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; +/* + * The XSAVE area of kernel can be in standard or compacted format; + * it is always in standard format for user mode. This is the user + * mode standard format size used for signal and ptrace frames. + */ +unsigned int fpu_user_xstate_size; + /* * Clear all of the X86_FEATURE_* bits that are unavailable * when the CPU has no XSAVE support. @@ -171,7 +178,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) */ while (xfeatures) { if (xfeatures & 0x1) { - int offset = xstate_offsets[feature_bit]; + int offset = xstate_comp_offsets[feature_bit]; int size = xstate_sizes[feature_bit]; memcpy((void *)fx + offset, @@ -533,8 +540,9 @@ static void do_extra_xstate_size_checks(void) XSTATE_WARN_ON(paranoid_xstate_size != xstate_size); } + /* - * Calculate total size of enabled xstates in XCR0/xfeatures_mask. + * Get total size of enabled xstates in XCR0/xfeatures_mask. * * Note the SDM's wording here. "sub-function 0" only enumerates * the size of the *user* states. If we use it to size a buffer @@ -544,34 +552,33 @@ static void do_extra_xstate_size_checks(void) * Note that we do not currently set any bits on IA32_XSS so * 'XCR0 | IA32_XSS == XCR0' for now. */ -static unsigned int __init calculate_xstate_size(void) +static unsigned int __init get_xsaves_size(void) { unsigned int eax, ebx, ecx, edx; - unsigned int calculated_xstate_size; + /* + * - CPUID function 0DH, sub-function 1: + * EBX enumerates the size (in bytes) required by + * the XSAVES instruction for an XSAVE area + * containing all the state components + * corresponding to bits currently set in + * XCR0 | IA32_XSS. 
+ */ + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + return ebx; +} - if (!boot_cpu_has(X86_FEATURE_XSAVES)) { - /* - * - CPUID function 0DH, sub-function 0: - * EBX enumerates the size (in bytes) required by - * the XSAVE instruction for an XSAVE area - * containing all the *user* state components - * corresponding to bits currently set in XCR0. - */ - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - calculated_xstate_size = ebx; - } else { - /* - * - CPUID function 0DH, sub-function 1: - * EBX enumerates the size (in bytes) required by - * the XSAVES instruction for an XSAVE area - * containing all the state components - * corresponding to bits currently set in - * XCR0 | IA32_XSS. - */ - cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); - calculated_xstate_size = ebx; - } - return calculated_xstate_size; +static unsigned int __init get_xsave_size(void) +{ + unsigned int eax, ebx, ecx, edx; + /* + * - CPUID function 0DH, sub-function 0: + * EBX enumerates the size (in bytes) required by + * the XSAVE instruction for an XSAVE area + * containing all the *user* state components + * corresponding to bits currently set in XCR0. + */ + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + return ebx; } /* @@ -591,7 +598,15 @@ static bool is_supported_xstate_size(unsigned int test_xstate_size) static int init_xstate_size(void) { /* Recompute the context size for enabled features: */ - unsigned int possible_xstate_size = calculate_xstate_size(); + unsigned int possible_xstate_size; + unsigned int xsave_size; + + xsave_size = get_xsave_size(); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + possible_xstate_size = get_xsaves_size(); + else + possible_xstate_size = xsave_size; /* Ensure we have the space to store all enabled: */ if (!is_supported_xstate_size(possible_xstate_size)) @@ -603,6 +618,11 @@ static int init_xstate_size(void) */ xstate_size = possible_xstate_size; do_extra_xstate_size_checks(); + + /* + * User space is always in standard format. + */ + fpu_user_xstate_size = xsave_size; return 0; } -- cgit v1.2.3 From bf15a8cf8d14879b785c548728415d36ccb6a33b Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 20 May 2016 10:47:06 -0700 Subject: x86/fpu/xstate: Rename 'xstate_size' to 'fpu_kernel_xstate_size', to distinguish it from 'fpu_user_xstate_size' User space uses standard format xsave area. fpstate in signal frame should have standard format size. To explicitly distinguish between xstate size in kernel space and the one in user space, we rename 'xstate_size' to 'fpu_kernel_xstate_size'. Cleanup only, no change in functionality. Signed-off-by: Fenghua Yu [ Rebased the patch and cleaned up the naming. ] Signed-off-by: Yu-cheng Yu Reviewed-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Ravi V. 
Shankar Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2ecbae347a5152d94be52adf7d0f3b7305d90d99.1463760376.git.yu-cheng.yu@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/fpu/core.c | 7 ++++--- arch/x86/kernel/fpu/init.c | 20 +++++++++++--------- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 8 ++++---- 5 files changed, 21 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0a16a16284f5..965c5d212c31 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -367,7 +367,7 @@ DECLARE_PER_CPU(struct irq_stack *, hardirq_stack); DECLARE_PER_CPU(struct irq_stack *, softirq_stack); #endif /* X86_64 */ -extern unsigned int xstate_size; +extern unsigned int fpu_kernel_xstate_size; extern unsigned int fpu_user_xstate_size; struct perf_event; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 7d564742e499..c759bd01ec99 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -227,7 +227,7 @@ void fpstate_init(union fpregs_state *state) return; } - memset(state, 0, xstate_size); + memset(state, 0, fpu_kernel_xstate_size); if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); @@ -252,7 +252,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * leak into the child task: */ if (use_eager_fpu()) - memset(&dst_fpu->state.xsave, 0, xstate_size); + memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); /* * Save current FPU registers directly into the child @@ -271,7 +271,8 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) */ preempt_disable(); if (!copy_fpregs_to_fpstate(dst_fpu)) { - memcpy(&src_fpu->state, &dst_fpu->state, xstate_size); + memcpy(&src_fpu->state, &dst_fpu->state, + fpu_kernel_xstate_size); if (use_eager_fpu()) copy_kernel_to_fpregs(&src_fpu->state); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 5b1928c0aad4..60f3839c5bfa 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -145,8 +145,8 @@ static void __init fpu__init_system_generic(void) * This is inherent to the XSAVE architecture which puts all state * components into a single, continuous memory block: */ -unsigned int xstate_size; -EXPORT_SYMBOL_GPL(xstate_size); +unsigned int fpu_kernel_xstate_size; +EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size); /* Get alignment of the TYPE. */ #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) @@ -178,7 +178,7 @@ static void __init fpu__init_task_struct_size(void) * Add back the dynamically-calculated register state * size. */ - task_size += xstate_size; + task_size += fpu_kernel_xstate_size; /* * We dynamically size 'struct fpu', so we require that @@ -195,7 +195,7 @@ static void __init fpu__init_task_struct_size(void) } /* - * Set up the user and kernel xstate_size based on the legacy FPU context size. + * Set up the user and kernel xstate sizes based on the legacy FPU context size. * * We set this up first, and later it will be overwritten by * fpu__init_system_xstate() if the CPU knows about xstates. @@ -208,7 +208,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) on_boot_cpu = 0; /* - * Note that xstate_size might be overwriten later during + * Note that xstate sizes might be overwritten later during * fpu__init_system_xstate(). 
*/ @@ -219,15 +219,17 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - xstate_size = sizeof(struct swregs_state); + fpu_kernel_xstate_size = sizeof(struct swregs_state); } else { if (boot_cpu_has(X86_FEATURE_FXSR)) - xstate_size = sizeof(struct fxregs_state); + fpu_kernel_xstate_size = + sizeof(struct fxregs_state); else - xstate_size = sizeof(struct fregs_state); + fpu_kernel_xstate_size = + sizeof(struct fregs_state); } - fpu_user_xstate_size = xstate_size; + fpu_user_xstate_size = fpu_kernel_xstate_size; /* * Quirk: we don't yet handle the XSAVES* instructions diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 0d29d4de4209..06d80f62c03f 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -263,7 +263,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) int ia32_fxstate = (buf != buf_fx); struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; - int state_size = xstate_size; + int state_size = fpu_kernel_xstate_size; u64 xfeatures = 0; int fx_only = 0; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9c4da358ebb9..46abfafe61c8 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -537,7 +537,7 @@ static void do_extra_xstate_size_checks(void) */ paranoid_xstate_size += xfeature_size(i); } - XSTATE_WARN_ON(paranoid_xstate_size != xstate_size); + XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size); } @@ -616,7 +616,7 @@ static int init_xstate_size(void) * The size is OK, we are definitely going to use xsave, * make it known to the world that we need more space. */ - xstate_size = possible_xstate_size; + fpu_kernel_xstate_size = possible_xstate_size; do_extra_xstate_size_checks(); /* @@ -679,14 +679,14 @@ void __init fpu__init_system_xstate(void) return; } - update_regset_xstate_info(xstate_size, xfeatures_mask); + update_regset_xstate_info(fpu_kernel_xstate_size, xfeatures_mask); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", xfeatures_mask, - xstate_size, + fpu_kernel_xstate_size, boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); } -- cgit v1.2.3 From 7d9370607d28afd454775c623d5447603473a3c3 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 20 May 2016 10:47:07 -0700 Subject: x86/fpu/xstate: Keep init_fpstate.xsave.header.xfeatures as zero for init optimization Keep init_fpstate.xsave.header.xfeatures as zero for init optimization. This is important for init optimization that is implemented in processor. If a bit corresponding to an xstate in xstate_bv is 0, it means the xstate is in init status and will not be read from memory to the processor during XRSTOR/XRSTORS instruction. This largely impacts context switch performance. Signed-off-by: Fenghua Yu Signed-off-by: Yu-cheng Yu Reviewed-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Ravi V. 
Shankar Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2fb4ec7f18b76e8cda057a8c0038def74a9b8044.1463760376.git.yu-cheng.yu@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 46abfafe61c8..dbfef1b7be7c 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -329,13 +329,11 @@ static void __init setup_init_fpu_buf(void) setup_xstate_features(); print_xstate_features(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) { + if (boot_cpu_has(X86_FEATURE_XSAVES)) init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; - init_fpstate.xsave.header.xfeatures = xfeatures_mask; - } /* - * Init all the features state with header_bv being 0x0 + * Init all the features state with header.xfeatures being 0x0 */ copy_kernel_to_xregs_booting(&init_fpstate.xsave); -- cgit v1.2.3 From 99aa22d0d8f70d9317727ab40c85b2ead740a6ca Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Fri, 20 May 2016 10:47:08 -0700 Subject: x86/fpu/xstate: Copy xstate registers directly to the signal frame when compacted format is in use XSAVES is a kernel instruction and uses a compacted format. When working with user space, the kernel should provide standard-format, non-supervisor state data. We cannot do __copy_to_user() from a compacted-format kernel xstate area to a signal frame. Dave Hansen proposes this method to simplify copy xstate directly to user. This patch is based on an earlier patch from Fenghua Yu Originally-from: Fenghua Yu Signed-off-by: Yu-cheng Yu Reviewed-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Ravi V. Shankar Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c36f419d525517d04209a28dd8e1e5af9000036e.1463760376.git.yu-cheng.yu@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 1 + arch/x86/kernel/fpu/signal.c | 3 ++- arch/x86/kernel/fpu/xstate.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 16df2c44ac66..d812cf361282 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -47,5 +47,6 @@ extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); +int using_compacted_format(void); #endif diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 06d80f62c03f..8aa96cbb5dfb 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -169,7 +170,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpregs_active()) { + if (fpregs_active() || using_compacted_format()) { /* Save the live register state to the user directly. 
*/ if (copy_fpregs_to_sigframe(buf_fx)) return -1; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index dbfef1b7be7c..0b01f003df8b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -420,7 +420,7 @@ static int xfeature_size(int xfeature_nr) * that it is obvious which aspect of 'XSAVES' is being handled * by the calling code. */ -static int using_compacted_format(void) +int using_compacted_format(void) { return boot_cpu_has(X86_FEATURE_XSAVES); } -- cgit v1.2.3 From dbf984d825935f61965bcfacfd8e8dfdaf3e8051 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 25 Jun 2016 13:24:57 +0200 Subject: x86/boot/64: Add forgotten end of function marker Add secondary_startup_64()'s ENDPROC() marker. No functionality change. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160625112457.16930-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 5df831ef1442..c7920ba69563 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -299,6 +299,7 @@ ENTRY(secondary_startup_64) pushq $__KERNEL_CS # set correct cs pushq %rax # target address in negative space lretq +ENDPROC(secondary_startup_64) #include "verify_cpu.S" -- cgit v1.2.3 From 1ead852dd88779eda12cb09cc894a03d9abfe1ec Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 16 Jun 2016 19:13:49 +0200 Subject: x86/amd_nb: Fix boot crash on non-AMD systems Fix boot crash that triggers if this driver is built into a kernel and run on non-AMD systems. AMD northbridges users call amd_cache_northbridges() and it returns a negative value to signal that we weren't able to cache/detect any northbridges on the system. At least, it should do so as all its callers expect it to do so. But it does return a negative value only when kmalloc() fails. Fix it to return -ENODEV if there are no NBs cached as otherwise, amd_nb users like amd64_edac, for example, which relies on it to know whether it should load or not, gets loaded on systems like Intel Xeons where it shouldn't. Reported-and-tested-by: Tony Battersby Signed-off-by: Borislav Petkov Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1466097230-5333-2-git-send-email-bp@alien8.de Link: https://lkml.kernel.org/r/5761BEB0.9000807@cybernetics.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_nb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index a147e676fc7b..e991d5c8bb3a 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -71,8 +71,8 @@ int amd_cache_northbridges(void) while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL) i++; - if (i == 0) - return 0; + if (!i) + return -ENODEV; nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL); if (!nb) -- cgit v1.2.3 From 06ee6d571f0e350253a8fc3492316b2be007fae2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 17:39:24 +0900 Subject: genirq: Add affinity hint to irq allocation Add an extra argument to the irq(domain) allocation functions, so we can hand down affinity hints to the allocator. 
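As a rough illustration of how a caller could use the new argument, here is a hypothetical driver-side sketch (the helper name and mask are invented; only the trailing const struct cpumask * parameter of __irq_alloc_descs() comes from this patch):

#include <linux/cpumask.h>
#include <linux/irq.h>
#include <linux/module.h>

/*
 * Hypothetical sketch: allocate one irq descriptor for a queue and hand
 * the allocator a hint about which CPUs the queue's interrupt will
 * target, so the descriptor can be placed on a suitable node with a
 * sensible default affinity. Passing NULL keeps the old behaviour.
 */
static int example_alloc_queue_irq(int node, const struct cpumask *queue_cpus)
{
	/* irq = -1: any free number; from = 0; cnt = 1 descriptor */
	return __irq_alloc_descs(-1, 0, 1, node, THIS_MODULE, queue_cpus);
}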
Thats necessary to implement proper support for multiqueue devices. Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: axboe@fb.com Cc: agordeev@redhat.com Link: http://lkml.kernel.org/r/1467621574-8277-4-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- arch/sparc/kernel/irq_64.c | 2 +- arch/x86/kernel/apic/io_apic.c | 5 +++-- include/linux/irq.h | 4 ++-- include/linux/irqdomain.h | 9 ++++++--- kernel/irq/ipi.c | 2 +- kernel/irq/irqdesc.c | 12 ++++++++---- kernel/irq/irqdomain.c | 22 ++++++++++++++-------- kernel/irq/manage.c | 7 ++++--- kernel/irq/msi.c | 3 ++- 9 files changed, 41 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index e22416ce56ea..34a7930b76ef 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -242,7 +242,7 @@ unsigned int irq_alloc(unsigned int dev_handle, unsigned int dev_ino) { int irq; - irq = __irq_alloc_descs(-1, 1, 1, numa_node_id(), NULL); + irq = __irq_alloc_descs(-1, 1, 1, numa_node_id(), NULL, NULL); if (irq <= 0) goto out; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 446702ed99dc..7c4f90dd4c2a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -981,7 +981,7 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, return __irq_domain_alloc_irqs(domain, irq, 1, ioapic_alloc_attr_node(info), - info, legacy); + info, legacy, NULL); } /* @@ -1014,7 +1014,8 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, info->ioapic_pin)) return -ENOMEM; } else { - irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true); + irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, + NULL); if (irq >= 0) { irq_data = irq_domain_get_irq_data(domain, irq); data = irq_data->chip_data; diff --git a/include/linux/irq.h b/include/linux/irq.h index f6074813688d..39ce46ac5c18 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -708,11 +708,11 @@ static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d) unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, - struct module *owner); + struct module *owner, const struct cpumask *affinity); /* use macros to avoid needing export.h for THIS_MODULE */ #define irq_alloc_descs(irq, from, cnt, node) \ - __irq_alloc_descs(irq, from, cnt, node, THIS_MODULE) + __irq_alloc_descs(irq, from, cnt, node, THIS_MODULE, NULL) #define irq_alloc_desc(node) \ irq_alloc_descs(-1, 0, 1, node) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index f1f36e04d885..1aee0fbe900e 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -39,6 +39,7 @@ struct irq_domain; struct of_device_id; struct irq_chip; struct irq_data; +struct cpumask; /* Number of irqs reserved for a legacy isa controller */ #define NUM_ISA_INTERRUPTS 16 @@ -217,7 +218,8 @@ extern struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token); extern void irq_set_default_host(struct irq_domain *host); extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, - irq_hw_number_t hwirq, int node); + irq_hw_number_t hwirq, int node, + const struct cpumask *affinity); static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) { @@ -389,7 +391,7 @@ 
static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *par extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, - bool realloc); + bool realloc, const struct cpumask *affinity); extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); extern void irq_domain_activate_irq(struct irq_data *irq_data); extern void irq_domain_deactivate_irq(struct irq_data *irq_data); @@ -397,7 +399,8 @@ extern void irq_domain_deactivate_irq(struct irq_data *irq_data); static inline int irq_domain_alloc_irqs(struct irq_domain *domain, unsigned int nr_irqs, int node, void *arg) { - return __irq_domain_alloc_irqs(domain, -1, nr_irqs, node, arg, false); + return __irq_domain_alloc_irqs(domain, -1, nr_irqs, node, arg, false, + NULL); } extern int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 89b49f6773f0..4fd23510d5f2 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -76,7 +76,7 @@ int irq_reserve_ipi(struct irq_domain *domain, } } - virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); + virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE, NULL); if (virq <= 0) { pr_warn("Can't reserve IPI, failed to alloc descs\n"); return -ENOMEM; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 8731e1c5d1e7..b8df4fcdbb5f 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -223,7 +223,7 @@ static void free_desc(unsigned int irq) } static int alloc_descs(unsigned int start, unsigned int cnt, int node, - struct module *owner) + const struct cpumask *affinity, struct module *owner) { struct irq_desc *desc; int i; @@ -333,6 +333,7 @@ static void free_desc(unsigned int irq) } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, + const struct cpumask *affinity, struct module *owner) { u32 i; @@ -453,12 +454,15 @@ EXPORT_SYMBOL_GPL(irq_free_descs); * @cnt: Number of consecutive irqs to allocate. 
* @node: Preferred node on which the irq descriptor should be allocated * @owner: Owning module (can be NULL) + * @affinity: Optional pointer to an affinity mask which hints where the + * irq descriptors should be allocated and which default + * affinities to use * * Returns the first irq number or error code */ int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, - struct module *owner) + struct module *owner, const struct cpumask *affinity) { int start, ret; @@ -494,7 +498,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, bitmap_set(allocated_irqs, start, cnt); mutex_unlock(&sparse_irq_lock); - return alloc_descs(start, cnt, node, owner); + return alloc_descs(start, cnt, node, affinity, owner); err: mutex_unlock(&sparse_irq_lock); @@ -512,7 +516,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs); */ unsigned int irq_alloc_hwirqs(int cnt, int node) { - int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL); + int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL); if (irq < 0) return 0; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8798b6c9e945..79459b732dc9 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -481,7 +481,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Allocate a virtual interrupt number */ - virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node)); + virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; @@ -835,19 +835,23 @@ const struct irq_domain_ops irq_domain_simple_ops = { EXPORT_SYMBOL_GPL(irq_domain_simple_ops); int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, - int node) + int node, const struct cpumask *affinity) { unsigned int hint; if (virq >= 0) { - virq = irq_alloc_descs(virq, virq, cnt, node); + virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE, + affinity); } else { hint = hwirq % nr_irqs; if (hint == 0) hint++; - virq = irq_alloc_descs_from(hint, cnt, node); - if (virq <= 0 && hint > 1) - virq = irq_alloc_descs_from(1, cnt, node); + virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE, + affinity); + if (virq <= 0 && hint > 1) { + virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE, + affinity); + } } return virq; @@ -1160,6 +1164,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, * @node: NUMA node id for memory allocation * @arg: domain specific argument * @realloc: IRQ descriptors have already been allocated if true + * @affinity: Optional irq affinity mask for multiqueue devices * * Allocate IRQ numbers and initialized all data structures to support * hierarchy IRQ domains. 
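For the hierarchy-domain path the idea is the same; a minimal hypothetical sketch of a caller of the reworked allocator (the wrapper below is invented for illustration, only the trailing affinity parameter is added by this patch):

#include <linux/cpumask.h>
#include <linux/irqdomain.h>

/*
 * Hypothetical sketch: allocate nr_vecs interrupts from an irq domain
 * and pass an optional affinity hint down to descriptor allocation.
 * A NULL mask means "no hint" and preserves the previous behaviour.
 */
static int example_domain_alloc(struct irq_domain *domain, int nr_vecs,
				int node, void *arg,
				const struct cpumask *affinity)
{
	return __irq_domain_alloc_irqs(domain, -1, nr_vecs, node, arg,
				       false, affinity);
}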
@@ -1175,7 +1180,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, */ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, - bool realloc) + bool realloc, const struct cpumask *affinity) { int i, ret, virq; @@ -1193,7 +1198,8 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, if (realloc && irq_base >= 0) { virq = irq_base; } else { - virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node); + virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node, + affinity); if (virq < 0) { pr_debug("cannot allocate IRQ(base %d, count %d)\n", irq_base, nr_irqs); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 30658e9827f0..ad0aac6d1248 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -353,10 +353,11 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) return 0; /* - * Preserve an userspace affinity setup, but make sure that - * one of the targets is online. + * Preserve the managed affinity setting and an userspace affinity + * setup, but make sure that one of the targets is online. */ - if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { + if (irqd_affinity_is_managed(&desc->irq_data) || + irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { if (cpumask_intersects(desc->irq_common_data.affinity, cpu_online_mask)) set = desc->irq_common_data.affinity; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index eb5bf2b50b07..58dbbacc6fbb 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -334,7 +334,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, ops->set_desc(&arg, desc); virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, - dev_to_node(dev), &arg, false); + dev_to_node(dev), &arg, false, + NULL); if (virq < 0) { ret = -ENOSPC; if (ops->handle_error) -- cgit v1.2.3 From 920a4a70c55058a9997f2e35bf41503acf87c301 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:16 +0000 Subject: timers, x86/apic/uv: Initialize the UV heartbeat timer as pinned Pinned timers must carry the pinned attribute in the timer structure itself, so convert the code to the new API. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094341.133837204@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 29003154fafd..7a50519e6afc 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -919,7 +919,7 @@ static void uv_heartbeat(unsigned long ignored) uv_set_scir_bits(bits); /* enable next timer period */ - mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL); + mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); } static void uv_heartbeat_enable(int cpu) @@ -928,7 +928,7 @@ static void uv_heartbeat_enable(int cpu) struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); - setup_timer(timer, uv_heartbeat, cpu); + setup_pinned_timer(timer, uv_heartbeat, cpu); timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; add_timer_on(timer, cpu); uv_cpu_scir_info(cpu)->enabled = 1; -- cgit v1.2.3 From f9c287ba3861714a1959accf14e815c44291bec4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:17 +0000 Subject: timers, x86/mce: Initialize MCE restart timer as pinned Pinned timers must carry the pinned attribute in the timer structure itself, so convert the code to the new API. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094341.215783439@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 92e5e37d97bf..b80a6361a9e1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1309,7 +1309,7 @@ static void __restart_timer(struct timer_list *t, unsigned long interval) if (timer_pending(t)) { if (time_before(when, t->expires)) - mod_timer_pinned(t, when); + mod_timer(t, when); } else { t->expires = round_jiffies(when); add_timer_on(t, smp_processor_id()); @@ -1735,7 +1735,7 @@ static void __mcheck_cpu_init_timer(void) struct timer_list *t = this_cpu_ptr(&mce_timer); unsigned int cpu = smp_processor_id(); - setup_timer(t, mce_timer_fn, cpu); + setup_pinned_timer(t, mce_timer_fn, cpu); mce_start_timer(cpu, t); } -- cgit v1.2.3 From 955d1427a91b18f53e082bd7c19c40ce13b0a0f4 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Fri, 8 Jul 2016 11:09:38 +0200 Subject: x86/mce/AMD: Increase size of the bank_map type Change bank_map type from 'char' to 'int' since we now have more than eight banks in a system. Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Aravind Gopalakrishnan Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1467968983-4874-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 10b0661651e0..7b7f3be783d4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -93,7 +93,7 @@ const char * const amd_df_mcablock_names[] = { EXPORT_SYMBOL_GPL(amd_df_mcablock_names); static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); -static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ +static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ static void amd_threshold_interrupt(void); static void amd_deferred_error_interrupt(void); -- cgit v1.2.3 From 38c54ccb2ded3e93d8a353baeb7b9e12e1b77e23 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 8 Jul 2016 11:09:41 +0200 Subject: x86/mce: Fix mce_rdmsrl() warning message The MSR address we're dumping in there should be in hex, otherwise we get funsies like: [ 0.016000] WARNING: CPU: 1 PID: 0 at arch/x86/kernel/cpu/mcheck/mce.c:428 mce_rdmsrl+0xd9/0xe0 [ 0.016000] mce: Unable to read msr -1073733631! ^^^^^^^^^^^ Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1467968983-4874-5-git-send-email-bp@alien8.de [ Fixed capitalization of 'MSR'. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 92e5e37d97bf..58af6300992d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -425,7 +425,7 @@ static u64 mce_rdmsrl(u32 msr) } if (rdmsrl_safe(msr, &v)) { - WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); + WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr); /* * Return zero in case the access faulted. This should * not happen normally but can happen if the CPU does -- cgit v1.2.3 From ef16dd0c2a523d2e3975bb1bea9f5727e3e7146f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 5 Jul 2016 00:31:25 +0200 Subject: x86/dumpstack: Honor supplied @regs arg The comment suggests that show_stack(NULL, NULL) should backtrace the current context, but the code doesn't match the comment. If regs are given, start the "Stack:" hexdump at regs->sp. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1467671487-10344-2-git-send-email-bp@alien8.de Link: http://lkml.kernel.org/r/efcd79bf4106d61f1cd258c2caa87f3a0618eeac.1466036668.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 4 +++- arch/x86/kernel/dumpstack_64.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index fef917e79b9d..948d77da3881 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -96,7 +96,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, int i; if (sp == NULL) { - if (task) + if (regs) + sp = (unsigned long *)regs->sp; + else if (task) sp = (unsigned long *)task->thread.sp; else sp = (unsigned long *)&sp; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index d558a8a49016..a81e1ef73bf2 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -264,7 +264,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, * back trace for this cpu: */ if (sp == NULL) { - if (task) + if (regs) + sp = (unsigned long *)regs->sp; + else if (task) sp = (unsigned long *)task->thread.sp; else sp = (unsigned long *)&sp; -- cgit v1.2.3 From 81c2949f7fdcf8ff681326669afde24962232670 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 5 Jul 2016 00:31:27 +0200 Subject: x86/dumpstack: Add show_stack_regs() and use it Add a helper to dump supplied pt_regs and use it in the MSR exception handling code to have precise stack traces pointing to the actual function causing the MSR access exception and not the stack frame of the exception handler itself. The new output looks like this: unchecked MSR access error: RDMSR from 0xdeadbeef at rIP: 0xffffffff8102ddb6 (early_init_intel+0x16/0x3a0) 00000000756e6547 ffffffff81c03f68 ffffffff81dd0940 ffffffff81c03f10 ffffffff81d42e65 0000000001000000 ffffffff81c03f58 ffffffff81d3e5a3 0000800000000000 ffffffff81800080 ffffffffffffffff 0000000000000000 Call Trace: [] early_cpu_init+0xe7/0x136 [] setup_arch+0xa5/0x9df [] start_kernel+0x9f/0x43a [] x86_64_start_reservations+0x2f/0x31 [] x86_64_start_kernel+0x168/0x176 Signed-off-by: Borislav Petkov Reviewed-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1467671487-10344-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kdebug.h | 1 + arch/x86/kernel/dumpstack.c | 5 +++++ arch/x86/mm/extable.c | 13 ++++++++----- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index e5f5dc9787d5..1ef9d581b5d9 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_trace(struct task_struct *t, struct pt_regs *regs, unsigned long *sp, unsigned long bp); +extern void show_stack_regs(struct pt_regs *regs); extern void __show_regs(struct pt_regs *regs, int all); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ef8017ca5ba9..d66e5ac823b2 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -197,6 +197,11 @@ void show_stack(struct task_struct *task, unsigned long *sp) show_stack_log_lvl(task, NULL, sp, bp, ""); } +void show_stack_regs(struct pt_regs *regs) +{ + show_stack_log_lvl(current, regs, (unsigned long *)regs->sp, regs->bp, ""); +} + static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 4bb53b89f3c5..fafc771568c7 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,6 +1,7 @@ #include #include #include +#include typedef bool (*ex_handler_t)(const struct exception_table_entry *, struct pt_regs *, int); @@ -46,8 +47,9 @@ EXPORT_SYMBOL(ex_handler_ext); bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { - WARN_ONCE(1, "unchecked MSR access error: RDMSR from 0x%x\n", - (unsigned int)regs->cx); + if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip)) + show_stack_regs(regs); /* Pretend that the read succeeded and returned 0. */ regs->ip = ex_fixup_addr(fixup); @@ -60,9 +62,10 @@ EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { - WARN_ONCE(1, "unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x)\n", - (unsigned int)regs->cx, - (unsigned int)regs->dx, (unsigned int)regs->ax); + if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, + (unsigned int)regs->ax, regs->ip, (void *)regs->ip)) + show_stack_regs(regs); /* Pretend that the write succeeded. */ regs->ip = ex_fixup_addr(fixup); -- cgit v1.2.3 From 0483e1fa6e09d4948272680f691dccb1edb9677f Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 21 Jun 2016 17:47:02 -0700 Subject: x86/mm: Implement ASLR for kernel memory regions Randomizes the virtual address space of kernel memory regions for x86_64. This first patch adds the infrastructure and does not randomize any region. The following patches will randomize the physical memory mapping, vmalloc and vmemmap regions. 
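To make the scheme concrete, here is a small user-space toy model of the randomization step described above (all constants, region sizes and the PRNG are made up for illustration and are not the kernel's):

#include <stdio.h>
#include <stdlib.h>

#define TB_SHIFT	40
#define PUD_SHIFT	30
#define PUD_SIZE	(1UL << PUD_SHIFT)
#define PUD_MASK	(~(PUD_SIZE - 1))

int main(void)
{
	/* Illustrative numbers only: 100 TB of virtual space, three regions */
	unsigned long vaddr_start = 0xffff880000000000UL;
	unsigned long vaddr_end   = vaddr_start + (100UL << TB_SHIFT);
	unsigned long size_tb[]   = { 64, 8, 1 };	/* e.g. physmap, vmalloc, vmemmap */
	unsigned long vaddr = vaddr_start;
	unsigned long remain = vaddr_end - vaddr_start;
	int i, nr = 3;

	/* Entropy = whatever space is left once the regions themselves fit */
	for (i = 0; i < nr; i++)
		remain -= size_tb[i] << TB_SHIFT;

	for (i = 0; i < nr; i++) {
		/* Spread the remaining slack evenly over the remaining regions */
		unsigned long entropy = remain / (nr - i);
		unsigned long offset = (random() % (entropy + 1)) & PUD_MASK;

		vaddr += offset;
		printf("region %d base: 0x%lx\n", i, vaddr);

		/* Jump over the region and keep bases PUD aligned */
		vaddr += size_tb[i] << TB_SHIFT;
		vaddr = (vaddr + PUD_SIZE) & PUD_MASK;
		remain -= offset;
	}
	return 0;
}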
This security feature mitigates exploits relying on predictable kernel addresses. These addresses can be used to disclose the kernel modules base addresses or corrupt specific structures to elevate privileges bypassing the current implementation of KASLR. This feature can be enabled with the CONFIG_RANDOMIZE_MEMORY option. The order of each memory region is not changed. The feature looks at the available space for the regions based on different configuration options and randomizes the base and space between each. The size of the physical memory mapping is the available physical memory. No performance impact was detected while testing the feature. Entropy is generated using the KASLR early boot functions now shared in the lib directory (originally written by Kees Cook). Randomization is done on PGD & PUD page table levels to increase possible addresses. The physical memory mapping code was adapted to support PUD level virtual addresses. This implementation on the best configuration provides 30,000 possible virtual addresses in average for each memory region. An additional low memory page is used to ensure each CPU can start with a PGD aligned virtual address (for realmode). x86/dump_pagetable was updated to correctly display each region. Updated documentation on x86_64 memory layout accordingly. Performance data, after all patches in the series: Kernbench shows almost no difference (-+ less than 1%): Before: Average Optimal load -j 12 Run (std deviation): Elapsed Time 102.63 (1.2695) User Time 1034.89 (1.18115) System Time 87.056 (0.456416) Percent CPU 1092.9 (13.892) Context Switches 199805 (3455.33) Sleeps 97907.8 (900.636) After: Average Optimal load -j 12 Run (std deviation): Elapsed Time 102.489 (1.10636) User Time 1034.86 (1.36053) System Time 87.764 (0.49345) Percent CPU 1095 (12.7715) Context Switches 199036 (4298.1) Sleeps 97681.6 (1031.11) Hackbench shows 0% difference on average (hackbench 90 repeated 10 times): attemp,before,after 1,0.076,0.069 2,0.072,0.069 3,0.066,0.066 4,0.066,0.068 5,0.066,0.067 6,0.066,0.069 7,0.067,0.066 8,0.063,0.067 9,0.067,0.065 10,0.068,0.071 average,0.0677,0.0677 Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Cc: Alexander Kuleshov Cc: Alexander Popov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Baoquan He Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Christian Borntraeger Cc: Dan Williams Cc: Dave Hansen Cc: Dave Young Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Jan Beulich Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kirill A. 
Shutemov Cc: Linus Torvalds Cc: Lv Zheng Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Fleming Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Cc: Toshi Kani Cc: Xiao Guangrong Cc: Yinghai Lu Cc: kernel-hardening@lists.openwall.com Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/1466556426-32664-6-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/mm.txt | 4 ++ arch/x86/Kconfig | 17 +++++ arch/x86/include/asm/kaslr.h | 6 ++ arch/x86/include/asm/pgtable.h | 7 +- arch/x86/kernel/setup.c | 3 + arch/x86/mm/Makefile | 1 + arch/x86/mm/dump_pagetables.c | 16 +++-- arch/x86/mm/init.c | 1 + arch/x86/mm/kaslr.c | 152 ++++++++++++++++++++++++++++++++++++++++ 9 files changed, 202 insertions(+), 5 deletions(-) create mode 100644 arch/x86/mm/kaslr.c (limited to 'arch/x86/kernel') diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 5aa738346062..8c7dd5957ae1 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -39,4 +39,8 @@ memory window (this size is arbitrary, it can be raised later if needed). The mappings are not part of any other kernel PGD and are only available during EFI runtime calls. +Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all +physical memory, vmalloc/ioremap space and virtual memory map are randomized. +Their order is preserved but their base will be offset early at boot time. + -Andi Kleen, Jul 2004 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 930fe88095d3..9719b8eb38d3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1993,6 +1993,23 @@ config PHYSICAL_ALIGN Don't change this unless you know what you are doing. +config RANDOMIZE_MEMORY + bool "Randomize the kernel memory sections" + depends on X86_64 + depends on RANDOMIZE_BASE + default RANDOMIZE_BASE + ---help--- + Randomizes the base virtual address of kernel memory sections + (physical memory mapping, vmalloc & vmemmap). This security feature + makes exploits relying on predictable memory locations less reliable. + + The order of allocations remains unchanged. Entropy is generated in + the same way as RANDOMIZE_BASE. Current implementation in the optimal + configuration have in average 30,000 different possible virtual + addresses for each memory section. + + If unsure, say N. + config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" depends on SMP diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h index 5547438db5ea..683c9d736314 100644 --- a/arch/x86/include/asm/kaslr.h +++ b/arch/x86/include/asm/kaslr.h @@ -3,4 +3,10 @@ unsigned long kaslr_get_random_long(const char *purpose); +#ifdef CONFIG_RANDOMIZE_MEMORY +void kernel_randomize_memory(void); +#else +static inline void kernel_randomize_memory(void) { } +#endif /* CONFIG_RANDOMIZE_MEMORY */ + #endif diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index d455bef39e9c..5472682a307f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -732,11 +732,16 @@ void early_alloc_pgt_buf(void); #ifdef CONFIG_X86_64 /* Realmode trampoline initialization. 
*/ extern pgd_t trampoline_pgd_entry; -static inline void __meminit init_trampoline(void) +static inline void __meminit init_trampoline_default(void) { /* Default trampoline pgd value */ trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)]; } +# ifdef CONFIG_RANDOMIZE_MEMORY +void __meminit init_trampoline(void); +# else +# define init_trampoline init_trampoline_default +# endif #else static inline void init_trampoline(void) { } #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c4e7b3991b60..a2616584b6e9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -113,6 +113,7 @@ #include #include #include +#include /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -942,6 +943,8 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + kernel_randomize_memory(); + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 62c0043a5fd5..96d2b847e09e 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -37,4 +37,5 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MPX) += mpx.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o +obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 99bfb192803f..9a17250bcbe0 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -72,9 +72,9 @@ static struct addr_marker address_markers[] = { { 0, "User Space" }, #ifdef CONFIG_X86_64 { 0x8000000000000000UL, "Kernel Space" }, - { PAGE_OFFSET, "Low Kernel Mapping" }, - { VMALLOC_START, "vmalloc() Area" }, - { VMEMMAP_START, "Vmemmap" }, + { 0/* PAGE_OFFSET */, "Low Kernel Mapping" }, + { 0/* VMALLOC_START */, "vmalloc() Area" }, + { 0/* VMEMMAP_START */, "Vmemmap" }, # ifdef CONFIG_X86_ESPFIX64 { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, # endif @@ -434,8 +434,16 @@ void ptdump_walk_pgd_level_checkwx(void) static int __init pt_dump_init(void) { + /* + * Various markers are not compile-time constants, so assign them + * here. + */ +#ifdef CONFIG_X86_64 + address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET; + address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; + address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START; +#endif #ifdef CONFIG_X86_32 - /* Not a compile-time constant on x86-32 */ address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; # ifdef CONFIG_HIGHMEM diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 4252acdfcbbd..cc82830bc8c4 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -17,6 +17,7 @@ #include #include /* for MAX_DMA_PFN */ #include +#include /* * We need to define the tracepoints somewhere, and tlb.c diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c new file mode 100644 index 000000000000..d5380a48e8fb --- /dev/null +++ b/arch/x86/mm/kaslr.c @@ -0,0 +1,152 @@ +/* + * This file implements KASLR memory randomization for x86_64. It randomizes + * the virtual address space of kernel memory regions (physical memory + * mapping, vmalloc & vmemmap) for x86_64. This security feature mitigates + * exploits relying on predictable kernel addresses. + * + * Entropy is generated using the KASLR early boot functions now shared in + * the lib directory (originally written by Kees Cook). Randomization is + * done on PGD & PUD page table levels to increase possible addresses. 
The + * physical memory mapping code was adapted to support PUD level virtual + * addresses. This implementation on the best configuration provides 30,000 + * possible virtual addresses in average for each memory region. An additional + * low memory page is used to ensure each CPU can start with a PGD aligned + * virtual address (for realmode). + * + * The order of each memory region is not changed. The feature looks at + * the available space for the regions based on different configuration + * options and randomizes the base and space between each. The size of the + * physical memory mapping is the available physical memory. + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include "mm_internal.h" + +#define TB_SHIFT 40 + +/* + * Virtual address start and end range for randomization. The end changes base + * on configuration to have the highest amount of space for randomization. + * It increases the possible random position for each randomized region. + * + * You need to add an if/def entry if you introduce a new memory region + * compatible with KASLR. Your entry must be in logical order with memory + * layout. For example, ESPFIX is before EFI because its virtual address is + * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to + * ensure that this order is correct and won't be changed. + */ +static const unsigned long vaddr_start; +static const unsigned long vaddr_end; + +/* + * Memory regions randomized by KASLR (except modules that use a separate logic + * earlier during boot). The list is ordered based on virtual addresses. This + * order is kept after randomization. + */ +static __initdata struct kaslr_memory_region { + unsigned long *base; + unsigned long size_tb; +} kaslr_regions[] = { +}; + +/* Get size in bytes used by the memory region */ +static inline unsigned long get_padding(struct kaslr_memory_region *region) +{ + return (region->size_tb << TB_SHIFT); +} + +/* + * Apply no randomization if KASLR was disabled at boot or if KASAN + * is enabled. KASAN shadow mappings rely on regions being PGD aligned. + */ +static inline bool kaslr_memory_enabled(void) +{ + return kaslr_enabled() && !config_enabled(CONFIG_KASAN); +} + +/* Initialize base and padding for each memory region randomized with KASLR */ +void __init kernel_randomize_memory(void) +{ + size_t i; + unsigned long vaddr = vaddr_start; + unsigned long rand; + struct rnd_state rand_state; + unsigned long remain_entropy; + + if (!kaslr_memory_enabled()) + return; + + /* Calculate entropy available between regions */ + remain_entropy = vaddr_end - vaddr_start; + for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) + remain_entropy -= get_padding(&kaslr_regions[i]); + + prandom_seed_state(&rand_state, kaslr_get_random_long("Memory")); + + for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) { + unsigned long entropy; + + /* + * Select a random virtual address using the extra entropy + * available. + */ + entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); + prandom_bytes_state(&rand_state, &rand, sizeof(rand)); + entropy = (rand % (entropy + 1)) & PUD_MASK; + vaddr += entropy; + *kaslr_regions[i].base = vaddr; + + /* + * Jump the region and add a minimum padding based on + * randomization alignment. + */ + vaddr += get_padding(&kaslr_regions[i]); + vaddr = round_up(vaddr + 1, PUD_SIZE); + remain_entropy -= entropy; + } +} + +/* + * Create PGD aligned trampoline table to allow real mode initialization + * of additional CPUs. Consume only 1 low memory page. 
+ */ +void __meminit init_trampoline(void) +{ + unsigned long paddr, paddr_next; + pgd_t *pgd; + pud_t *pud_page, *pud_page_tramp; + int i; + + if (!kaslr_memory_enabled()) { + init_trampoline_default(); + return; + } + + pud_page_tramp = alloc_low_page(); + + paddr = 0; + pgd = pgd_offset_k((unsigned long)__va(paddr)); + pud_page = (pud_t *) pgd_page_vaddr(*pgd); + + for (i = pud_index(paddr); i < PTRS_PER_PUD; i++, paddr = paddr_next) { + pud_t *pud, *pud_tramp; + unsigned long vaddr = (unsigned long)__va(paddr); + + pud_tramp = pud_page_tramp + pud_index(paddr); + pud = pud_page + pud_index(vaddr); + paddr_next = (paddr & PUD_MASK) + PUD_SIZE; + + *pud_tramp = *pud; + } + + set_pgd(&trampoline_pgd_entry, + __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); +} -- cgit v1.2.3 From 021182e52fe01c1f7b126f97fd6ba048dc4234fd Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 21 Jun 2016 17:47:03 -0700 Subject: x86/mm: Enable KASLR for physical mapping memory regions Add the physical mapping in the list of randomized memory regions. The physical memory mapping holds most allocations from boot and heap allocators. Knowing the base address and physical memory size, an attacker can deduce the PDE virtual address for the vDSO memory page. This attack was demonstrated at CanSecWest 2016, in the following presentation: "Getting Physical: Extreme Abuse of Intel Based Paged Systems": https://github.com/n3k/CansecWest2016_Getting_Physical_Extreme_Abuse_of_Intel_Based_Paging_Systems/blob/master/Presentation/CanSec2016_Presentation.pdf (See second part of the presentation). The exploits used against Linux worked successfully against 4.6+ but fail with KASLR memory enabled: https://github.com/n3k/CansecWest2016_Getting_Physical_Extreme_Abuse_of_Intel_Based_Paging_Systems/tree/master/Demos/Linux/exploits Similar research was done at Google leading to this patch proposal. Variants exists to overwrite /proc or /sys objects ACLs leading to elevation of privileges. These variants were tested against 4.6+. The page offset used by the compressed kernel retains the static value since it is not yet randomized during this boot stage. Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Cc: Alexander Kuleshov Cc: Alexander Popov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Baoquan He Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Christian Borntraeger Cc: Dan Williams Cc: Dave Hansen Cc: Dave Young Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Jan Beulich Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kirill A. 
Shutemov Cc: Linus Torvalds Cc: Lv Zheng Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Fleming Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Cc: Toshi Kani Cc: Xiao Guangrong Cc: Yinghai Lu Cc: kernel-hardening@lists.openwall.com Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/1466556426-32664-7-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/pagetable.c | 3 +++ arch/x86/include/asm/kaslr.h | 2 ++ arch/x86/include/asm/page_64_types.h | 11 ++++++++++- arch/x86/kernel/head_64.S | 2 +- arch/x86/mm/kaslr.c | 18 +++++++++++++++--- 5 files changed, 31 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 6e31a6aac4d3..56589d0a804b 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -20,6 +20,9 @@ /* These actually do the work of building the kernel identity maps. */ #include #include +/* Use the static base for this part of the boot process */ +#undef __PAGE_OFFSET +#define __PAGE_OFFSET __PAGE_OFFSET_BASE #include "../../mm/ident_map.c" /* Used by pgtable.h asm code to force instruction serialization. */ diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h index 683c9d736314..62b1b815a83a 100644 --- a/arch/x86/include/asm/kaslr.h +++ b/arch/x86/include/asm/kaslr.h @@ -4,6 +4,8 @@ unsigned long kaslr_get_random_long(const char *purpose); #ifdef CONFIG_RANDOMIZE_MEMORY +extern unsigned long page_offset_base; + void kernel_randomize_memory(void); #else static inline void kernel_randomize_memory(void) { } diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index d5c2f8b40faa..9215e0527647 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -1,6 +1,10 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H +#ifndef __ASSEMBLY__ +#include +#endif + #ifdef CONFIG_KASAN #define KASAN_STACK_ORDER 1 #else @@ -32,7 +36,12 @@ * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's * what Xen requires. */ -#define __PAGE_OFFSET _AC(0xffff880000000000, UL) +#define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL) +#ifdef CONFIG_RANDOMIZE_MEMORY +#define __PAGE_OFFSET page_offset_base +#else +#define __PAGE_OFFSET __PAGE_OFFSET_BASE +#endif /* CONFIG_RANDOMIZE_MEMORY */ #define __START_KERNEL_map _AC(0xffffffff80000000, UL) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index c7920ba69563..9f8efc9f0075 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -38,7 +38,7 @@ #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET) +L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) L4_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index d5380a48e8fb..609ecf2b37ed 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -43,8 +43,12 @@ * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to * ensure that this order is correct and won't be changed. 
*/ -static const unsigned long vaddr_start; -static const unsigned long vaddr_end; +static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; +static const unsigned long vaddr_end = VMALLOC_START; + +/* Default values */ +unsigned long page_offset_base = __PAGE_OFFSET_BASE; +EXPORT_SYMBOL(page_offset_base); /* * Memory regions randomized by KASLR (except modules that use a separate logic @@ -55,6 +59,7 @@ static __initdata struct kaslr_memory_region { unsigned long *base; unsigned long size_tb; } kaslr_regions[] = { + { &page_offset_base, 64/* Maximum */ }, }; /* Get size in bytes used by the memory region */ @@ -77,13 +82,20 @@ void __init kernel_randomize_memory(void) { size_t i; unsigned long vaddr = vaddr_start; - unsigned long rand; + unsigned long rand, memory_tb; struct rnd_state rand_state; unsigned long remain_entropy; if (!kaslr_memory_enabled()) return; + BUG_ON(kaslr_regions[0].base != &page_offset_base); + memory_tb = ((max_pfn << PAGE_SHIFT) >> TB_SHIFT); + + /* Adapt phyiscal memory region size based on available memory */ + if (memory_tb < kaslr_regions[0].size_tb) + kaslr_regions[0].size_tb = memory_tb; + /* Calculate entropy available between regions */ remain_entropy = vaddr_end - vaddr_start; for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) -- cgit v1.2.3 From fc5f3ac24720012909c224a63ca3217f4759967d Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 17 Jun 2016 01:22:43 -0400 Subject: Revert "x86/tsc: Add missing Cherrytrail frequency to the table" This reverts commit: e2724e9d9692 ("x86/tsc: Add missing Cherrytrail frequency to the table") ... as it is incomplete, and is replaced by a more complete patch later in this series. Signed-off-by: Len Brown Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/2199d0e959f7f71a18827268b5d060f8d3831639.1466138954.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_msr.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 9911a0620f9a..6aa0f4d9eea6 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -23,7 +23,6 @@ #include /* CPU reference clock frequency: in KHz */ -#define FREQ_80 80000 #define FREQ_83 83200 #define FREQ_100 99840 #define FREQ_133 133200 @@ -57,8 +56,6 @@ static struct freq_desc freq_desc_tables[] = { { 6, 0x37, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } }, /* ANN */ { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } }, - /* AIRMONT */ - { 6, 0x4c, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, FREQ_80, 0, 0, 0 } }, }; static int match_cpu(u8 family, u8 model) -- cgit v1.2.3 From ba8268330dc18d309a39175ea4d2c5d86c2cef09 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 17 Jun 2016 01:22:44 -0400 Subject: x86/tsc_msr: Identify Intel-specific code try_msr_calibrate_tsc() is currently Intel-specific, and should not execute on any other vendor's parts. 
Signed-off-by: Len Brown Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1fe23c052826bdcfeb3d45045aa02246078cb5a7.1466138954.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_msr.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 6aa0f4d9eea6..4ec5e560ed73 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -86,6 +86,9 @@ unsigned long try_msr_calibrate_tsc(void) unsigned long res; int cpu_index; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model); if (cpu_index < 0) return 0; -- cgit v1.2.3 From 14bb4e34860af48ef1ea0f52b11611ce4db987fe Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 17 Jun 2016 01:22:45 -0400 Subject: x86/tsc_msr: Remove debugging messages Debugging messages are not necessary after all of the possible hardware failures that never occur. Instead, this code can simply return 0. This code also doesn't need to print in the success case. tsc_init() already prints the TSC frequency, and apic=debug is available if anybody really is interested in printing the LAPIC frequency. Signed-off-by: Len Brown Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/cf03279a125b95dfa9b8d3d5b4a66de09cd04050.1466138954.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_msr.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 4ec5e560ed73..f7ba44b89cc4 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -76,9 +76,10 @@ static int match_cpu(u8 family, u8 model) (freq_desc_tables[cpu_index].freqs[freq_id]) /* - * Do MSR calibration only for known/supported CPUs. + * MSR-based CPU/TSC frequency discovery for certain CPUs. * - * Returns the calibration value or 0 if MSR calibration failed. + * Set global "lapic_timer_frequency" to bus_clock_cycles/jiffy + * Return processor base frequency in KHz, or 0 on failure. */ unsigned long try_msr_calibrate_tsc(void) { @@ -100,31 +101,17 @@ unsigned long try_msr_calibrate_tsc(void) rdmsr(MSR_IA32_PERF_STATUS, lo, hi); ratio = (hi >> 8) & 0x1f; } - pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio); - - if (!ratio) - goto fail; /* Get FSB FREQ ID */ rdmsr(MSR_FSB_FREQ, lo, hi); freq_id = lo & 0x7; freq = id_to_freq(cpu_index, freq_id); - pr_info("Resolved frequency ID: %u, frequency: %u KHz\n", - freq_id, freq); - if (!freq) - goto fail; /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ res = freq * ratio; - pr_info("TSC runs at %lu KHz\n", res); #ifdef CONFIG_X86_LOCAL_APIC lapic_timer_frequency = (freq * 1000) / HZ; - pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency); #endif return res; - -fail: - pr_warn("Fast TSC calibration using MSR failed\n"); - return 0; } -- cgit v1.2.3 From 9e0cae9f6227f946fb0076b6a68c88156137f618 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 17 Jun 2016 01:22:46 -0400 Subject: x86/tsc_msr: Update comments, expand definitions Syntax only, no functional change. 
Signed-off-by: Len Brown Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/8653a2dba21fef122fc7b29eafb750e2004d3976.1466138954.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_msr.c | 36 ++++++++++-------------------------- 1 file changed, 10 insertions(+), 26 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index f7ba44b89cc4..4110f723fd0f 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -1,14 +1,5 @@ /* - * tsc_msr.c - MSR based TSC calibration on Intel Atom SoC platforms. - * - * TSC in Intel Atom SoC runs at a constant rate which can be figured - * by this formula: - * * - * See Intel 64 and IA-32 System Programming Guid section 16.12 and 30.11.5 - * for details. - * Especially some Intel Atom SoCs don't have PIT(i8254) or HPET, so MSR - * based calibration is the only option. - * + * tsc_msr.c - TSC frequency enumeration via MSR * * Copyright (C) 2013 Intel Corporation * Author: Bin Gao @@ -22,17 +13,10 @@ #include #include -/* CPU reference clock frequency: in KHz */ -#define FREQ_83 83200 -#define FREQ_100 99840 -#define FREQ_133 133200 -#define FREQ_166 166400 - #define MAX_NUM_FREQS 8 /* - * According to Intel 64 and IA-32 System Programming Guide, - * if MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be + * If MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40]. * Unfortunately some Intel Atom SoCs aren't quite compliant to this, * so we need manually differentiate SoC families. This is what the @@ -47,15 +31,15 @@ struct freq_desc { static struct freq_desc freq_desc_tables[] = { /* PNW */ - { 6, 0x27, 0, { 0, 0, 0, 0, 0, FREQ_100, 0, FREQ_83 } }, + { 6, 0x27, 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } }, /* CLV+ */ - { 6, 0x35, 0, { 0, FREQ_133, 0, 0, 0, FREQ_100, 0, FREQ_83 } }, - /* TNG */ - { 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } }, - /* VLV2 */ - { 6, 0x37, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } }, - /* ANN */ - { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } }, + { 6, 0x35, 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } }, + /* TNG - Intel Atom processor Z3400 series */ + { 6, 0x4a, 1, { 0, 99840, 133200, 0, 0, 0, 0, 0 } }, + /* VLV2 - Intel Atom processor E3000, Z3600, Z3700 series */ + { 6, 0x37, 1, { 83200, 99840, 133200, 166400, 0, 0, 0, 0 } }, + /* ANN - Intel Atom processor Z3500 series */ + { 6, 0x5a, 1, { 83200, 99840, 133200, 99840, 0, 0, 0, 0 } }, }; static int match_cpu(u8 family, u8 model) -- cgit v1.2.3 From 05680e7fa8a4e700e031a5e72cd8c18265f0031a Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 17 Jun 2016 01:22:47 -0400 Subject: x86/tsc_msr: Correct Silvermont reference clock values Atom processors use a 19.2 MHz crystal oscillator. Early processors generate 100 MHz via 19.2 MHz * 26 / 5 = 99.84 MHz. Later preocessor generate 100 MHz via 19.2 MHz * 125 / 24 = 100 MHz. Update the Silvermont-based tables accordingly, matching the Software Developers Manual. Also, correct a 166 MHz entry that should have been 116 MHz, and add a missing 80 MHz entry. 
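As a quick sanity check of the clock arithmetic quoted above, here is a minimal user-space C sketch (an illustration only, not part of the patch, assuming a hosted C environment; the multiplier/divider pairs are the ones named in the commit text):

#include <stdio.h>

int main(void)
{
	const unsigned int xtal_khz = 19200;	/* 19.2 MHz crystal, in kHz */

	/* Early Atom parts: 19.2 MHz * 26 / 5 = 99.84 MHz -> 99840 kHz */
	printf("early 100 MHz bus clock: %u kHz\n", xtal_khz * 26 / 5);

	/* Silvermont and later: 19.2 MHz * 125 / 24 = 100 MHz -> 100000 kHz */
	printf("later 100 MHz bus clock: %u kHz\n", xtal_khz * 125 / 24);

	return 0;
}

This reproduces exactly the 99840 vs. 100000 kHz values being swapped in the table entries below.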
Reported-by: Stephane Gasparini Signed-off-by: Len Brown Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/5d7561655dfb066ff10801b423405bae4d1cfbe2.1466138954.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_msr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 4110f723fd0f..20487e2382c6 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -35,11 +35,11 @@ static struct freq_desc freq_desc_tables[] = { /* CLV+ */ { 6, 0x35, 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } }, /* TNG - Intel Atom processor Z3400 series */ - { 6, 0x4a, 1, { 0, 99840, 133200, 0, 0, 0, 0, 0 } }, + { 6, 0x4a, 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } }, /* VLV2 - Intel Atom processor E3000, Z3600, Z3700 series */ - { 6, 0x37, 1, { 83200, 99840, 133200, 166400, 0, 0, 0, 0 } }, + { 6, 0x37, 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } }, /* ANN - Intel Atom processor Z3500 series */ - { 6, 0x5a, 1, { 83200, 99840, 133200, 99840, 0, 0, 0, 0 } }, + { 6, 0x5a, 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } }, }; static int match_cpu(u8 family, u8 model) -- cgit v1.2.3 From 6fcb41cdaee5056c96de88ee095bddd27a7697de Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 17 Jun 2016 01:22:48 -0400 Subject: x86/tsc_msr: Add Airmont reference clock values per the Intel 64 and IA-32 Architecture Software Developer's Manual... Add the reference clock for Intel Atom Processors Based on the Airmont Microarchitecture. Reported-by: Stephane Gasparini Signed-off-by: Len Brown Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/abc6a0f4b18281410da1a3f26e2819d8e03e144f.1466138954.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_msr.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 20487e2382c6..65b3d8cb8325 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -13,7 +13,7 @@ #include #include -#define MAX_NUM_FREQS 8 +#define MAX_NUM_FREQS 9 /* * If MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be @@ -40,6 +40,9 @@ static struct freq_desc freq_desc_tables[] = { { 6, 0x37, 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } }, /* ANN - Intel Atom processor Z3500 series */ { 6, 0x5a, 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } }, + /* AMT - Intel Atom processor X7-Z8000 and X5-Z8000 series */ + { 6, 0x4c, 1, { 83300, 100000, 133300, 116700, + 80000, 93300, 90000, 88900, 87500 } }, }; static int match_cpu(u8 family, u8 model) -- cgit v1.2.3 From 03482e08a87d24e5c8c23e6981c482e832cf3bdc Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Fri, 17 Jun 2016 13:07:15 -0700 Subject: x86/fpu/xstate: Align xstate components according to CPUID CPUID function 0x0d, sub function (i, i > 1) returns in ecx[1] the alignment requirement of component 'i' when the compacted format is used. If ecx[1] is 0, component 'i' is located immediately following the preceding component. If ecx[1] is 1, component 'i' is located on the next 64-byte boundary following the preceding component. Signed-off-by: Yu-cheng Yu Reviewed-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Ravi V. Shankar Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/331e2bef1a0a7a584f06adde095b6bbfbe166472.1466179491.git.yu-cheng.yu@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 60 +++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0b01f003df8b..7963029cb4ad 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -269,6 +269,33 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_PKRU); } +/* + * This check is important because it is easy to get XSTATE_* + * confused with XSTATE_BIT_*. + */ +#define CHECK_XFEATURE(nr) do { \ + WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ + WARN_ON(nr >= XFEATURE_MAX); \ +} while (0) + +/* + * We could cache this like xstate_size[], but we only use + * it here, so it would be a waste of space. + */ +static int xfeature_is_aligned(int xfeature_nr) +{ + u32 eax, ebx, ecx, edx; + + CHECK_XFEATURE(xfeature_nr); + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + /* + * The value returned by ECX[1] indicates the alignment + * of state component 'i' when the compacted format + * of the extended region of an XSAVE area is used: + */ + return !!(ecx & 2); +} + /* * This function sets up offsets and sizes of all extended states in * xsave area. This supports both standard format and compacted format @@ -306,10 +333,14 @@ static void __init setup_xstate_comp(void) else xstate_comp_sizes[i] = 0; - if (i > FIRST_EXTENDED_XFEATURE) + if (i > FIRST_EXTENDED_XFEATURE) { xstate_comp_offsets[i] = xstate_comp_offsets[i-1] + xstate_comp_sizes[i-1]; + if (xfeature_is_aligned(i)) + xstate_comp_offsets[i] = + ALIGN(xstate_comp_offsets[i], 64); + } } } @@ -366,33 +397,6 @@ static int xfeature_is_user(int xfeature_nr) } */ -/* - * This check is important because it is easy to get XSTATE_* - * confused with XSTATE_BIT_*. - */ -#define CHECK_XFEATURE(nr) do { \ - WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ - WARN_ON(nr >= XFEATURE_MAX); \ -} while (0) - -/* - * We could cache this like xstate_size[], but we only use - * it here, so it would be a waste of space. - */ -static int xfeature_is_aligned(int xfeature_nr) -{ - u32 eax, ebx, ecx, edx; - - CHECK_XFEATURE(xfeature_nr); - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); - /* - * The value returned by ECX[1] indicates the alignment - * of state component i when the compacted format - * of the extended region of an XSAVE area is used - */ - return !!(ecx & 2); -} - static int xfeature_uncompacted_offset(int xfeature_nr) { u32 eax, ebx, ecx, edx; -- cgit v1.2.3 From 1499ce2dd45afddea2e84f9f920890cf88384c4e Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Fri, 17 Jun 2016 13:07:16 -0700 Subject: x86/fpu/xstate: Fix supervisor xstate component offset CPUID function 0x0d, sub function (i, i > 1) returns in ebx the offset of xstate component i. Zero is returned for a supervisor state. A supervisor state can only be saved by XSAVES and XSAVES uses a compacted format. There is no fixed offset for a supervisor state. This patch checks and makes sure a supervisor state offset is not recorded or mis-used. This has no effect in practice as we currently use no supervisor states, but it would be good to fix. 
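To make the CPUID interface described above concrete, here is a minimal user-space sketch (an illustration only, not part of the patch, assuming a compiler that provides __get_cpuid_count() in <cpuid.h>). Per the commit text, for leaf 0x0d sub-leaf i > 1, EAX is the component size, EBX its standard-format offset (zero for a supervisor state), ECX[0] flags a supervisor state and ECX[1] a 64-byte alignment requirement:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int i;

	for (i = 2; i < 19; i++) {
		if (!__get_cpuid_count(0x0d, i, &eax, &ebx, &ecx, &edx))
			break;		/* CPUID leaf 0x0d not available */
		if (!eax)
			continue;	/* component not supported, nothing reported */
		printf("xstate %2u: size %4u, offset %4u, %s, %s\n",
		       i, eax, ebx,
		       (ecx & 1) ? "supervisor" : "user",
		       (ecx & 2) ? "64-byte aligned" : "packed");
	}
	return 0;
}

Any supervisor component enumerated this way reports an offset of 0, which is exactly why the patch stops recording EBX for supervisor states.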
Signed-off-by: Yu-cheng Yu Reviewed-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Ravi V. Shankar Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/81b29e40d35d4cec9f2511a856fe769f34935a3f.1466179491.git.yu-cheng.yu@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 1 + arch/x86/include/asm/fpu/xstate.h | 3 ++ arch/x86/kernel/fpu/xstate.c | 62 ++++++++++++++++++++++++--------------- 3 files changed, 43 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 36b90bbfc69f..12dd648735b6 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -122,6 +122,7 @@ enum xfeature { #define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK) #define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256) #define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) +#define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR) #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index d812cf361282..92f376ccc999 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -18,6 +18,9 @@ #define XSAVE_YMM_SIZE 256 #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) +/* Supervisor features */ +#define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT) + /* Supported features which support lazy state saving */ #define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 7963029cb4ad..02786fb7a1e8 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -112,6 +112,27 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); +static int xfeature_is_supervisor(int xfeature_nr) +{ + /* + * We currently do not support supervisor states, but if + * we did, we could find out like this. + * + * SDM says: If state component 'i' is a user state component, + * ECX[0] return 0; if state component i is a supervisor + * state component, ECX[0] returns 1. + */ + u32 eax, ebx, ecx, edx; + + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + return !!(ecx & 1); +} + +static int xfeature_is_user(int xfeature_nr) +{ + return !xfeature_is_supervisor(xfeature_nr); +} + /* * When executing XSAVEOPT (or other optimized XSAVE instructions), if * a processor implementation detects that an FPU state component is still @@ -230,7 +251,14 @@ static void __init setup_xstate_features(void) continue; cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); - xstate_offsets[i] = ebx; + + /* + * If an xfeature is supervisor state, the offset + * in EBX is invalid. We leave it to -1. + */ + if (xfeature_is_user(i)) + xstate_offsets[i] = ebx; + xstate_sizes[i] = eax; /* * In our xstate size checks, we assume that the @@ -375,32 +403,20 @@ static void __init setup_init_fpu_buf(void) copy_xregs_to_kernel_booting(&init_fpstate.xsave); } -static int xfeature_is_supervisor(int xfeature_nr) -{ - /* - * We currently do not support supervisor states, but if - * we did, we could find out like this. 
- * - * SDM says: If state component i is a user state component, - * ECX[0] return 0; if state component i is a supervisor - * state component, ECX[0] returns 1. - u32 eax, ebx, ecx, edx; - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx; - return !!(ecx & 1); - */ - return 0; -} -/* -static int xfeature_is_user(int xfeature_nr) -{ - return !xfeature_is_supervisor(xfeature_nr); -} -*/ - static int xfeature_uncompacted_offset(int xfeature_nr) { u32 eax, ebx, ecx, edx; + /* + * Only XSAVES supports supervisor states and it uses compacted + * format. Checking a supervisor state's uncompacted offset is + * an error. + */ + if (XFEATURE_MASK_SUPERVISOR & (1 << xfeature_nr)) { + WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr); + return -1; + } + CHECK_XFEATURE(xfeature_nr); cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); return ebx; -- cgit v1.2.3 From 91c3dba7dbc199191272f4a9863f86ea3bfd679f Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Fri, 17 Jun 2016 13:07:17 -0700 Subject: x86/fpu/xstate: Fix PTRACE frames for XSAVES XSAVES uses compacted format and is a kernel instruction. The kernel should use standard-format, non-supervisor state data for PTRACE. Signed-off-by: Yu-cheng Yu [ Edited away artificial linebreaks. ] Reviewed-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Ravi V. Shankar Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/de3d80949001305fe389799973b675cab055c457.1466179491.git.yu-cheng.yu@intel.com [ Made various readability edits. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 5 +- arch/x86/kernel/fpu/regset.c | 52 +++++++---- arch/x86/kernel/fpu/xstate.c | 183 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 216 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 92f376ccc999..ae55a43e09c0 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -51,5 +51,8 @@ void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); - +int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, + void __user *ubuf, struct xregs_state *xsave); +int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, + struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 81422dfb152b..c114b132d121 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -4,6 +4,7 @@ #include #include #include +#include /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, @@ -85,21 +86,26 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; - fpu__activate_fpstate_read(fpu); - xsave = &fpu->state.xsave; - /* - * Copy the 48bytes defined by the software first into the xstate - * memory layout in the thread struct, so that we can copy the entire - * xstateregs to the user using one user_regset_copyout(). - */ - memcpy(&xsave->i387.sw_reserved, - xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); - /* - * Copy the xstate memory layout. 
- */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + fpu__activate_fpstate_read(fpu); + + if (using_compacted_format()) { + ret = copyout_from_xsaves(pos, count, kbuf, ubuf, xsave); + } else { + fpstate_sanitize_xstate(fpu); + /* + * Copy the 48 bytes defined by the software into the xsave + * area in the thread struct, so that we can copy the whole + * area to user using one user_regset_copyout(). + */ + memcpy(&xsave->i387.sw_reserved, xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); + + /* + * Copy the xstate memory layout. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + } return ret; } @@ -114,11 +120,27 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; - fpu__activate_fpstate_write(fpu); + /* + * A whole standard-format XSAVE buffer is needed: + */ + if ((pos != 0) || (count < fpu_user_xstate_size)) + return -EFAULT; xsave = &fpu->state.xsave; - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + fpu__activate_fpstate_write(fpu); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + ret = copyin_to_xsaves(kbuf, ubuf, xsave); + else + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + + /* + * In case of failure, mark all states as init: + */ + if (ret) + fpstate_init(&fpu->state); + /* * mxcsr reserved bits must be masked to zero for security reasons. */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 02786fb7a1e8..56c0e707af21 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -697,7 +698,12 @@ void __init fpu__init_system_xstate(void) return; } - update_regset_xstate_info(fpu_kernel_xstate_size, xfeatures_mask); + /* + * Update info used for ptrace frames; use standard-format size and no + * supervisor xstates: + */ + update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR); + fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp(); @@ -925,16 +931,16 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, if (!boot_cpu_has(X86_FEATURE_OSPKE)) return -EINVAL; - /* Set the bits we need in PKRU */ + /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) new_pkru_bits |= PKRU_AD_BIT; if (init_val & PKEY_DISABLE_WRITE) new_pkru_bits |= PKRU_WD_BIT; - /* Shift the bits in to the correct place in PKRU for pkey. 
*/ + /* Shift the bits in to the correct place in PKRU for pkey: */ new_pkru_bits <<= pkey_shift; - /* Locate old copy of the state in the xsave buffer */ + /* Locate old copy of the state in the xsave buffer: */ old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU); /* @@ -947,9 +953,10 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, else new_pkru_state.pkru = old_pkru_state->pkru; - /* mask off any old bits in place */ + /* Mask off any old bits in place: */ new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); - /* Set the newly-requested bits */ + + /* Set the newly-requested bits: */ new_pkru_state.pkru |= new_pkru_bits; /* @@ -963,8 +970,168 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, */ new_pkru_state.pad = 0; - fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, - sizeof(new_pkru_state)); + fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, sizeof(new_pkru_state)); + + return 0; +} + +/* + * This is similar to user_regset_copyout(), but will not add offset to + * the source data pointer or increment pos, count, kbuf, and ubuf. + */ +static inline int xstate_copyout(unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf, + const void *data, const int start_pos, + const int end_pos) +{ + if ((count == 0) || (pos < start_pos)) + return 0; + + if (end_pos < 0 || pos < end_pos) { + unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); + + if (kbuf) { + memcpy(kbuf + pos, data, copy); + } else { + if (__copy_to_user(ubuf + pos, data, copy)) + return -EFAULT; + } + } + return 0; +} + +/* + * Convert from kernel XSAVES compacted format to standard format and copy + * to a ptrace buffer. It supports partial copy but pos always starts from + * zero. This is called from xstateregs_get() and there we check the CPU + * has XSAVES. + */ +int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, + void __user *ubuf, struct xregs_state *xsave) +{ + unsigned int offset, size; + int ret, i; + struct xstate_header header; + + /* + * Currently copy_regset_to_user() starts from pos 0: + */ + if (unlikely(pos != 0)) + return -EFAULT; + + /* + * The destination is a ptrace buffer; we put in only user xstates: + */ + memset(&header, 0, sizeof(header)); + header.xfeatures = xsave->header.xfeatures; + header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + + /* + * Copy xregs_state->header: + */ + offset = offsetof(struct xregs_state, header); + size = sizeof(header); + + ret = xstate_copyout(offset, size, kbuf, ubuf, &header, 0, count); + + if (ret) + return ret; + + for (i = 0; i < XFEATURE_MAX; i++) { + /* + * Copy only in-use xstates: + */ + if ((header.xfeatures >> i) & 1) { + void *src = __raw_xsave_addr(xsave, 1 << i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + ret = xstate_copyout(offset, size, kbuf, ubuf, src, 0, count); + + if (ret) + return ret; + + if (offset + size >= count) + break; + } + + } + + /* + * Fill xsave->i387.sw_reserved value for ptrace frame: + */ + offset = offsetof(struct fxregs_state, sw_reserved); + size = sizeof(xstate_fx_sw_bytes); + + ret = xstate_copyout(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); + + if (ret) + return ret; + + return 0; +} + +/* + * Convert from a ptrace standard-format buffer to kernel XSAVES format + * and copy to the target thread. This is called from xstateregs_set() and + * there we check the CPU has XSAVES and a whole standard-sized buffer + * exists. 
+ */ +int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, + struct xregs_state *xsave) +{ + unsigned int offset, size; + int i; + u64 xfeatures; + u64 allowed_features; + + offset = offsetof(struct xregs_state, header); + size = sizeof(xfeatures); + + if (kbuf) { + memcpy(&xfeatures, kbuf + offset, size); + } else { + if (__copy_from_user(&xfeatures, ubuf + offset, size)) + return -EFAULT; + } + + /* + * Reject if the user sets any disabled or supervisor features: + */ + allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; + + if (xfeatures & ~allowed_features) + return -EINVAL; + + for (i = 0; i < XFEATURE_MAX; i++) { + u64 mask = ((u64)1 << i); + + if (xfeatures & mask) { + void *dst = __raw_xsave_addr(xsave, 1 << i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + if (kbuf) { + memcpy(dst, kbuf + offset, size); + } else { + if (__copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; + } + } + } + + /* + * The state that came in from userspace was user-state only. + * Mask all the user states out of 'xfeatures': + */ + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + + /* + * Add back in the features that came in from userspace: + */ + xsave->header.xfeatures |= xfeatures; return 0; } -- cgit v1.2.3 From 996952e0148026ac0e512db5cad26e14f4267e8b Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Fri, 17 Jun 2016 13:07:18 -0700 Subject: x86/fpu/xstate: Fix XSTATE component offset print out Component offset print out was incorrect for XSAVES. Correct it and move to a separate function. Signed-off-by: Yu-cheng Yu Reviewed-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Ravi V. 
Shankar Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/86602a8ac400626c6eca7125c3e15934866fc38e.1466179491.git.yu-cheng.yu@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 56c0e707af21..09bac979b8a2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -269,8 +269,6 @@ static void __init setup_xstate_features(void) WARN_ONCE(last_good_offset > xstate_offsets[i], "x86/fpu: misordered xstate at %d\n", last_good_offset); last_good_offset = xstate_offsets[i]; - - printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, ebx, i, eax); } } @@ -373,6 +371,21 @@ static void __init setup_xstate_comp(void) } } +/* + * Print out xstate component offsets and sizes + */ +static void __init print_xstate_offset_size(void) +{ + int i; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i)) + continue; + pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", + i, xstate_comp_offsets[i], i, xstate_sizes[i]); + } +} + /* * setup the xstate image representing the init state */ @@ -707,6 +720,7 @@ void __init fpu__init_system_xstate(void) fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp(); + print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", xfeatures_mask, -- cgit v1.2.3 From ac73b27aea4eacdd7555f664d5fc6e1d4d1c8bf6 Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Fri, 17 Jun 2016 13:07:19 -0700 Subject: x86/fpu/xstate: Fix xstate_offsets, xstate_sizes for non-extended xstates The arrays xstate_offsets[] and xstate_sizes[] record XSAVE standard- format offsets and sizes. Values for non-extended state components fpu and xmm's were not initialized or used. Ptrace format conversion needs them. Fix it. Signed-off-by: Yu-cheng Yu Reviewed-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Ravi V. Shankar Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/cf3ea36cf30e2a99e37da6483e65446d018ff0a7.1466179491.git.yu-cheng.yu@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 09bac979b8a2..f8d1aff10f69 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -246,6 +246,15 @@ static void __init setup_xstate_features(void) /* start at the beginnning of the "extended state" */ unsigned int last_good_offset = offsetof(struct xregs_state, extended_state_area); + /* + * The FP xstates and SSE xstates are legacy states. They are always + * in the fixed offsets in the xsave area in either compacted form + * or standard form. 
+ */ + xstate_offsets[0] = 0; + xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space); + xstate_offsets[1] = xstate_sizes[0]; + xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space); for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (!xfeature_enabled(i)) -- cgit v1.2.3 From 447d29d1d3aed839e74c2401ef63387780ac51ed Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 12 Jun 2016 12:31:53 +0200 Subject: x86/quirks: Apply nvidia_bugs quirk only on root bus Since the following commit: 8659c406ade3 ("x86: only scan the root bus in early PCI quirks") ... early quirks are only applied to devices on the root bus. The motivation was to prevent application of the nvidia_bugs quirk on secondary buses. We're about to reintroduce scanning of secondary buses for a quirk to reset the Broadcom 4331 wireless card on 2011/2012 Macs. To prevent regressions, open code the requirement to apply nvidia_bugs only on the root bus. Signed-off-by: Lukas Wunner Cc: Andy Lutomirski Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/4d5477c1d76b2f0387a780f2142bbcdd9fee869b.1465690253.git.lukas@wunner.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index bca14c899137..256976fe2666 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -75,6 +75,13 @@ static void __init nvidia_bugs(int num, int slot, int func) { #ifdef CONFIG_ACPI #ifdef CONFIG_X86_IO_APIC + /* + * Only applies to Nvidia root ports (bus 0) and not to + * Nvidia graphics cards with PCI ports on secondary buses. + */ + if (num) + return; + /* * All timer overrides on Nvidia are * wrong unless HPET is enabled. -- cgit v1.2.3 From 850c321027c2e31d0afc71588974719a4b565550 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 12 Jun 2016 12:31:53 +0200 Subject: x86/quirks: Reintroduce scanning of secondary buses We used to scan secondary buses until the following commit that was applied in 2009: 8659c406ade3 ("x86: only scan the root bus in early PCI quirks") which commit constrained early quirks to the root bus only. Its motivation was to prevent application of the nvidia_bugs quirk on secondary buses. We're about to add a quirk to reset the Broadcom 4331 wireless card on 2011/2012 Macs, which is located on a secondary bus behind a PCIe root port. To facilitate that, reintroduce scanning of secondary buses. The commit message of 8659c406ade3 notes that scanning only the root bus "saves quite some unnecessary scanning work". The algorithm used prior to 8659c406ade3 was particularly time consuming because it scanned buses 0 to 31 brute force. To avoid lengthening boot time, employ a recursive strategy which only scans buses that are actually reachable from the root bus. Yinghai Lu pointed out that the secondary bus number read from a bridge's config space may be invalid, in particular a value of 0 would cause an infinite loop. The PCI core goes beyond that and recurses to a child bus only if its bus number is greater than the parent bus number (see pci_scan_bridge()). Since the root bus is numbered 0, this implies that secondary buses may not be 0. Do the same on early scanning. 
If this algorithm is found to significantly impact boot time or cause infinite loops on broken hardware, it would be possible to limit its recursion depth: The Broadcom 4331 quirk applies at depth 1, all others at depth 0, so the bus need not be scanned deeper than that for now. An alternative approach would be to revert to scanning only the root bus, and apply the Broadcom 4331 quirk to the root ports 8086:1c12, 8086:1e12 and 8086:1e16. Apple always positioned the card behind either of these three ports. The quirk would then check presence of the card in slot 0 below the root port and do its deed. Signed-off-by: Lukas Wunner Cc: Andy Lutomirski Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Cc: linux-pci@vger.kernel.org Link: http://lkml.kernel.org/r/f0daa70dac1a9b2483abdb31887173eb6ab77bdf.1465690253.git.lukas@wunner.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 256976fe2666..ea60c05c2487 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -610,12 +610,6 @@ struct chipset { void (*f)(int num, int slot, int func); }; -/* - * Only works for devices on the root bus. If you add any devices - * not on bus 0 readd another loop level in early_quirks(). But - * be careful because at least the Nvidia quirk here relies on - * only matching on bus 0. - */ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, @@ -648,6 +642,8 @@ static struct chipset early_qrk[] __initdata = { {} }; +static void __init early_pci_scan_bus(int bus); + /** * check_dev_quirk - apply early quirks to a given PCI device * @num: bus number @@ -656,7 +652,7 @@ static struct chipset early_qrk[] __initdata = { * * Check the vendor & device ID against the early quirks table. * - * If the device is single function, let early_quirks() know so we don't + * If the device is single function, let early_pci_scan_bus() know so we don't * poke at this device again. 
*/ static int __init check_dev_quirk(int num, int slot, int func) @@ -665,6 +661,7 @@ static int __init check_dev_quirk(int num, int slot, int func) u16 vendor; u16 device; u8 type; + u8 sec; int i; class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); @@ -692,25 +689,36 @@ static int __init check_dev_quirk(int num, int slot, int func) type = read_pci_config_byte(num, slot, func, PCI_HEADER_TYPE); + + if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) { + sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS); + if (sec > num) + early_pci_scan_bus(sec); + } + if (!(type & 0x80)) return -1; return 0; } -void __init early_quirks(void) +static void __init early_pci_scan_bus(int bus) { int slot, func; - if (!early_pci_allowed()) - return; - /* Poor man's PCI discovery */ - /* Only scan the root bus */ for (slot = 0; slot < 32; slot++) for (func = 0; func < 8; func++) { /* Only probe function 0 on single fn devices */ - if (check_dev_quirk(0, slot, func)) + if (check_dev_quirk(bus, slot, func)) break; } } + +void __init early_quirks(void) +{ + if (!early_pci_allowed()) + return; + + early_pci_scan_bus(0); +} -- cgit v1.2.3 From abb2bafd295fe962bbadc329dbfb2146457283ac Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 12 Jun 2016 12:31:53 +0200 Subject: x86/quirks: Add early quirk to reset Apple AirPort card MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The EFI firmware on Macs contains a full-fledged network stack for downloading OS X images from osrecovery.apple.com. Unfortunately on Macs introduced 2011 and 2012, EFI brings up the Broadcom 4331 wireless card on every boot and leaves it enabled even after ExitBootServices has been called. The card continues to assert its IRQ line, causing spurious interrupts if the IRQ is shared. It also corrupts memory by DMAing received packets, allowing for remote code execution over the air. This only stops when a driver is loaded for the wireless card, which may be never if the driver is not installed or blacklisted. The issue seems to be constrained to the Broadcom 4331. Chris Milsted has verified that the newer Broadcom 4360 built into the MacBookPro11,3 (2013/2014) does not exhibit this behaviour. The chances that Apple will ever supply a firmware fix for the older machines appear to be zero. The solution is to reset the card on boot by writing to a reset bit in its mmio space. This must be done as an early quirk and not as a plain vanilla PCI quirk to successfully combat memory corruption by DMAed packets: Matthew Garrett found out in 2012 that the packets are written to EfiBootServicesData memory (http://mjg59.dreamwidth.org/11235.html). This type of memory is made available to the page allocator by efi_free_boot_services(). Plain vanilla PCI quirks run much later, in subsys initcall level. In-between a time window would be open for memory corruption. Random crashes occurring in this time window and attributed to DMAed packets have indeed been observed in the wild by Chris Bainbridge. When Matthew Garrett analyzed the memory corruption issue in 2012, he sought to fix it with a grub quirk which transitions the card to D3hot: http://git.savannah.gnu.org/cgit/grub.git/commit/?id=9d34bb85da56 This approach does not help users with other bootloaders and while it may prevent DMAed packets, it does not cure the spurious interrupts emanating from the card. 
Unfortunately the card's mmio space is inaccessible in D3hot, so to reset it, we have to undo the effect of Matthew's grub patch and transition the card back to D0. Note that the quirk takes a few shortcuts to reduce the amount of code: The size of BAR 0 and the location of the PM capability is identical on all affected machines and therefore hardcoded. Only the address of BAR 0 differs between models. Also, it is assumed that the BCMA core currently mapped is the 802.11 core. The EFI driver seems to always take care of this. Michael Büsch, Bjorn Helgaas and Matt Fleming contributed feedback towards finding the best solution to this problem. The following should be a comprehensive list of affected models: iMac13,1 2012 21.5" [Root Port 00:1c.3 = 8086:1e16] iMac13,2 2012 27" [Root Port 00:1c.3 = 8086:1e16] Macmini5,1 2011 i5 2.3 GHz [Root Port 00:1c.1 = 8086:1c12] Macmini5,2 2011 i5 2.5 GHz [Root Port 00:1c.1 = 8086:1c12] Macmini5,3 2011 i7 2.0 GHz [Root Port 00:1c.1 = 8086:1c12] Macmini6,1 2012 i5 2.5 GHz [Root Port 00:1c.1 = 8086:1e12] Macmini6,2 2012 i7 2.3 GHz [Root Port 00:1c.1 = 8086:1e12] MacBookPro8,1 2011 13" [Root Port 00:1c.1 = 8086:1c12] MacBookPro8,2 2011 15" [Root Port 00:1c.1 = 8086:1c12] MacBookPro8,3 2011 17" [Root Port 00:1c.1 = 8086:1c12] MacBookPro9,1 2012 15" [Root Port 00:1c.1 = 8086:1e12] MacBookPro9,2 2012 13" [Root Port 00:1c.1 = 8086:1e12] MacBookPro10,1 2012 15" [Root Port 00:1c.1 = 8086:1e12] MacBookPro10,2 2012 13" [Root Port 00:1c.1 = 8086:1e12] For posterity, spurious interrupts caused by the Broadcom 4331 wireless card resulted in splats like this (stacktrace omitted): irq 17: nobody cared (try booting with the "irqpoll" option) handlers: [] pcie_isr [] sdhci_irq [sdhci] threaded [] sdhci_thread_irq [sdhci] [] azx_interrupt [snd_hda_codec] Disabling IRQ #17 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=79301 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111781 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=728916 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=895951#c16 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1009819 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1098621 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1149632#c5 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1279130 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1332732 Tested-by: Konstantin Simanov # [MacBookPro8,1] Tested-by: Lukas Wunner # [MacBookPro9,1] Tested-by: Bryan Paradis # [MacBookPro9,2] Tested-by: Andrew Worsley # [MacBookPro10,1] Tested-by: Chris Bainbridge # [MacBookPro10,2] Signed-off-by: Lukas Wunner Acked-by: Rafał Miłecki Acked-by: Matt Fleming Cc: Andy Lutomirski Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris Milsted Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Matthew Garrett Cc: Michael Buesch Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Cc: b43-dev@lists.infradead.org Cc: linux-pci@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: stable@vger.kernel.org Cc: stable@vger.kernel.org # 123456789abc: x86/quirks: Apply nvidia_bugs quirk only on root bus Cc: stable@vger.kernel.org # 123456789abc: x86/quirks: Reintroduce scanning of secondary buses Link: http://lkml.kernel.org/r/48d0972ac82a53d460e5fce77a07b2560db95203.1465690253.git.lukas@wunner.de [ Did minor readability edits. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 64 ++++++++++++++++++++++++++++++++++++++++++ drivers/bcma/bcma_private.h | 2 -- include/linux/bcma/bcma.h | 1 + 3 files changed, 65 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index ea60c05c2487..57b71373bae3 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -11,7 +11,11 @@ #include #include +#include +#include #include +#include +#include #include #include #include @@ -21,6 +25,9 @@ #include #include #include +#include + +#define dev_err(msg) pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg) static void __init fix_hypertransport_config(int num, int slot, int func) { @@ -597,6 +604,61 @@ static void __init force_disable_hpet(int num, int slot, int func) #endif } +#define BCM4331_MMIO_SIZE 16384 +#define BCM4331_PM_CAP 0x40 +#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg) +#define bcma_awrite32(reg, val) iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg) + +static void __init apple_airport_reset(int bus, int slot, int func) +{ + void __iomem *mmio; + u16 pmcsr; + u64 addr; + int i; + + if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc.")) + return; + + /* Card may have been put into PCI_D3hot by grub quirk */ + pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); + + if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { + pmcsr &= ~PCI_PM_CTRL_STATE_MASK; + write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr); + mdelay(10); + + pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); + if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { + dev_err("Cannot power up Apple AirPort card\n"); + return; + } + } + + addr = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32; + addr &= PCI_BASE_ADDRESS_MEM_MASK; + + mmio = early_ioremap(addr, BCM4331_MMIO_SIZE); + if (!mmio) { + dev_err("Cannot iomap Apple AirPort card\n"); + return; + } + + pr_info("Resetting Apple AirPort card (left enabled by EFI)\n"); + + for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++) + udelay(10); + + bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET); + bcma_aread32(BCMA_RESET_CTL); + udelay(1); + + bcma_awrite32(BCMA_RESET_CTL, 0); + bcma_aread32(BCMA_RESET_CTL); + udelay(10); + + early_iounmap(mmio, BCM4331_MMIO_SIZE); +} #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 @@ -639,6 +701,8 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + { PCI_VENDOR_ID_BROADCOM, 0x4331, + PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} }; diff --git a/drivers/bcma/bcma_private.h b/drivers/bcma/bcma_private.h index eda09090cb52..f642c4264c27 100644 --- a/drivers/bcma/bcma_private.h +++ b/drivers/bcma/bcma_private.h @@ -8,8 +8,6 @@ #include #include -#define BCMA_CORE_SIZE 0x1000 - #define bcma_err(bus, fmt, ...) \ pr_err("bus%d: " fmt, (bus)->num, ##__VA_ARGS__) #define bcma_warn(bus, fmt, ...) 
\ diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index e6b41f42602b..3db25df396cb 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -159,6 +159,7 @@ struct bcma_host_ops { #define BCMA_CORE_DEFAULT 0xFFF #define BCMA_MAX_NR_CORES 16 +#define BCMA_CORE_SIZE 0x1000 /* Chip IDs of PCIe devices */ #define BCMA_CHIP_ID_BCM4313 0x4313 -- cgit v1.2.3 From 44530d588e142a96cf0cd345a7cb8911c4f88720 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 10 Jul 2016 20:58:36 +0200 Subject: Revert "perf/x86/intel, watchdog: Switch NMI watchdog to ref cycles on x86" This reverts commit 2c95afc1e83d93fac3be6923465e1753c2c53b0a. Stephane reported the following regression: > Since Andi added: > > commit 2c95afc1e83d93fac3be6923465e1753c2c53b0a > Author: Andi Kleen > Date: Thu Jun 9 06:14:38 2016 -0700 > > perf/x86/intel, watchdog: Switch NMI watchdog to ref cycles on x86 > > $ perf stat -e ref-cycles ls > .... > > fails systematically because the ref-cycles is now used by the > watchdog and given this is a system-wide pinned event, it monopolizes > the fixed counter 2 which is the only counter able to measure this event. Since the next merge window is near, fix the regression for now by reverting the commit. Reported-by: Stephane Eranian Cc: Andi Kleen Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Vince Weaver Cc: Alexander Shishkin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 8 -------- include/linux/nmi.h | 1 - kernel/watchdog.c | 7 ------- 3 files changed, 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 016f4263fad4..7788ce643bf4 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -18,16 +18,8 @@ #include #include #include -#include #ifdef CONFIG_HARDLOCKUP_DETECTOR -int hw_nmi_get_event(void) -{ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - return PERF_COUNT_HW_REF_CPU_CYCLES; - return PERF_COUNT_HW_CPU_CYCLES; -} - u64 hw_nmi_get_sample_period(int watchdog_thresh) { return (u64)(cpu_khz) * 1000 * watchdog_thresh; diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 79858af27209..4630eeae18e0 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -66,7 +66,6 @@ static inline bool trigger_allbutself_cpu_backtrace(void) #ifdef CONFIG_LOCKUP_DETECTOR u64 hw_nmi_get_sample_period(int watchdog_thresh); -int hw_nmi_get_event(void); extern int nmi_watchdog_enabled; extern int soft_watchdog_enabled; extern int watchdog_user_enabled; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8dd30fcd91be..9acb29f280ec 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -315,12 +315,6 @@ static int is_softlockup(unsigned long touch_ts) #ifdef CONFIG_HARDLOCKUP_DETECTOR -/* Can be overriden by architecture */ -__weak int hw_nmi_get_event(void) -{ - return PERF_COUNT_HW_CPU_CYCLES; -} - static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -610,7 +604,6 @@ static int watchdog_nmi_enable(unsigned int cpu) wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); - wd_attr->config = hw_nmi_get_event(); /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); -- cgit v1.2.3 From 1fc2b67b43d5001b92b3a002b988884ad0137e99 Mon Sep 17 00:00:00 2001 From: 
Yu-cheng Yu Date: Mon, 11 Jul 2016 09:18:54 -0700 Subject: x86/fpu/xstate: Fix __fpu_restore_sig() for XSAVES When the kernel is using XSAVES compacted format, we cannot do __copy_from_user() from a signal frame, which has standard-format data. Fix it by using copyin_to_xsaves(), which converts between formats and filters out all supervisor states that we do not allow userspace to write. Signed-off-by: Yu-cheng Yu Signed-off-by: Fenghua Yu Reviewed-by: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ravi V Shankar Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1468253937-40008-2-git-send-email-fenghua.yu@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/signal.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 8aa96cbb5dfb..9e231d88bb33 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -323,8 +323,15 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ fpu__drop(fpu); - if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) || - __copy_from_user(&env, buf, sizeof(env))) { + if (using_compacted_format()) { + err = copyin_to_xsaves(NULL, buf_fx, + &fpu->state.xsave); + } else { + err = __copy_from_user(&fpu->state.xsave, + buf_fx, state_size); + } + + if (err || __copy_from_user(&env, buf, sizeof(env))) { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); err = -1; -- cgit v1.2.3 From 5060b91513b866f774da15dfd82157864c4b1683 Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Mon, 11 Jul 2016 09:18:55 -0700 Subject: x86/fpu/xstate: Return NULL for disabled xstate component address It is an error to request a disabled XSAVE/XSAVES component address. For that case, make __raw_xsave_addr() return a NULL and issue a warning. Signed-off-by: Yu-cheng Yu Signed-off-by: Fenghua Yu Reviewed-by: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ravi V Shankar Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1468253937-40008-3-git-send-email-fenghua.yu@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f8d1aff10f69..4fb8dd7697c5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -760,6 +760,11 @@ void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask) { int feature_nr = fls64(xstate_feature_mask) - 1; + if (!xfeature_enabled(feature_nr)) { + WARN_ON_FPU(1); + return NULL; + } + return (void *)xsave + xstate_comp_offsets[feature_nr]; } /* -- cgit v1.2.3 From 35ac2d7ba787eb4b7418a5a6f5919c25e10a780a Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Mon, 11 Jul 2016 09:18:56 -0700 Subject: x86/fpu/xstate: Fix fpstate_init() for XRSTORS In XSAVES mode if fpstate_init() is used to initialize a task's extended state area, xsave.header.xcomp_bv[63] must be set. Otherwise, when the task is scheduled, a warning is triggered from copy_kernel_to_xregs(). One such test case is: setting an invalid extended state through PTRACE. When xstateregs_set() rejects the syscall and re-initializes the task's extended state area. This triggers the warning mentioned above. Signed-off-by: Yu-cheng Yu Signed-off-by: Fenghua Yu Reviewed-by: Dave Hansen Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ravi V Shankar Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1468253937-40008-4-git-send-email-fenghua.yu@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 6 ++++++ arch/x86/kernel/fpu/core.c | 8 ++++++++ 2 files changed, 14 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 12dd648735b6..48df486b02f9 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -231,6 +231,12 @@ struct xstate_header { u64 reserved[6]; } __attribute__((packed)); +/* + * xstate_header.xcomp_bv[63] indicates that the extended_state_area + * is in compacted format. + */ +#define XCOMP_BV_COMPACTED_FORMAT ((u64)1 << 63) + /* * This is our most modern FPU state format, as saved by the XSAVE * and restored by the XRSTOR instructions. diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c759bd01ec99..3fc03a09a93b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -229,6 +230,13 @@ void fpstate_init(union fpregs_state *state) memset(state, 0, fpu_kernel_xstate_size); + /* + * XRSTORS requires that this bit is set in xcomp_bv, or + * it will #GP. Make sure it is replaced after the memset(). + */ + if (static_cpu_has(X86_FEATURE_XSAVES)) + state->xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT; + if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); else -- cgit v1.2.3 From b8be15d588060a03569ac85dc4a0247460988f5b Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Mon, 11 Jul 2016 09:18:57 -0700 Subject: x86/fpu/xstate: Re-enable XSAVES We did not handle XSAVES instructions correctly. There were issues in converting between standard and compacted format when interfacing with user-space. These issues have been corrected. Add a WARN_ONCE() to make it clear that XSAVES supervisor states are not yet implemented. Signed-off-by: Yu-cheng Yu Signed-off-by: Fenghua Yu Reviewed-by: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ravi V Shankar Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1468253937-40008-5-git-send-email-fenghua.yu@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 15 --------------- arch/x86/kernel/fpu/xstate.c | 9 +++++++++ 2 files changed, 9 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 60f3839c5bfa..93982aebb398 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -230,21 +230,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) } fpu_user_xstate_size = fpu_kernel_xstate_size; - - /* - * Quirk: we don't yet handle the XSAVES* instructions - * correctly, as we don't correctly convert between - * standard and compacted format when interfacing - * with user-space - so disable it for now. - * - * The difference is small: with recent CPUs the - * compacted format is only marginally smaller than - * the standard FPU state format. - * - * ( This is easy to backport while we are fixing - * XSAVES* support. 
) - */ - setup_clear_cpu_cap(X86_FEATURE_XSAVES); } /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 4fb8dd7697c5..3169bcaf9391 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -221,6 +221,15 @@ void fpu__init_cpu_xstate(void) { if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) return; + /* + * Make it clear that XSAVES supervisor states are not yet + * implemented should anyone expect it to work by changing + * bits in XFEATURE_MASK_* macros and XCR0. + */ + WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR), + "x86/fpu: XSAVES supervisor states are not yet implemented.\n"); + + xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR; cr4_set_bits(X86_CR4_OSXSAVE); xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); -- cgit v1.2.3 From 02c0cd2dcf7fdc47d054b855b148ea8b82dbb7eb Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 17 Jun 2016 01:22:50 -0400 Subject: x86/tsc_msr: Remove irqoff around MSR-based TSC enumeration Remove the irqoff/irqon around MSR-based TSC enumeration, as it is not necessary. Also rename: try_msr_calibrate_tsc() to cpu_khz_from_msr(), as that better describes what the routine does. Signed-off-by: Len Brown Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/a6b5c3ecd3b068175d2309599ab28163fc34215e.1466138954.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tsc.h | 3 +-- arch/x86/kernel/tsc.c | 5 +---- arch/x86/kernel/tsc_msr.c | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 7428697c5b8d..db1f779a3766 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -52,7 +52,6 @@ extern int notsc_setup(char *); extern void tsc_save_sched_clock_state(void); extern void tsc_restore_sched_clock_state(void); -/* MSR based TSC calibration for Intel Atom SoC platforms */ -unsigned long try_msr_calibrate_tsc(void); +unsigned long cpu_khz_from_msr(void); #endif /* _ASM_X86_TSC_H */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 38ba6de56ede..35a3976c19cc 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -674,10 +674,7 @@ unsigned long native_calibrate_tsc(void) unsigned long flags, latch, ms, fast_calibrate; int hpet = is_hpet_enabled(), i, loopmin; - /* Calibrate TSC using MSR for Intel Atom SoCs */ - local_irq_save(flags); - fast_calibrate = try_msr_calibrate_tsc(); - local_irq_restore(flags); + fast_calibrate = cpu_khz_from_msr(); if (fast_calibrate) return fast_calibrate; diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 65b3d8cb8325..0fe720d64fef 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -68,7 +68,7 @@ static int match_cpu(u8 family, u8 model) * Set global "lapic_timer_frequency" to bus_clock_cycles/jiffy * Return processor base frequency in KHz, or 0 on failure. */ -unsigned long try_msr_calibrate_tsc(void) +unsigned long cpu_khz_from_msr(void) { u32 lo, hi, ratio, freq_id, freq; unsigned long res; -- cgit v1.2.3 From aa297292d708e89773b3b2cdcaf33f01bfa095d8 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 17 Jun 2016 01:22:51 -0400 Subject: x86/tsc: Enumerate SKL cpu_khz and tsc_khz via CPUID Skylake CPU base-frequency and TSC frequency may differ by up to 2%. Enumerate CPU and TSC frequencies separately, allowing cpu_khz and tsc_khz to differ. The existing CPU frequency calibration mechanism is unchanged. 
However, CPUID extensions are preferred, when available. CPUID.0x16 is preferred over MSR and timer calibration for CPU frequency discovery. CPUID.0x15 takes precedence over CPU-frequency for TSC frequency discovery. Signed-off-by: Len Brown Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/b27ec289fd005833b27d694d9c2dbb716c5cdff7.1466138954.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tsc.h | 1 + arch/x86/include/asm/x86_init.h | 4 ++- arch/x86/kernel/tsc.c | 75 +++++++++++++++++++++++++++++++++++++---- arch/x86/kernel/x86_init.c | 1 + 4 files changed, 73 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index db1f779a3766..a30591e1567c 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -36,6 +36,7 @@ extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern int check_tsc_disabled(void); +extern unsigned long native_calibrate_cpu(void); extern unsigned long native_calibrate_tsc(void); extern unsigned long long native_sched_clock_from_tsc(u64 tsc); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 4dcdf74dfed8..08a08a800e17 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -181,7 +181,8 @@ struct x86_legacy_features { /** * struct x86_platform_ops - platform specific runtime functions - * @calibrate_tsc: calibrate TSC + * @calibrate_cpu: calibrate CPU + * @calibrate_tsc: calibrate TSC, if different from CPU * @get_wallclock: get time from HW clock like RTC etc. * @set_wallclock: set time back to HW clock * @is_untracked_pat_range exclude from PAT logic @@ -200,6 +201,7 @@ struct x86_legacy_features { * semantics. */ struct x86_platform_ops { + unsigned long (*calibrate_cpu)(void); unsigned long (*calibrate_tsc)(void); void (*get_wallclock)(struct timespec *ts); int (*set_wallclock)(const struct timespec *ts); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 35a3976c19cc..e1496b79c28a 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -239,7 +239,7 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) return ns; } -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +static void set_cyc2ns_scale(unsigned long khz, int cpu) { unsigned long long tsc_now, ns_now; struct cyc2ns_data *data; @@ -248,7 +248,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) local_irq_save(flags); sched_clock_idle_sleep_event(); - if (!cpu_khz) + if (!khz) goto done; data = cyc2ns_write_begin(cpu); @@ -261,7 +261,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) * time function is continuous; see the comment near struct * cyc2ns_data. */ - clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz, + clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz, NSEC_PER_MSEC, 0); /* @@ -665,15 +665,72 @@ success: } /** - * native_calibrate_tsc - calibrate the tsc on boot + * native_calibrate_tsc + * Determine TSC frequency via CPUID, else return 0. 
*/ unsigned long native_calibrate_tsc(void) +{ + unsigned int eax_denominator, ebx_numerator, ecx_hz, edx; + unsigned int crystal_khz; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + + if (boot_cpu_data.cpuid_level < 0x15) + return 0; + + eax_denominator = ebx_numerator = ecx_hz = edx = 0; + + /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ + cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); + + if (ebx_numerator == 0 || eax_denominator == 0) + return 0; + + crystal_khz = ecx_hz / 1000; + + if (crystal_khz == 0) { + switch (boot_cpu_data.x86_model) { + case 0x4E: /* SKL */ + case 0x5E: /* SKL */ + crystal_khz = 24000; /* 24 MHz */ + } + } + + return crystal_khz * ebx_numerator / eax_denominator; +} + +static unsigned long cpu_khz_from_cpuid(void) +{ + unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + + if (boot_cpu_data.cpuid_level < 0x16) + return 0; + + eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0; + + cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); + + return eax_base_mhz * 1000; +} + +/** + * native_calibrate_cpu - calibrate the cpu on boot + */ +unsigned long native_calibrate_cpu(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; unsigned long flags, latch, ms, fast_calibrate; int hpet = is_hpet_enabled(), i, loopmin; + fast_calibrate = cpu_khz_from_cpuid(); + if (fast_calibrate) + return fast_calibrate; + fast_calibrate = cpu_khz_from_msr(); if (fast_calibrate) return fast_calibrate; @@ -834,8 +891,10 @@ int recalibrate_cpu_khz(void) if (!boot_cpu_has(X86_FEATURE_TSC)) return -ENODEV; + cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); - cpu_khz = tsc_khz; + if (tsc_khz == 0) + tsc_khz = cpu_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, cpu_khz_old, cpu_khz); @@ -1241,8 +1300,10 @@ void __init tsc_init(void) return; } + cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); - cpu_khz = tsc_khz; + if (tsc_khz == 0) + tsc_khz = cpu_khz; if (!tsc_khz) { mark_tsc_unstable("could not calculate TSC khz"); @@ -1262,7 +1323,7 @@ void __init tsc_init(void) */ for_each_possible_cpu(cpu) { cyc2ns_init(cpu); - set_cyc2ns_scale(cpu_khz, cpu); + set_cyc2ns_scale(tsc_khz, cpu); } if (tsc_disabled > 0) diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index dad5fe9633a3..58b459296e13 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -92,6 +92,7 @@ static void default_nmi_init(void) { }; static int default_i8042_detect(void) { return 1; }; struct x86_platform_ops x86_platform = { + .calibrate_cpu = native_calibrate_cpu, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, -- cgit v1.2.3 From ff4c86635ee12461fd3bd911d7d5253394da8f9d Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 17 Jun 2016 01:22:52 -0400 Subject: x86/tsc: Enumerate BXT tsc_khz via CPUID Hard code the BXT crystal clock (aka ART - Always Running Timer) to 19.200 MHz, and use CPUID leaf 0x15 to determine the BXT TSC frequency. Use tsc_khz to sanity check BXT cpu_khz, which can be erroneous in some configurations. (I simplified the original patch from Bin Gao.) 
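For illustration only (not part of the patch): a minimal user-space sketch of the CPUID.15H arithmetic that these TSC-enumeration patches rely on. The ratio 186/3 and the zero crystal-Hz field are hypothetical values chosen for the example; on a BXT-like part the crystal frequency is assumed by CPU model rather than enumerated.

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical CPUID.15H register values for a BXT-like SoC */
		unsigned int eax_denominator = 3;	/* TSC/crystal ratio denominator */
		unsigned int ebx_numerator   = 186;	/* TSC/crystal ratio numerator   */
		unsigned int ecx_hz          = 0;	/* crystal Hz not enumerated     */
		unsigned int crystal_khz, tsc_khz;

		crystal_khz = ecx_hz / 1000;
		if (crystal_khz == 0)
			crystal_khz = 19200;		/* BXT quirk: assume 19.2 MHz */

		tsc_khz = crystal_khz * ebx_numerator / eax_denominator;

		/* 19200 * 186 / 3 = 1190400 kHz, i.e. a ~1.19 GHz TSC */
		printf("tsc_khz = %u\n", tsc_khz);
		return 0;
	}

A cpu_khz that disagrees with this value by more than 10% of tsc_khz is then overridden by tsc_khz, which is the sanity check described above.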
Original-From: Bin Gao Signed-off-by: Len Brown Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/bf4e7c175acd6d09719c47c319b10ff1f0627ff8.1466138954.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index e1496b79c28a..2a952fcb1516 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -693,7 +693,11 @@ unsigned long native_calibrate_tsc(void) switch (boot_cpu_data.x86_model) { case 0x4E: /* SKL */ case 0x5E: /* SKL */ - crystal_khz = 24000; /* 24 MHz */ + crystal_khz = 24000; /* 24.0 MHz */ + break; + case 0x5C: /* BXT */ + crystal_khz = 19200; /* 19.2 MHz */ + break; } } @@ -895,6 +899,8 @@ int recalibrate_cpu_khz(void) tsc_khz = x86_platform.calibrate_tsc(); if (tsc_khz == 0) tsc_khz = cpu_khz; + else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) + cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, cpu_khz_old, cpu_khz); @@ -1302,8 +1308,16 @@ void __init tsc_init(void) cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); + + /* + * Trust non-zero tsc_khz as authorative, + * and use it to sanity check cpu_khz, + * which will be off if system timer is off. + */ if (tsc_khz == 0) tsc_khz = cpu_khz; + else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) + cpu_khz = tsc_khz; if (!tsc_khz) { mark_tsc_unstable("could not calculate TSC khz"); -- cgit v1.2.3 From 4d581259b7d44c8120a614b4e9244094c824d51f Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Thu, 14 Jul 2016 18:05:56 +0800 Subject: x86/reboot: Add Dell Optiplex 7450 AIO reboot quirk Dell Optiplex 7450 AIO works with BOOT_ACPI; however, the quirk for "OptiPlex 745" changes its boot method to BOOT_BIOS and causes 7450 AIO hangs when rebooting; as a result, 7450 AIO is appended to overwrite BOOT_BIOS by BOOT_ACPI in order not to break the original 745 series Signed-off-by: Alex Hung Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a9b31eb815f2..15ed70f8278b 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -54,6 +54,19 @@ bool port_cf9_safe = false; * Dell Inc. so their systems "just work". :-) */ +/* + * Some machines require the "reboot=a" commandline options + */ +static int __init set_acpi_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_ACPI) { + reboot_type = BOOT_ACPI; + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + d->ident, "ACPI"); + } + return 0; +} + /* * Some machines require the "reboot=b" or "reboot=k" commandline options, * this quirk makes that automatic. 
@@ -395,6 +408,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), }, }, + { /* Handle problems with rebooting on Dell Optiplex 7450 AIO */ + .callback = set_acpi_reboot, + .ident = "Dell OptiPlex 7450 AIO", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 7450 AIO"), + }, + }, /* Hewlett-Packard */ { /* Handle problems with rebooting on HP laptops */ -- cgit v1.2.3 From 9a2e9da3e003112399f2863b7b6b911043c01895 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2016 13:22:52 -0700 Subject: x86/dumpstack: Try harder to get a call trace on stack overflow If we overflow the stack, print_context_stack() will abort. Detect this case and rewind back into the valid part of the stack so that we can trace it. Signed-off-by: Andy Lutomirski Reviewed-by: Josh Poimboeuf Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ee1690eb2715ccc5dc187fde94effa4ca0ccbbcd.1468527351.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ef8017ca5ba9..cc88e25d73e9 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -87,7 +87,7 @@ static inline int valid_stack_ptr(struct task_struct *task, else return 0; } - return p > t && p < t + THREAD_SIZE - size; + return p >= t && p < t + THREAD_SIZE - size; } unsigned long @@ -98,6 +98,14 @@ print_context_stack(struct task_struct *task, { struct stack_frame *frame = (struct stack_frame *)bp; + /* + * If we overflowed the stack into a guard page, jump back to the + * bottom of the usable stack. + */ + if ((unsigned long)task_stack_page(task) - (unsigned long)stack < + PAGE_SIZE) + stack = (unsigned long *)task_stack_page(task); + while (valid_stack_ptr(task, stack, sizeof(*stack), end)) { unsigned long addr; -- cgit v1.2.3 From 98f30b1207932b6553ea605c99393d8afca12324 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2016 13:22:53 -0700 Subject: x86/dumpstack/64: Handle faults when printing the "Stack: " part of an OOPS If we overflow the stack into a guard page, we'll recursively fault when trying to dump the contents of the guard page. Use probe_kernel_address() so we can recover if this happens. Signed-off-by: Andy Lutomirski Reviewed-by: Josh Poimboeuf Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e626d47a55d7b04dcb1b4d33faa95e8505b217c8.1468527351.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_64.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index d558a8a49016..2552a1eadfed 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -272,6 +272,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, stack = sp; for (i = 0; i < kstack_depth_to_print; i++) { + unsigned long word; + if (stack >= irq_stack && stack <= irq_stack_end) { if (stack == irq_stack_end) { stack = (unsigned long *) (irq_stack_end[-1]); @@ -281,12 +283,18 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (kstack_end(stack)) break; } + + if (probe_kernel_address(stack, word)) + break; + if ((i % STACKSLOTS_PER_LINE) == 0) { if (i != 0) pr_cont("\n"); - printk("%s %016lx", log_lvl, *stack++); + printk("%s %016lx", log_lvl, word); } else - pr_cont(" %016lx", *stack++); + pr_cont(" %016lx", word); + + stack++; touch_nmi_watchdog(); } preempt_enable(); -- cgit v1.2.3 From 2deb4be28077638591fe5fc593b7f8aabc140f42 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2016 13:22:55 -0700 Subject: x86/dumpstack: When OOPSing, rewind the stack before do_exit() If we call do_exit() with a clean stack, we greatly reduce the risk of recursive oopses due to stack overflow in do_exit, and we allow do_exit to work even if we OOPS from an IST stack. The latter gives us a much better chance of surviving long enough after we detect a stack overflow to write out our logs. Signed-off-by: Andy Lutomirski Reviewed-by: Josh Poimboeuf Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/32f73ceb372ec61889598da5e5b145889b9f2e19.1468527351.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 11 +++++++++++ arch/x86/entry/entry_64.S | 11 +++++++++++ arch/x86/kernel/dumpstack.c | 10 +++++++++- 3 files changed, 31 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 983e5d3a0d27..0b56666e6039 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1153,3 +1153,14 @@ ENTRY(async_page_fault) jmp error_code END(async_page_fault) #endif + +ENTRY(rewind_stack_do_exit) + /* Prevent any naive code from trying to unwind to our caller. */ + xorl %ebp, %ebp + + movl PER_CPU_VAR(cpu_current_top_of_stack), %esi + leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp + + call do_exit +1: jmp 1b +END(rewind_stack_do_exit) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9ee0da1807ed..b846875aeea6 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1423,3 +1423,14 @@ ENTRY(ignore_sysret) mov $-ENOSYS, %eax sysret END(ignore_sysret) + +ENTRY(rewind_stack_do_exit) + /* Prevent any naive code from trying to unwind to our caller. 
*/ + xorl %ebp, %ebp + + movq PER_CPU_VAR(cpu_current_top_of_stack), %rax + leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp + + call do_exit +1: jmp 1b +END(rewind_stack_do_exit) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index cc88e25d73e9..de8242d8bb61 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -234,6 +234,8 @@ unsigned long oops_begin(void) EXPORT_SYMBOL_GPL(oops_begin); NOKPROBE_SYMBOL(oops_begin); +void __noreturn rewind_stack_do_exit(int signr); + void oops_end(unsigned long flags, struct pt_regs *regs, int signr) { if (regs && kexec_should_crash(current)) @@ -255,7 +257,13 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) panic("Fatal exception in interrupt"); if (panic_on_oops) panic("Fatal exception"); - do_exit(signr); + + /* + * We're not going to return, but we might be on an IST stack or + * have very little stack space left. Rewind the stack and kill + * the task. + */ + rewind_stack_do_exit(signr); } NOKPROBE_SYMBOL(oops_end); -- cgit v1.2.3 From 13d4ea097d18b419ad2a2b696063d44bf59acec0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2016 13:22:57 -0700 Subject: x86/uaccess: Move thread_info::addr_limit to thread_struct struct thread_info is a legacy mess. To prepare for its partial removal, move thread_info::addr_limit out. As an added benefit, this way is simpler. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/15bee834d09402b47ac86f2feccdf6529f9bc5b0.1468527351.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/checksum_32.h | 3 +-- arch/x86/include/asm/processor.h | 17 ++++++++++------- arch/x86/include/asm/thread_info.h | 7 ------- arch/x86/include/asm/uaccess.h | 6 +++--- arch/x86/kernel/asm-offsets.c | 4 +++- arch/x86/lib/copy_user_64.S | 8 ++++---- arch/x86/lib/csum-wrappers_64.c | 1 + arch/x86/lib/getuser.S | 20 ++++++++++---------- arch/x86/lib/putuser.S | 10 +++++----- arch/x86/lib/usercopy_64.c | 2 +- drivers/pnp/isapnp/proc.c | 2 +- lib/bitmap.c | 2 +- 12 files changed, 40 insertions(+), 42 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index 532f85e6651f..7b53743ed267 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -2,8 +2,7 @@ #define _ASM_X86_CHECKSUM_32_H #include - -#include +#include /* * computes the checksum of a memory block at buff, length len, diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index cbdfe5f76347..89314ed74fee 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -371,6 +371,10 @@ extern unsigned int xstate_size; struct perf_event; +typedef struct { + unsigned long seg; +} mm_segment_t; + struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; @@ -419,6 +423,8 @@ struct thread_struct { /* Max allowed port in the bitmap, in bytes: */ unsigned io_bitmap_max; + mm_segment_t addr_limit; + unsigned int sig_on_uaccess_err:1; unsigned int uaccess_err:1; /* uaccess failed */ @@ -493,11 +499,6 @@ static inline void load_sp0(struct tss_struct *tss, #define set_iopl_mask native_set_iopl_mask #endif /* CONFIG_PARAVIRT */ -typedef struct { - unsigned long seg; -} mm_segment_t; - - /* Free all resources held by a thread. 
*/ extern void release_thread(struct task_struct *); @@ -719,6 +720,7 @@ static inline void spin_lock_prefetch(const void *x) .sp0 = TOP_OF_INIT_STACK, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ + .addr_limit = KERNEL_DS, \ } extern unsigned long thread_saved_pc(struct task_struct *tsk); @@ -768,8 +770,9 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define STACK_TOP TASK_SIZE #define STACK_TOP_MAX TASK_SIZE_MAX -#define INIT_THREAD { \ - .sp0 = TOP_OF_INIT_STACK \ +#define INIT_THREAD { \ + .sp0 = TOP_OF_INIT_STACK, \ + .addr_limit = KERNEL_DS, \ } /* diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 7c47bb659ecd..89bff044a6f5 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -57,7 +57,6 @@ struct thread_info { __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ - mm_segment_t addr_limit; }; #define INIT_THREAD_INFO(tsk) \ @@ -65,7 +64,6 @@ struct thread_info { .task = &tsk, \ .flags = 0, \ .cpu = 0, \ - .addr_limit = KERNEL_DS, \ } #define init_thread_info (init_thread_union.thread_info) @@ -184,11 +182,6 @@ static inline unsigned long current_stack_pointer(void) # define cpu_current_top_of_stack (cpu_tss + TSS_sp0) #endif -/* Load thread_info address into "reg" */ -#define GET_THREAD_INFO(reg) \ - _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \ - _ASM_SUB $(THREAD_SIZE),reg ; - /* * ASM operand which evaluates to a 'thread_info' address of * the current task, if it is known that "reg" is exactly "off" diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8f66e5655c23..c03bfb68c503 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -29,12 +29,12 @@ #define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX) #define get_ds() (KERNEL_DS) -#define get_fs() (current_thread_info()->addr_limit) -#define set_fs(x) (current_thread_info()->addr_limit = (x)) +#define get_fs() (current->thread.addr_limit) +#define set_fs(x) (current->thread.addr_limit = (x)) #define segment_eq(a, b) ((a).seg == (b).seg) -#define user_addr_max() (current_thread_info()->addr_limit.seg) +#define user_addr_max() (current->thread.addr_limit.seg) #define __addr_ok(addr) \ ((unsigned long __force)(addr) < user_addr_max()) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 674134e9f5e5..2bd5c6ff7ee7 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -31,7 +31,9 @@ void common(void) { BLANK(); OFFSET(TI_flags, thread_info, flags); OFFSET(TI_status, thread_info, status); - OFFSET(TI_addr_limit, thread_info, addr_limit); + + BLANK(); + OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); BLANK(); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 2b0ef26da0bd..bf603ebbfd8e 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -17,11 +17,11 @@ /* Standard copy_to_user with segment limit checking */ ENTRY(_copy_to_user) - GET_THREAD_INFO(%rax) + mov PER_CPU_VAR(current_task), %rax movq %rdi,%rcx addq %rdx,%rcx jc bad_to_user - cmpq TI_addr_limit(%rax),%rcx + cmpq TASK_addr_limit(%rax),%rcx ja bad_to_user ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ "jmp copy_user_generic_string", \ @@ -32,11 +32,11 @@ ENDPROC(_copy_to_user) /* Standard copy_from_user with segment limit checking */ ENTRY(_copy_from_user) - GET_THREAD_INFO(%rax) + mov 
PER_CPU_VAR(current_task), %rax movq %rsi,%rcx addq %rdx,%rcx jc bad_from_user - cmpq TI_addr_limit(%rax),%rcx + cmpq TASK_addr_limit(%rax),%rcx ja bad_from_user ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ "jmp copy_user_generic_string", \ diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 28a6654f0d08..b6fcb9a9ddbc 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -6,6 +6,7 @@ */ #include #include +#include #include /** diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 46668cda4ffd..0ef5128c2de8 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -35,8 +35,8 @@ .text ENTRY(__get_user_1) - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user ASM_STAC 1: movzbl (%_ASM_AX),%edx @@ -48,8 +48,8 @@ ENDPROC(__get_user_1) ENTRY(__get_user_2) add $1,%_ASM_AX jc bad_get_user - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user ASM_STAC 2: movzwl -1(%_ASM_AX),%edx @@ -61,8 +61,8 @@ ENDPROC(__get_user_2) ENTRY(__get_user_4) add $3,%_ASM_AX jc bad_get_user - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user ASM_STAC 3: movl -3(%_ASM_AX),%edx @@ -75,8 +75,8 @@ ENTRY(__get_user_8) #ifdef CONFIG_X86_64 add $7,%_ASM_AX jc bad_get_user - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user ASM_STAC 4: movq -7(%_ASM_AX),%rdx @@ -86,8 +86,8 @@ ENTRY(__get_user_8) #else add $7,%_ASM_AX jc bad_get_user_8 - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user_8 ASM_STAC 4: movl -7(%_ASM_AX),%edx diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index e0817a12d323..c891ece81e5b 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -29,14 +29,14 @@ * as they get called from within inline assembly. 
*/ -#define ENTER GET_THREAD_INFO(%_ASM_BX) +#define ENTER mov PER_CPU_VAR(current_task), %_ASM_BX #define EXIT ASM_CLAC ; \ ret .text ENTRY(__put_user_1) ENTER - cmp TI_addr_limit(%_ASM_BX),%_ASM_CX + cmp TASK_addr_limit(%_ASM_BX),%_ASM_CX jae bad_put_user ASM_STAC 1: movb %al,(%_ASM_CX) @@ -46,7 +46,7 @@ ENDPROC(__put_user_1) ENTRY(__put_user_2) ENTER - mov TI_addr_limit(%_ASM_BX),%_ASM_BX + mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $1,%_ASM_BX cmp %_ASM_BX,%_ASM_CX jae bad_put_user @@ -58,7 +58,7 @@ ENDPROC(__put_user_2) ENTRY(__put_user_4) ENTER - mov TI_addr_limit(%_ASM_BX),%_ASM_BX + mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $3,%_ASM_BX cmp %_ASM_BX,%_ASM_CX jae bad_put_user @@ -70,7 +70,7 @@ ENDPROC(__put_user_4) ENTRY(__put_user_8) ENTER - mov TI_addr_limit(%_ASM_BX),%_ASM_BX + mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $7,%_ASM_BX cmp %_ASM_BX,%_ASM_CX jae bad_put_user diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 0a42327a59d7..9f760cdcaf40 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -6,7 +6,7 @@ * Copyright 2002 Andi Kleen */ #include -#include +#include /* * Zero Userspace diff --git a/drivers/pnp/isapnp/proc.c b/drivers/pnp/isapnp/proc.c index 5edee645d890..262285e48a09 100644 --- a/drivers/pnp/isapnp/proc.c +++ b/drivers/pnp/isapnp/proc.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include extern struct pnp_protocol isapnp_protocol; diff --git a/lib/bitmap.c b/lib/bitmap.c index c66da508cbf7..eca88087fa8a 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -14,9 +14,9 @@ #include #include #include +#include #include -#include /* * bitmaps provide an array of bits, implemented using an an -- cgit v1.2.3 From fb59831b496a5bb7d0a06c7e702d88d1757edfca Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2016 13:22:58 -0700 Subject: x86/smp: Remove stack_smp_processor_id() It serves no purpose -- raw_smp_processor_id() works fine. This change will be needed to move thread_info off the stack. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a2bf4f07fbc30fb32f9f7f3f8f94ad3580823847.1468527351.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpu.h | 1 - arch/x86/include/asm/smp.h | 6 ------ arch/x86/kernel/cpu/common.c | 2 +- 3 files changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 678637ad7476..59d34c521d96 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -17,7 +17,6 @@ static inline void prefill_possible_map(void) {} #define cpu_physical_id(cpu) boot_cpu_physical_apicid #define safe_smp_processor_id() 0 -#define stack_smp_processor_id() 0 #endif /* CONFIG_SMP */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 66b057306f40..0576b6157f3a 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -172,12 +172,6 @@ extern int safe_smp_processor_id(void); #elif defined(CONFIG_X86_64_SMP) #define raw_smp_processor_id() (this_cpu_read(cpu_number)) -#define stack_smp_processor_id() \ -({ \ - struct thread_info *ti; \ - __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ - ti->cpu; \ -}) #define safe_smp_processor_id() smp_processor_id() #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0fe6953f421c..d22a7b9c4f0e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1452,7 +1452,7 @@ void cpu_init(void) struct task_struct *me; struct tss_struct *t; unsigned long v; - int cpu = stack_smp_processor_id(); + int cpu = raw_smp_processor_id(); int i; wait_for_master_cpu(cpu); -- cgit v1.2.3 From eb43e8f85fffc1ba535e0362a872101dfe48abe3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2016 13:22:59 -0700 Subject: x86/smp: Remove unnecessary initialization of thread_info::cpu It's statically initialized to zero -- no need to dynamically initialize it to zero as well. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/6cf6314dce3051371a913ee19d1b88e29c68c560.1468527351.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fafe8b923cac..0e91dbeca2fd 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1285,7 +1285,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) cpumask_copy(cpu_callin_mask, cpumask_of(0)); mb(); - current_thread_info()->cpu = 0; /* needed? */ for_each_possible_cpu(i) { zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); -- cgit v1.2.3 From c48ec42d6eae08f55685ab660f0743ed33b9f22a Mon Sep 17 00:00:00 2001 From: Wei Jiangang Date: Fri, 15 Jul 2016 16:12:10 +0800 Subject: x86/tsc: Remove the unused check_tsc_disabled() check_tsc_disabled() was introduced by commit: c73deb6aecda ("perf/x86: Add ability to calculate TSC from perf sample timestamps") The only caller was arch_perf_update_userpage(), which had been refactored by commit: d8b11a0cbd1c ("perf/x86: Clean up cap_user_time* setting") ... so no need keep and export it any more. 
Signed-off-by: Wei Jiangang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: a.p.zijlstra@chello.nl Cc: adrian.hunter@intel.com Cc: bp@suse.de Link: http://lkml.kernel.org/r/1468570330-25810-1-git-send-email-weijg.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tsc.h | 1 - arch/x86/kernel/tsc.c | 6 ------ 2 files changed, 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index a30591e1567c..33b6365c22fe 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -35,7 +35,6 @@ extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); -extern int check_tsc_disabled(void); extern unsigned long native_calibrate_cpu(void); extern unsigned long native_calibrate_tsc(void); extern unsigned long long native_sched_clock_from_tsc(u64 tsc); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 2a952fcb1516..a804b5ab32d0 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -335,12 +335,6 @@ int check_tsc_unstable(void) } EXPORT_SYMBOL_GPL(check_tsc_unstable); -int check_tsc_disabled(void) -{ - return tsc_disabled; -} -EXPORT_SYMBOL_GPL(check_tsc_disabled); - #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { -- cgit v1.2.3 From 102bb9fef68a21f357dc813d4792666c8295bc35 Mon Sep 17 00:00:00 2001 From: Wei Jiangang Date: Thu, 14 Jul 2016 10:24:06 +0800 Subject: x86/apic: Remove the unused struct apic::apic_id_mask field The only user verify_local_APIC() had been removed by commit: 4399c03c6780 ("x86/apic: Remove verify_local_APIC()") ... so there is no need to keep it. Signed-off-by: Wei Jiangang Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boris.ostrovsky@oracle.com Cc: bsd@redhat.com Cc: david.vrabel@citrix.com Cc: jgross@suse.com Cc: konrad.wilk@oracle.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1468463046-20849-1-git-send-email-weijg.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 1 - arch/x86/kernel/apic/apic_flat_64.c | 2 -- arch/x86/kernel/apic/apic_noop.c | 1 - arch/x86/kernel/apic/apic_numachip.c | 2 -- arch/x86/kernel/apic/bigsmp_32.c | 1 - arch/x86/kernel/apic/probe_32.c | 1 - arch/x86/kernel/apic/x2apic_cluster.c | 1 - arch/x86/kernel/apic/x2apic_phys.c | 1 - arch/x86/kernel/apic/x2apic_uv_x.c | 1 - arch/x86/xen/apic.c | 1 - 10 files changed, 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bc27611fa58f..f5befd4945f2 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -300,7 +300,6 @@ struct apic { unsigned int (*get_apic_id)(unsigned long x); unsigned long (*set_apic_id)(unsigned int id); - unsigned long apic_id_mask; int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, const struct cpumask *andmask, diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 76f89e2b245a..048747778d37 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -181,7 +181,6 @@ static struct apic apic_flat = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .apic_id_mask = 0xFFu << 24, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, @@ -278,7 +277,6 @@ static struct apic apic_physflat = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .apic_id_mask = 
0xFFu << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 13d19ed58514..2cebf59092d8 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -141,7 +141,6 @@ struct apic apic_noop = { .get_apic_id = noop_get_apic_id, .set_apic_id = NULL, - .apic_id_mask = 0x0F << 24, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index ab5c2c685a3c..714d4fda0d52 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -269,7 +269,6 @@ static const struct apic apic_numachip1 __refconst = { .get_apic_id = numachip1_get_apic_id, .set_apic_id = numachip1_set_apic_id, - .apic_id_mask = 0xffU << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, @@ -321,7 +320,6 @@ static const struct apic apic_numachip2 __refconst = { .get_apic_id = numachip2_get_apic_id, .set_apic_id = numachip2_set_apic_id, - .apic_id_mask = 0xffU << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index cf9bd896c12d..06dbaa458bfe 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -171,7 +171,6 @@ static struct apic apic_bigsmp = { .get_apic_id = bigsmp_get_apic_id, .set_apic_id = NULL, - .apic_id_mask = 0xFF << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index f316e34abb42..93edfa01b408 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -101,7 +101,6 @@ static struct apic apic_default = { .get_apic_id = default_get_apic_id, .set_apic_id = NULL, - .apic_id_mask = 0x0F << 24, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index aca8b75c1552..24170d0809ba 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -270,7 +270,6 @@ static struct apic apic_x2apic_cluster = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index a1242e2c12e6..4f13f54f1b1f 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -126,7 +126,6 @@ static struct apic apic_x2apic_phys = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 29003154fafd..5a58c917179c 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -582,7 +582,6 @@ static struct apic __refdata apic_x2apic_uv_x = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = set_apic_id, - .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index db52a7fafcc2..44c88ad1841a 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -177,7 +177,6 @@ static struct apic xen_pv_apic = { .get_apic_id = xen_get_apic_id, .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. 
*/ - .apic_id_mask = 0xFF << 24, /* Used by verify_local_APIC. Match with what xen_get_apic_id does. */ .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, -- cgit v1.2.3 From edce21216a8887bf06ba85ee49a00695e44c4341 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Jul 2016 09:53:52 +0200 Subject: x86/boot: Reorganize and clean up the BIOS area reservation code So the reserve_ebda_region() code has accumulated a number of problems over the years that make it really difficult to read and understand: - The calculation of 'lowmem' and 'ebda_addr' is an unnecessarily interleaved mess of first lowmem, then ebda_addr, then lowmem tweaks... - 'lowmem' here means 'super low mem' - i.e. 16-bit addressable memory. In other parts of the x86 code 'lowmem' means 32-bit addressable memory... This makes it super confusing to read. - It does not help at all that we have various memory range markers, half of which are 'start of range', half of which are 'end of range' - but this crucial property is not obvious in the naming at all ... gave me a headache trying to understand all this. - Also, the 'ebda_addr' name sucks: it highlights that it's an address (which is obvious, all values here are addresses!), while it does not highlight that it's the _start_ of the EBDA region ... - 'BIOS_LOWMEM_KILOBYTES' says a lot of things, except that this is the only value that is a pointer to a value, not a memory range address! - The function name itself is a misnomer: it says 'reserve_ebda_region()' while its main purpose is to reserve all the firmware ROM typically between 640K and 1MB, while the 'EBDA' part is only a small part of that ... - Likewise, the paravirt quirk flag name 'ebda_search' is misleading as well: this too should be about whether to reserve firmware areas in the paravirt case. - In fact thinking about this as 'end of RAM' is confusing: what this function *really* wants to reserve is firmware data and code areas! Once the thinking is inverted from a mixed 'ram' and 'reserved firmware area' notion to a pure 'reserved area' notion everything becomes a lot clearer. To improve all this rewrite the whole code (without changing the logic): - Firstly invert the naming from 'lowmem end' to 'BIOS reserved area start' and propagate this concept through all the variable names and constants. BIOS_RAM_SIZE_KB_PTR // was: BIOS_LOWMEM_KILOBYTES BIOS_START_MIN // was: INSANE_CUTOFF ebda_start // was: ebda_addr bios_start // was: lowmem BIOS_START_MAX // was: LOWMEM_CAP - Then clean up the name of the function itself by renaming it to reserve_bios_regions() and renaming the ::ebda_search paravirt flag to ::reserve_bios_regions. - Fix up all the comments (fix typos), harmonize and simplify their formulation and remove comments that become unnecessary due to the much better naming all around. 
Signed-off-by: Ingo Molnar --- arch/x86/include/asm/bios_ebda.h | 2 +- arch/x86/include/asm/x86_init.h | 4 +- arch/x86/kernel/ebda.c | 124 +++++++++++++++++++++++++------------- arch/x86/kernel/head32.c | 2 +- arch/x86/kernel/head64.c | 2 +- arch/x86/kernel/platform-quirks.c | 4 +- 6 files changed, 88 insertions(+), 50 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h index 2b00c776f223..4b7b8e71607e 100644 --- a/arch/x86/include/asm/bios_ebda.h +++ b/arch/x86/include/asm/bios_ebda.h @@ -17,7 +17,7 @@ static inline unsigned int get_bios_ebda(void) return address; /* 0 means none */ } -void reserve_ebda_region(void); +void reserve_bios_regions(void); #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION /* diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 4dcdf74dfed8..c519c052700a 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -168,14 +168,14 @@ struct x86_legacy_devices { * struct x86_legacy_features - legacy x86 features * * @rtc: this device has a CMOS real-time clock present - * @ebda_search: it's safe to search for the EBDA signature in the hardware's + * @reserve_bios_regions: it's safe to search for the EBDA signature in the hardware's * low RAM * @devices: legacy x86 devices, refer to struct x86_legacy_devices * documentation for further details. */ struct x86_legacy_features { int rtc; - int ebda_search; + int reserve_bios_regions; struct x86_legacy_devices devices; }; diff --git a/arch/x86/kernel/ebda.c b/arch/x86/kernel/ebda.c index afe65dffee80..6219eef20e2e 100644 --- a/arch/x86/kernel/ebda.c +++ b/arch/x86/kernel/ebda.c @@ -6,66 +6,104 @@ #include /* + * This function reserves all conventional PC system BIOS related + * firmware memory areas (some of which are data, some of which + * are code), that must not be used by the kernel as available + * RAM. + * * The BIOS places the EBDA/XBDA at the top of conventional * memory, and usually decreases the reported amount of - * conventional memory (int 0x12) too. This also contains a - * workaround for Dell systems that neglect to reserve EBDA. - * The same workaround also avoids a problem with the AMD768MPX - * chipset: reserve a page before VGA to prevent PCI prefetch - * into it (errata #56). Usually the page is reserved anyways, - * unless you have no PS/2 mouse plugged in. + * conventional memory (int 0x12) too. + * + * This means that as a first approximation on most systems we can + * guess the reserved BIOS area by looking at the low BIOS RAM size + * value and assume that everything above that value (up to 1MB) is + * reserved. + * + * But life in firmware country is not that simple: + * + * - This code also contains a quirk for Dell systems that neglect + * to reserve the EBDA area in the 'RAM size' value ... + * + * - The same quirk also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). (Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in.) + * + * - Plus paravirt systems don't have a reliable value in the + * 'BIOS RAM size' pointer we can rely on, so we must quirk + * them too. + * + * Due to those various problems this function is deliberately + * very conservative and tries to err on the side of reserving + * too much, to not risk reserving too little. 
+ * + * Losing a small amount of memory in the bottom megabyte is + * rarely a problem, as long as we have enough memory to install + * the SMP bootup trampoline which *must* be in this area. * - * This functions is deliberately very conservative. Losing - * memory in the bottom megabyte is rarely a problem, as long - * as we have enough memory to install the trampoline. Using - * memory that is in use by the BIOS or by some DMA device - * the BIOS didn't shut down *is* a big problem. + * Using memory that is in use by the BIOS or by some DMA device + * the BIOS didn't shut down *is* a big problem to the kernel, + * obviously. */ -#define BIOS_LOWMEM_KILOBYTES 0x413 -#define LOWMEM_CAP 0x9f000U /* Absolute maximum */ -#define INSANE_CUTOFF 0x20000U /* Less than this = insane */ +#define BIOS_RAM_SIZE_KB_PTR 0x413 -void __init reserve_ebda_region(void) +#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ +#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ + +void __init reserve_bios_regions(void) { - unsigned int lowmem, ebda_addr; + unsigned int bios_start, ebda_start; /* - * To determine the position of the EBDA and the - * end of conventional memory, we need to look at - * the BIOS data area. In a paravirtual environment - * that area is absent. We'll just have to assume - * that the paravirt case can handle memory setup - * correctly, without our help. + * NOTE: In a paravirtual environment the BIOS reserved + * area is absent. We'll just have to assume that the + * paravirt case can handle memory setup correctly, + * without our help. */ - if (!x86_platform.legacy.ebda_search) + if (!x86_platform.legacy.reserve_bios_regions) return; - /* end of low (conventional) memory */ - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); - lowmem <<= 10; - - /* start of EBDA area */ - ebda_addr = get_bios_ebda(); + /* Get the start address of the EBDA page: */ + ebda_start = get_bios_ebda(); /* - * Note: some old Dells seem to need 4k EBDA without - * reporting so, so just consider the memory above 0x9f000 - * to be off limits (bugzilla 2990). + * Quirk: some old Dells seem to have a 4k EBDA without + * reporting so in their BIOS RAM size value, so just + * consider the memory above 640K to be off limits + * (bugzilla 2990). 
+ * + * We detect this case by filtering for nonsensical EBDA + * addresses below 128K, where we can assume that they + * are bogus and bump it up to a fixed 640K value: */ + if (ebda_start < BIOS_START_MIN) + ebda_start = BIOS_START_MAX; - /* If the EBDA address is below 128K, assume it is bogus */ - if (ebda_addr < INSANE_CUTOFF) - ebda_addr = LOWMEM_CAP; + /* + * BIOS RAM size is encoded in kilobytes, convert it + * to bytes to get a first guess at where the BIOS + * firmware area starts: + */ + bios_start = *(unsigned short *)__va(BIOS_RAM_SIZE_KB_PTR); + bios_start <<= 10; - /* If lowmem is less than 128K, assume it is bogus */ - if (lowmem < INSANE_CUTOFF) - lowmem = LOWMEM_CAP; + /* + * If bios_start is less than 128K, assume it is bogus + * and bump it up to 640K: + */ + if (bios_start < BIOS_START_MIN) + bios_start = BIOS_START_MAX; - /* Use the lower of the lowmem and EBDA markers as the cutoff */ - lowmem = min(lowmem, ebda_addr); - lowmem = min(lowmem, LOWMEM_CAP); /* Absolute cap */ + /* + * Use the lower of the bios_start and ebda_start + * as the starting point, but don't allow it to + * go beyond 640K: + */ + bios_start = min(bios_start, ebda_start); + bios_start = min(bios_start, BIOS_START_MAX); - /* reserve all memory between lowmem and the 1MB mark */ - memblock_reserve(lowmem, 0x100000 - lowmem); + /* Reserve all memory between bios_start and the 1MB mark: */ + memblock_reserve(bios_start, 0x100000 - bios_start); } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index d784bb547a9d..2dda0bc4576e 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -26,7 +26,7 @@ static void __init i386_default_early_setup(void) x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; - reserve_ebda_region(); + reserve_bios_regions(); } asmlinkage __visible void __init i386_start_kernel(void) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index b72fb0b71dd1..99d48e7d2974 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -183,7 +183,7 @@ void __init x86_64_start_reservations(char *real_mode_data) copy_bootdata(__va(real_mode_data)); x86_early_init_platform_quirks(); - reserve_ebda_region(); + reserve_bios_regions(); switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_INTEL_MID: diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index b2f8a33b36ff..24a50301f150 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -7,12 +7,12 @@ void __init x86_early_init_platform_quirks(void) { x86_platform.legacy.rtc = 1; - x86_platform.legacy.ebda_search = 0; + x86_platform.legacy.reserve_bios_regions = 0; x86_platform.legacy.devices.pnpbios = 1; switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_PC: - x86_platform.legacy.ebda_search = 1; + x86_platform.legacy.reserve_bios_regions = 1; break; case X86_SUBARCH_XEN: case X86_SUBARCH_LGUEST: -- cgit v1.2.3 From ec3ed4a2104b8d1ab8da2db5b1221b2ba8a7a6e1 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 20 Jul 2016 12:45:51 -0700 Subject: x86/fpu: Do not BUG_ON() in early FPU code I don't think it is really possible to have a system where CPUID enumerates support for XSAVE but that it does not have FP/SSE (they are "legacy" features and always present). But, I did manage to hit this case in qemu when I enabled its somewhat shaky XSAVE support. 
From ec3ed4a2104b8d1ab8da2db5b1221b2ba8a7a6e1 Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Wed, 20 Jul 2016 12:45:51 -0700
Subject: x86/fpu: Do not BUG_ON() in early FPU code

I don't think it is really possible to have a system where CPUID enumerates support for XSAVE but does not have FP/SSE (they are "legacy" features and always present). But I did manage to hit this case in qemu when I enabled its somewhat shaky XSAVE support.

The bummer is that the FPU is set up before we parse the command-line or have *any* console support, including earlyprintk. That turned what should have been an easy thing to debug into a bit more of an odyssey.

So a BUG() here is worthless. All it does is guarantee that if/when we hit this case we have an empty console. So, remove the BUG() and try to limp along by disabling XSAVE and continuing the boot. Add a comment on why we are doing this, and also add a common "out_disable" path for leaving fpu__init_system_xstate().

Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160720194551.63BB2B58@viggo.jf.intel.com Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/fpu/xstate.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 3169bcaf9391..680049aa4593 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -714,8 +714,13 @@ void __init fpu__init_system_xstate(void)
 	xfeatures_mask = eax + ((u64)edx << 32);
 
 	if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+		/*
+		 * This indicates that something really unexpected happened
+		 * with the enumeration. Disable XSAVE and try to continue
+		 * booting without it. This is too early to BUG().
+		 */
 		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
-		BUG();
+		goto out_disable;
 	}
 
 	xfeatures_mask &= fpu__get_supported_xfeatures_mask();
@@ -723,11 +728,8 @@ void __init fpu__init_system_xstate(void)
 	/* Enable xstate instructions to be able to continue with initialization: */
 	fpu__init_cpu_xstate();
 	err = init_xstate_size();
-	if (err) {
-		/* something went wrong, boot without any XSAVE support */
-		fpu__init_disable_system_xstate();
-		return;
-	}
+	if (err)
+		goto out_disable;
 
 	/*
 	 * Update info used for ptrace frames; use standard-format size and no
@@ -744,6 +746,11 @@ void __init fpu__init_system_xstate(void)
 		xfeatures_mask,
 		fpu_kernel_xstate_size,
 		boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
+	return;
+
+out_disable:
+	/* something went wrong, try to boot without any XSAVE support */
+	fpu__init_disable_system_xstate();
 }
 
 /*
-- 
cgit v1.2.3
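
The structural change above is the usual single error-exit label idiom: every failure path jumps to one place that disables the feature instead of BUG()ing or duplicating the cleanup. A toy sketch of that control flow, with an invented feature name and helpers (not the real FPU code), might look like this:

/* Toy sketch of the "one out_disable label" error-handling pattern. */
#include <stdio.h>
#include <stdbool.h>

static bool feature_enabled;

static void disable_feature(void)
{
	feature_enabled = false;
	printf("feature disabled, continuing without it\n");
}

static int probe_feature(bool cpu_reports_baseline, int size_err)
{
	if (!cpu_reports_baseline) {
		/* Unexpected enumeration: too early to panic, so bail out gracefully */
		fprintf(stderr, "baseline state missing from enumeration\n");
		goto out_disable;
	}

	if (size_err)
		goto out_disable;

	feature_enabled = true;
	printf("feature enabled\n");
	return 0;

out_disable:
	disable_feature();
	return -1;
}

int main(void)
{
	probe_feature(false, 0);	/* hits the first failure path */
	probe_feature(true, 0);		/* succeeds */
	return 0;
}
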
From 6a79296cb15d947bcb4558011fe066e5d8252b35 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 21 Jul 2016 14:16:52 -0700
Subject: x86/boot: Simplify EBDA-vs-BIOS reservation logic

Both the intent and the effect of reserve_bios_regions() are simple: reserve the range from the apparent BIOS start (suitably filtered) through 1MB and, if the EBDA start address is sensible, extend that reservation downward to cover the EBDA as well.

The code is overcomplicated, though, and contains head-scratchers like:

	if (ebda_start < BIOS_START_MIN)
		ebda_start = BIOS_START_MAX;

That snippet is trying to say "if ebda_start < BIOS_START_MIN, ignore it".

Simplify it: reorder the code so that it makes sense.

This should have no functional effect under any circumstances.

Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Mario Limonciello Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/ef89c0c761be20ead8bd9a3275743e6259b6092a.1469135598.git.luto@kernel.org Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/ebda.c | 34 +++++++++++-----------------------
 1 file changed, 11 insertions(+), 23 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ebda.c b/arch/x86/kernel/ebda.c
index 6219eef20e2e..4312f8ae71b7 100644
--- a/arch/x86/kernel/ebda.c
+++ b/arch/x86/kernel/ebda.c
@@ -65,22 +65,6 @@ void __init reserve_bios_regions(void)
 	if (!x86_platform.legacy.reserve_bios_regions)
 		return;
 
-	/* Get the start address of the EBDA page: */
-	ebda_start = get_bios_ebda();
-
-	/*
-	 * Quirk: some old Dells seem to have a 4k EBDA without
-	 * reporting so in their BIOS RAM size value, so just
-	 * consider the memory above 640K to be off limits
-	 * (bugzilla 2990).
-	 *
-	 * We detect this case by filtering for nonsensical EBDA
-	 * addresses below 128K, where we can assume that they
-	 * are bogus and bump it up to a fixed 640K value:
-	 */
-	if (ebda_start < BIOS_START_MIN)
-		ebda_start = BIOS_START_MAX;
-
 	/*
 	 * BIOS RAM size is encoded in kilobytes, convert it
 	 * to bytes to get a first guess at where the BIOS
@@ -91,18 +75,22 @@ void __init reserve_bios_regions(void)
 
 	/*
 	 * If bios_start is less than 128K, assume it is bogus
-	 * and bump it up to 640K:
+	 * and bump it up to 640K. Similarly, if bios_start is above 640K,
+	 * don't trust it.
 	 */
-	if (bios_start < BIOS_START_MIN)
+	if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
 		bios_start = BIOS_START_MAX;
 
+	/* Get the start address of the EBDA page: */
+	ebda_start = get_bios_ebda();
+
 	/*
-	 * Use the lower of the bios_start and ebda_start
-	 * as the starting point, but don't allow it to
-	 * go beyond 640K:
+	 * If the EBDA start address is sane and is below the BIOS region,
+	 * then also reserve everything from the EBDA start address up to
+	 * the BIOS region.
 	 */
-	bios_start = min(bios_start, ebda_start);
-	bios_start = min(bios_start, BIOS_START_MAX);
+	if (ebda_start >= BIOS_START_MIN && ebda_start < bios_start)
+		bios_start = ebda_start;
 
 	/* Reserve all memory between bios_start and the 1MB mark: */
 	memblock_reserve(bios_start, 0x100000 - bios_start);
-- 
cgit v1.2.3
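
To see why this reordering is expected to have no functional effect, a small stand-alone sketch can run a few sample BIOS/EBDA values through both the old min()-based logic and the reordered logic. The values below are made up for illustration, and this is not the kernel code itself:

/* Compare the pre- and post-patch filtering of bios_start/ebda_start. */
#include <stdio.h>

#define BIOS_START_MIN	0x20000U	/* 128K */
#define BIOS_START_MAX	0x9f000U	/* 640K */

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

/* Filtering logic before this patch */
static unsigned int old_logic(unsigned int bios_start, unsigned int ebda_start)
{
	if (ebda_start < BIOS_START_MIN)
		ebda_start = BIOS_START_MAX;
	if (bios_start < BIOS_START_MIN)
		bios_start = BIOS_START_MAX;
	bios_start = min_u(bios_start, ebda_start);
	return min_u(bios_start, BIOS_START_MAX);
}

/* Filtering logic after this patch */
static unsigned int new_logic(unsigned int bios_start, unsigned int ebda_start)
{
	if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
		bios_start = BIOS_START_MAX;
	if (ebda_start >= BIOS_START_MIN && ebda_start < bios_start)
		bios_start = ebda_start;
	return bios_start;
}

int main(void)
{
	unsigned int samples[][2] = {
		{ 0x9f000, 0x9fc00 },	/* typical: 640K lowmem, EBDA just above it */
		{ 0x9e000, 0x9d800 },	/* EBDA below the reported BIOS start */
		{ 0x00400, 0x00000 },	/* both values bogus (below 128K) */
		{ 0xa0000, 0x98000 },	/* lowmem reported above the 640K cap */
	};

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		unsigned int b = samples[i][0], e = samples[i][1];

		printf("bios=0x%05x ebda=0x%05x -> old=0x%05x new=0x%05x\n",
		       b, e, old_logic(b, e), new_logic(b, e));
	}
	return 0;
}

For these samples both versions pick the same reservation start, which matches the commit's claim of no functional change.
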