From a930dc4543a2b213deb9fde12682716edff8a4a6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 18 Jan 2015 17:48:18 +0100 Subject: x86/asm: Cleanup prefetch primitives This is based on a patch originally by hpa. With the current improvements to the alternatives, we can simply use %P1 as a mem8 operand constraint and rely on the toolchain to generate the proper instruction sizes. For example, on 32-bit, where we use an empty old instruction we get: apply_alternatives: feat: 6*32+8, old: (c104648b, len: 4), repl: (c195566c, len: 4) c104648b: alt_insn: 90 90 90 90 c195566c: rpl_insn: 0f 0d 4b 5c ... apply_alternatives: feat: 6*32+8, old: (c18e09b4, len: 3), repl: (c1955948, len: 3) c18e09b4: alt_insn: 90 90 90 c1955948: rpl_insn: 0f 0d 08 ... apply_alternatives: feat: 6*32+8, old: (c1190cf9, len: 7), repl: (c1955a79, len: 7) c1190cf9: alt_insn: 90 90 90 90 90 90 90 c1955a79: rpl_insn: 0f 0d 0d a0 d4 85 c1 all with the proper padding done depending on the size of the replacement instruction the compiler generates. Signed-off-by: Borislav Petkov Cc: H. 
Peter Anvin --- arch/x86/include/asm/processor.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ec1c93588cef..7be2c9a6caba 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -761,10 +761,10 @@ extern char ignore_fpu_irq; #define ARCH_HAS_SPINLOCK_PREFETCH #ifdef CONFIG_X86_32 -# define BASE_PREFETCH ASM_NOP4 +# define BASE_PREFETCH "" # define ARCH_HAS_PREFETCH #else -# define BASE_PREFETCH "prefetcht0 (%1)" +# define BASE_PREFETCH "prefetcht0 %P1" #endif /* @@ -775,10 +775,9 @@ extern char ignore_fpu_irq; */ static inline void prefetch(const void *x) { - alternative_input(BASE_PREFETCH, - "prefetchnta (%1)", + alternative_input(BASE_PREFETCH, "prefetchnta %P1", X86_FEATURE_XMM, - "r" (x)); + "m" (*(const char *)x)); } /* @@ -788,10 +787,9 @@ static inline void prefetch(const void *x) */ static inline void prefetchw(const void *x) { - alternative_input(BASE_PREFETCH, - "prefetchw (%1)", - X86_FEATURE_3DNOW, - "r" (x)); + alternative_input(BASE_PREFETCH, "prefetchw %P1", + X86_FEATURE_3DNOWPREFETCH, + "m" (*(const char *)x)); } static inline void spin_lock_prefetch(const void *x) -- cgit v1.2.3 From cbc82b17263877ea5d21e84c58ce03f0292458a1 Mon Sep 17 00:00:00 2001 From: Peter P Waskiewicz Jr Date: Fri, 23 Jan 2015 18:45:43 +0000 Subject: x86: Add support for Intel Cache QoS Monitoring (CQM) detection This patch adds support for the new Cache QoS Monitoring (CQM) feature found in future Intel Xeon processors. It includes the new values to track CQM resources to the cpuinfo_x86 structure, plus the CPUID detection routines for CQM. CQM allows a process, or set of processes, to be tracked by the CPU to determine the cache usage of that task group. 
Using this data from the CPU, software can be written to extract this data and report cache usage and occupancy for a particular process, or group of processes. More information about Cache QoS Monitoring can be found in the Intel (R) x86 Architecture Software Developer Manual, section 17.14. Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Chris Webb Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Igor Mammedov Cc: Jacob Shin Cc: Jan Beulich Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Steven Honeyman Cc: Steven Rostedt Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/1422038748-21397-5-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 9 ++++++++- arch/x86/include/asm/processor.h | 3 +++ arch/x86/kernel/cpu/common.c | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 90a54851aedc..361922dcc9b1 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -12,7 +12,7 @@ #include #endif -#define NCAPINTS 11 /* N 32-bit words worth of info */ +#define NCAPINTS 13 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -226,6 +226,7 @@ #define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ #define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ @@ -242,6 +243,12 @@ #define 
X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ +#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ +#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ + /* * BUG word(s) */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ec1c93588cef..a12d50e04d7a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -109,6 +109,9 @@ struct cpuinfo_x86 { /* in KB - valid for CPUS which support this call: */ int x86_cache_size; int x86_cache_alignment; /* In bytes */ + /* Cache QoS architectural values: */ + int x86_cache_max_rmid; /* max index */ + int x86_cache_occ_scale; /* scale to bytes */ int x86_power; unsigned long loops_per_jiffy; /* cpuid returned max cores value: */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 07f2fc3c13a4..9fa00b2ea0ee 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -645,6 +645,30 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[10] = eax; } + /* Additional Intel-defined flags: level 0x0000000F */ + if (c->cpuid_level >= 0x0000000F) { + u32 eax, ebx, ecx, edx; + + /* QoS sub-leaf, EAX=0Fh, ECX=0 */ + cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx); + c->x86_capability[11] = edx; + if (cpu_has(c, X86_FEATURE_CQM_LLC)) { + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = ebx; + + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); + c->x86_capability[12] = edx; + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) { + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; + } + } else { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + } + } + /* AMD-defined flags: 
level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; @@ -833,6 +857,20 @@ static void generic_identify(struct cpuinfo_x86 *c) detect_nopl(c); } +static void x86_init_cache_qos(struct cpuinfo_x86 *c) +{ + /* + * The heavy lifting of max_rmid and cache_occ_scale are handled + * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu + * in case CQM bits really aren't there in this CPU. + */ + if (c != &boot_cpu_data) { + boot_cpu_data.x86_cache_max_rmid = + min(boot_cpu_data.x86_cache_max_rmid, + c->x86_cache_max_rmid); + } +} + /* * This does the hard work of actually picking apart the CPU stuff... */ @@ -922,6 +960,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) init_hypervisor(c); x86_init_rdrand(c); + x86_init_cache_qos(c); /* * Clear/Set all flags overriden by options, need do it -- cgit v1.2.3 From 8ef46a672a7d852709561d10672b6eaa8a4acd82 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 5 Mar 2015 19:19:02 -0800 Subject: x86/asm/entry: Add this_cpu_sp0() to read sp0 for the current cpu We currently store references to the top of the kernel stack in multiple places: kernel_stack (with an offset) and init_tss.x86_tss.sp0 (no offset). The latter is defined by hardware and is a clean canonical way to find the top of the stack. Add an accessor so we can start using it. This needs minor paravirt tweaks. On native, sp0 defines the top of the kernel stack and is therefore always correct. On Xen and lguest, the hypervisor tracks the top of the stack, but we want to start reading sp0 in the kernel. Fixing this is simple: just update our local copy of sp0 as well as the hypervisor's copy on task switches. Signed-off-by: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Rusty Russell Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/8d675581859712bee09a055ed8f785d80dac1eca.1425611534.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 5 +++++ arch/x86/kernel/process.c | 1 + arch/x86/lguest/boot.c | 1 + arch/x86/xen/enlighten.c | 1 + 4 files changed, 8 insertions(+) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7be2c9a6caba..71c3a826a690 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -564,6 +564,11 @@ static inline void native_swapgs(void) #endif } +static inline unsigned long this_cpu_sp0(void) +{ + return this_cpu_read_stable(init_tss.x86_tss.sp0); +} + #ifdef CONFIG_PARAVIRT #include #else diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 046e2d620bbe..ff5c9088b1c5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -38,6 +38,7 @@ * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; +EXPORT_PER_CPU_SYMBOL_GPL(init_tss); #ifdef CONFIG_X86_64 static DEFINE_PER_CPU(unsigned char, is_idle); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index ac4453d8520e..8561585ee2c6 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1076,6 +1076,7 @@ static void lguest_load_sp0(struct tss_struct *tss, { lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, THREAD_SIZE / PAGE_SIZE); + tss->x86_tss.sp0 = thread->sp0; } /* Let's just say, I wouldn't do debugging under a Guest. 
*/ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5240f563076d..81665c9f2132 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -912,6 +912,7 @@ static void xen_load_sp0(struct tss_struct *tss, mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); xen_mc_issue(PARAVIRT_LAZY_CPU); + tss->x86_tss.sp0 = thread->sp0; } static void xen_set_iopl_mask(unsigned mask) -- cgit v1.2.3 From 24933b82c0d9a711475a5ef7904eb733f561e637 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 5 Mar 2015 19:19:05 -0800 Subject: x86/asm/entry: Rename 'init_tss' to 'cpu_tss' It has nothing to do with init -- there's only one TSS per cpu. Other names considered include: - current_tss: Confusing because we never switch the tss. - singleton_tss: Too long. This patch was generated with 's/init_tss/cpu_tss/g'. Followup patches will fix INIT_TSS and INIT_TSS_IST by hand. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/da29fb2a793e4f649d93ce2d1ed320ebe8516262.1425611534.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/include/asm/processor.h | 4 ++-- arch/x86/kernel/cpu/common.c | 6 +++--- arch/x86/kernel/entry_64.S | 2 +- arch/x86/kernel/ioport.c | 2 +- arch/x86/kernel/process.c | 6 +++--- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/vm86_32.c | 4 ++-- arch/x86/power/cpu.c | 2 +- 10 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 719db63b35c4..ad9efef65a6b 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -113,7 +113,7 @@ ENTRY(ia32_sysenter_target) CFI_DEF_CFA rsp,0 CFI_REGISTER rsp,rbp SWAPGS_UNSAFE_STACK - movq PER_CPU_VAR(init_tss + TSS_sp0), %rsp + movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp /* * No need to follow this irqs on/off section: the syscall * disabled irqs, here we enable it straight after entry: diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 71c3a826a690..117ee65473e2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -282,7 +282,7 @@ struct tss_struct { } ____cacheline_aligned; -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); +DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); /* * Save the original ist values for checking stack pointers during debugging @@ -566,7 +566,7 @@ static inline void native_swapgs(void) static inline unsigned long this_cpu_sp0(void) { - return this_cpu_read_stable(init_tss.x86_tss.sp0); + return this_cpu_read_stable(cpu_tss.x86_tss.sp0); } #ifdef CONFIG_PARAVIRT diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2346c95c6ab1..5d0f0cc7ea26 100644 --- a/arch/x86/kernel/cpu/common.c 
+++ b/arch/x86/kernel/cpu/common.c @@ -979,7 +979,7 @@ static void syscall32_cpu_init(void) void enable_sep_cpu(void) { int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss, cpu); if (!boot_cpu_has(X86_FEATURE_SEP)) { put_cpu(); @@ -1307,7 +1307,7 @@ void cpu_init(void) */ load_ucode_ap(); - t = &per_cpu(init_tss, cpu); + t = &per_cpu(cpu_tss, cpu); oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA @@ -1391,7 +1391,7 @@ void cpu_init(void) { int cpu = smp_processor_id(); struct task_struct *curr = current; - struct tss_struct *t = &per_cpu(init_tss, cpu); + struct tss_struct *t = &per_cpu(cpu_tss, cpu); struct thread_struct *thread = &curr->thread; wait_for_master_cpu(cpu); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 622ce4254893..0c00fd80249a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -959,7 +959,7 @@ apicinterrupt IRQ_WORK_VECTOR \ /* * Exception entry points. */ -#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) +#define INIT_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 4ddaf66ea35f..37dae792dbbe 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -54,7 +54,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * because the ->io_bitmap_max value must match the bitmap * contents: */ - tss = &per_cpu(init_tss, get_cpu()); + tss = &per_cpu(cpu_tss, get_cpu()); if (turn_on) bitmap_clear(t->io_bitmap_ptr, from, num); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ff5c9088b1c5..6f6087349231 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -37,8 +37,8 @@ * section. 
Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; -EXPORT_PER_CPU_SYMBOL_GPL(init_tss); +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = INIT_TSS; +EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss); #ifdef CONFIG_X86_64 static DEFINE_PER_CPU(unsigned char, is_idle); @@ -110,7 +110,7 @@ void exit_thread(void) unsigned long *bp = t->io_bitmap_ptr; if (bp) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); t->io_bitmap_ptr = NULL; clear_thread_flag(TIF_IO_BITMAP); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 603c4f99cb5a..d3460af3d27a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -248,7 +248,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss, cpu); fpu_switch_t fpu; /* never put a printk in __switch_to... 
printk() calls wake_up*() indirectly */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 854b5981b327..2cd562f96c1f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -277,7 +277,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss, cpu); unsigned fsindex, gsindex; fpu_switch_t fpu; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e8edcf52e069..fc9db6ef2a95 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -150,7 +150,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) do_exit(SIGSEGV); } - tss = &per_cpu(init_tss, get_cpu()); + tss = &per_cpu(cpu_tss, get_cpu()); current->thread.sp0 = current->thread.saved_sp0; current->thread.sysenter_cs = __KERNEL_CS; load_sp0(tss, &current->thread); @@ -318,7 +318,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk tsk->thread.saved_fs = info->regs32->fs; tsk->thread.saved_gs = get_user_gs(info->regs32); - tss = &per_cpu(init_tss, get_cpu()); + tss = &per_cpu(cpu_tss, get_cpu()); tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; if (cpu_has_sep) tsk->thread.sysenter_cs = 0; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 3e32ed5648a0..757678fb26e1 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -134,7 +134,7 @@ static void do_fpu_end(void) static void fix_processor_context(void) { int cpu = smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); + struct tss_struct *t = &per_cpu(cpu_tss, cpu); #ifdef CONFIG_X86_64 struct desc_struct *desc = get_cpu_gdt_table(cpu); tss_desc tss; -- cgit v1.2.3 From d0a0de21f82bbc1737ea3c831f018d0c2bc6b9c2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 5 
Mar 2015 19:19:06 -0800 Subject: x86/asm/entry: Remove INIT_TSS and fold the definitions into 'cpu_tss' The INIT_TSS is unnecessary. Just define the initial TSS where 'cpu_tss' is defined. While we're at it, merge the 32-bit and 64-bit definitions. The only syntactic change is that 32-bit kernels were computing sp0 as long, but now they compute it as unsigned long. Verified by objdump: the contents and relocations of .data..percpu..shared_aligned are unchanged on 32-bit and 64-bit kernels. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/8fc39fa3f6c5d635e93afbdd1a0fe0678a6d7913.1425611534.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 20 -------------------- arch/x86/kernel/process.c | 20 +++++++++++++++++++- 2 files changed, 19 insertions(+), 21 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 117ee65473e2..f5e3ec63767d 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -818,22 +818,6 @@ static inline void spin_lock_prefetch(const void *x) .io_bitmap_ptr = NULL, \ } -/* - * Note that the .io_bitmap member must be extra-big. This is because - * the CPU will access an additional byte beyond the end of the IO - * permission bitmap. The extra byte must be all 1 bits, and must - * be within the limit. - */ -#define INIT_TSS { \ - .x86_tss = { \ - .sp0 = sizeof(init_stack) + (long)&init_stack, \ - .ss0 = __KERNEL_DS, \ - .ss1 = __KERNEL_CS, \ - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ - }, \ - .io_bitmap = { [0 ... 
IO_BITMAP_LONGS] = ~0 }, \ -} - extern unsigned long thread_saved_pc(struct task_struct *tsk); #define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) @@ -892,10 +876,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ } -#define INIT_TSS { \ - .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ -} - /* * Return saved PC of a blocked thread. * What is this good for? it will be always the scheduler or ret_from_fork. diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6f6087349231..f4c0af7fc3a0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -37,7 +37,25 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = INIT_TSS; +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { + .x86_tss = { + .sp0 = (unsigned long)&init_stack + sizeof(init_stack), +#ifdef CONFIG_X86_32 + .ss0 = __KERNEL_DS, + .ss1 = __KERNEL_CS, + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, +#endif + }, +#ifdef CONFIG_X86_32 + /* + * Note that the .io_bitmap member must be extra-big. This is because + * the CPU will access an additional byte beyond the end of the IO + * permission bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, +#endif +}; EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss); #ifdef CONFIG_X86_64 -- cgit v1.2.3 From a7fcf28d431ef70afaa91496e64e16dc51dccec4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 6 Mar 2015 17:50:19 -0800 Subject: x86/asm/entry: Replace this_cpu_sp0() with current_top_of_stack() and fix it on x86_32 I broke 32-bit kernels. The implementation of sp0 was correct as far as I can tell, but sp0 was much weirder on x86_32 than I realized. 
It has the following issues: - Init's sp0 is inconsistent with everything else's: non-init tasks are offset by 8 bytes. (I have no idea why, and the comment is unhelpful.) - vm86 does crazy things to sp0. Fix it up by replacing this_cpu_sp0() with current_top_of_stack() and using a new percpu variable to track the top of the stack on x86_32. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Fixes: 75182b1632a8 ("x86/asm/entry: Switch all C consumers of kernel_stack to this_cpu_sp0()") Link: http://lkml.kernel.org/r/d09dbe270883433776e0cbee3c7079433349e96d.1425692936.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 11 ++++++++++- arch/x86/include/asm/thread_info.h | 4 +--- arch/x86/kernel/cpu/common.c | 13 +++++++++++-- arch/x86/kernel/process_32.c | 11 +++++++---- arch/x86/kernel/smpboot.c | 2 ++ arch/x86/kernel/traps.c | 4 ++-- 6 files changed, 33 insertions(+), 12 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f5e3ec63767d..48a61c1c626e 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -284,6 +284,10 @@ struct tss_struct { DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); +#ifdef CONFIG_X86_32 +DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); +#endif + /* * Save the original ist values for checking stack pointers during debugging */ @@ -564,9 +568,14 @@ static inline void native_swapgs(void) #endif } -static inline unsigned long this_cpu_sp0(void) +static inline unsigned long current_top_of_stack(void) { +#ifdef CONFIG_X86_64 return this_cpu_read_stable(cpu_tss.x86_tss.sp0); +#else + /* sp0 on x86_32 is special in and around vm86 mode. 
*/ + return this_cpu_read_stable(cpu_current_top_of_stack); +#endif } #ifdef CONFIG_PARAVIRT diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a2fa1899494e..7740edd56fed 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -158,9 +158,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack); static inline struct thread_info *current_thread_info(void) { - struct thread_info *ti; - ti = (void *)(this_cpu_sp0() - THREAD_SIZE); - return ti; + return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); } static inline unsigned long current_stack_pointer(void) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5d0f0cc7ea26..76348334b934 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1130,8 +1130,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE) __visible; /* - * The following four percpu variables are hot. Align current_task to - * cacheline size such that all four fall in the same cacheline. + * The following percpu variables are hot. Align current_task to + * cacheline size such that they fall in the same cacheline. */ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; @@ -1226,6 +1226,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +/* + * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find + * the top of the kernel stack. Use an extra percpu variable to track the + * top of the kernel stack directly. 
+ */ +DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = + (unsigned long)&init_thread_union + THREAD_SIZE; +EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); + #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0405cab6634d..1b9963faf4eb 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -306,13 +306,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) arch_end_context_switch(next_p); /* - * Reload esp0. This changes current_thread_info(). + * Reload esp0, kernel_stack, and current_top_of_stack. This changes + * current_thread_info(). */ load_sp0(tss, next); - this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + - THREAD_SIZE - KERNEL_STACK_OFFSET); + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE - KERNEL_STACK_OFFSET); + this_cpu_write(cpu_current_top_of_stack, + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE); /* * Restore %gs if needed (which is common) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index febc6aabc72e..759388c538cf 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -806,6 +806,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); + per_cpu(cpu_current_top_of_stack, cpu) = + (unsigned long)task_stack_page(idle) + THREAD_SIZE; #else clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index fa290586ed37..081252c44cde 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -174,8 +174,8 @@ void ist_begin_non_atomic(struct pt_regs *regs) * will catch asm bugs and any attempt to use ist_preempt_enable * from double_fault. 
*/ - BUG_ON((unsigned long)(this_cpu_sp0() - current_stack_pointer()) >= - THREAD_SIZE); + BUG_ON((unsigned long)(current_top_of_stack() - + current_stack_pointer()) >= THREAD_SIZE); preempt_count_sub(HARDIRQ_OFFSET); } -- cgit v1.2.3 From 3ee4298f440c81638cbb5ec06f2497fb7a9a9eb4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 10 Mar 2015 11:05:58 -0700 Subject: x86/asm/entry: Create and use a 'TOP_OF_KERNEL_STACK_PADDING' macro x86_32, unlike x86_64, pads the top of the kernel stack, because the hardware stack frame formats are variable in size. Document this padding and give it a name. This should make no change whatsoever to the compiled kernel image. It also doesn't fix any of the current bugs in this area. Signed-off-by: Andy Lutomirski Acked-by: Denys Vlasenko Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/02bf2f54b8dcb76a62a142b6dfe07d4ef7fc582e.1426009661.git.luto@amacapital.net [ Fixed small details, such as a missed magic constant in entry_32.S pointed out by Denys Vlasenko. 
] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 3 ++- arch/x86/include/asm/thread_info.h | 27 +++++++++++++++++++++++++++ arch/x86/kernel/entry_32.S | 2 +- 3 files changed, 30 insertions(+), 2 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 48a61c1c626e..88d9aa745898 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -849,7 +849,8 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define task_pt_regs(task) \ ({ \ struct pt_regs *__regs__; \ - __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task)) - \ + TOP_OF_KERNEL_STACK_PADDING); \ __regs__ - 1; \ }) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 7740edd56fed..ba115eb6fbcf 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -12,6 +12,33 @@ #include #include +/* + * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we + * reserve at the top of the kernel stack. We do it because of a nasty + * 32-bit corner case. On x86_32, the hardware stack frame is + * variable-length. Except for vm86 mode, struct pt_regs assumes a + * maximum-length frame. If we enter from CPL 0, the top 8 bytes of + * pt_regs don't actually exist. Ordinarily this doesn't matter, but it + * does in at least one case: + * + * If we take an NMI early enough in SYSENTER, then we can end up with + * pt_regs that extends above sp0. On the way out, in the espfix code, + * we can read the saved SS value, but that value will be above sp0. + * Without this offset, that can result in a page fault. (We are + * careful that, in this case, the value we read doesn't matter.) 
+ * + * In vm86 mode, the hardware frame is much longer still, but we neither + * access the extra members from NMI context, nor do we write such a + * frame at sp0 at all. + * + * x86_64 has a fixed-length stack frame. + */ +#ifdef CONFIG_X86_32 +# define TOP_OF_KERNEL_STACK_PADDING 8 +#else +# define TOP_OF_KERNEL_STACK_PADDING 0 +#endif + /* * low level task data that entry.S needs immediate access to * - this struct should fit entirely inside of one cache line diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index e33ba51b1069..4c8cc34e6d68 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -398,7 +398,7 @@ sysenter_past_esp: * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. */ - pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) + pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+TOP_OF_KERNEL_STACK_PADDING+4*4)(%esp) CFI_REL_OFFSET eip, 0 pushl_cfi %eax -- cgit v1.2.3 From d9e05cc5a53246e074dc2b84956252e4bbe392cd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 10 Mar 2015 11:05:59 -0700 Subject: x86/asm/entry: Unify and fix initial thread_struct::sp0 values x86_32 and x86_64 need slightly different thread_struct::sp0 values, and x86_32's was incorrect for init. This never mattered -- the init thread never runs user code, so we never used thread_struct::sp0 for anything. Fix it and mostly unify them. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1b810c1d2e797e27bb4a7708c426101161edd1f6.1426009661.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 7 +++++-- arch/x86/kernel/process.c | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 88d9aa745898..fc6d8d0d8d53 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -811,6 +811,9 @@ static inline void spin_lock_prefetch(const void *x) prefetchw(x); } +#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ + TOP_OF_KERNEL_STACK_PADDING) + #ifdef CONFIG_X86_32 /* * User space process size: 3GB (default). @@ -821,7 +824,7 @@ static inline void spin_lock_prefetch(const void *x) #define STACK_TOP_MAX STACK_TOP #define INIT_THREAD { \ - .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .sp0 = TOP_OF_INIT_STACK, \ .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ @@ -883,7 +886,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ - .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ + .sp0 = TOP_OF_INIT_STACK \ } /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f4c0af7fc3a0..12b1cf606ddf 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -39,7 +39,7 @@ */ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { .x86_tss = { - .sp0 = (unsigned long)&init_stack + sizeof(init_stack), + .sp0 = TOP_OF_INIT_STACK, #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, -- cgit v1.2.3 From 76e4c4908a4904a61aa67ae5eb0b2a7588c4a546 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 10 Mar 2015 11:06:00 -0700 Subject: x86/asm/entry/32: Document our abuse of 
x86_hw_tss::ss1 and x86_hw_tss::sp1 This has confused me for a while. Now that I figured it out, document it. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b7efc1b7364039824776f68e9ddee9ec1500e894.1426009661.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index fc6d8d0d8d53..b26208998b7c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -209,9 +209,24 @@ struct x86_hw_tss { unsigned short back_link, __blh; unsigned long sp0; unsigned short ss0, __ss0h; - unsigned long sp1; - /* ss1 caches MSR_IA32_SYSENTER_CS: */ - unsigned short ss1, __ss1h; + + /* + * We don't use ring 1, so sp1 and ss1 are convenient scratch + * spaces in the same cacheline as sp0. We use them to cache + * some MSR values to avoid unnecessary wrmsr instructions. + * + * We use SYSENTER_ESP to find sp0 and for the NMI emergency + * stack, but we need to context switch it because we do + * horrible things to the kernel stack in vm86 mode. + * + * We use SYSENTER_CS to disable sysenter in vm86 mode to avoid + * corrupting the stack if we went through the sysenter path + * from vm86 mode. + */ + unsigned long sp1; /* MSR_IA32_SYSENTER_ESP */ + unsigned short ss1; /* MSR_IA32_SYSENTER_CS */ + + unsigned short __ss1h; unsigned long sp2; unsigned short ss2, __ss2h; unsigned long __cr3; -- cgit v1.2.3 From 5c39403e004bec75ce0c549541be5479595d6ad0 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 13 Mar 2015 15:09:03 +0100 Subject: x86/asm/entry: Simplify task_pt_regs() macro definition Before this change, task_pt_regs() was using KSTK_TOP(), and it was the only use of that macro. 
In turn, KSTK_TOP used THREAD_SIZE_LONGS, and it was the only use of that macro too. Fold these macros into task_pt_regs(). Tweak comment about "- 8" - we now use a symbolic constant, not literal 8. Signed-off-by: Denys Vlasenko Reviewed-by: Steven Rostedt Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1426255743-5394-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b26208998b7c..6a5c0ec5ee0e 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -847,15 +847,8 @@ static inline void spin_lock_prefetch(const void *x) extern unsigned long thread_saved_pc(struct task_struct *tsk); -#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) -#define KSTK_TOP(info) \ -({ \ - unsigned long *__ptr = (unsigned long *)(info); \ - (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ -}) - /* - * The below -8 is to reserve 8 bytes on top of the ring0 stack. + * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. * This is necessary to guarantee that the entire "struct pt_regs" * is accessible even if the CPU haven't stored the SS/ESP registers * on the stack (interrupt gate does not save these registers @@ -864,12 +857,11 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); * "struct pt_regs" is possible, but they may contain the * completely wrong values. 
*/ -#define task_pt_regs(task) \ -({ \ - struct pt_regs *__regs__; \ - __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task)) - \ - TOP_OF_KERNEL_STACK_PADDING); \ - __regs__ - 1; \ +#define task_pt_regs(task) \ +({ \ + unsigned long __ptr = (unsigned long)task_stack_page(task); \ + __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ + ((struct pt_regs *)__ptr) - 1; \ }) #define KSTK_ESP(task) (task_pt_regs(task)->sp) -- cgit v1.2.3 From d828c71fba8922b116b4ec56c3e5bca8c822d5ae Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 9 Mar 2015 15:52:18 +0100 Subject: x86/asm/entry/32: Document the 32-bit SYSENTER "emergency stack" better Before the patch, the 'tss_struct::stack' field was not referenced anywhere. It was used only to set SYSENTER's stack to point after the last byte of tss_struct, thus the trailing field, stack[64], was used. But grep would not know it. You can comment it out, compile, and the kernel will even run until an unlucky NMI corrupts io_bitmap[] (which is also not easily detectable). This patch changes the code so that the purpose and usage of this field are not mysterious anymore, and can be easily grepped for. This does change generated code, for a subtle reason: since tss_struct is ____cacheline_aligned, there happen to be 5 longs of padding at the end. Old code was using the padding too; new code will strictly use it only for SYSENTER_stack[]. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1425912738-559-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 4 ++-- arch/x86/kernel/asm-offsets_32.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6a5c0ec5ee0e..5abd9a535a24 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -291,9 +291,9 @@ struct tss_struct { unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; /* - * .. and then another 0x100 bytes for the emergency kernel stack: + * Space for the temporary SYSENTER stack: */ - unsigned long stack[64]; + unsigned long SYSENTER_stack[64]; } ____cacheline_aligned; diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 3b3b9d33ac1d..47703aed74cf 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -68,7 +68,7 @@ void foo(void) /* Offset from the sysenter stack to tss.sp0 */ DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - - sizeof(struct tss_struct)); + offsetofend(struct tss_struct, SYSENTER_stack)); #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 76348334b934..7a3dfb1db78d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -987,7 +987,7 @@ void enable_sep_cpu(void) } tss->x86_tss.ss1 = __KERNEL_CS; - tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; + tss->x86_tss.sp1 = (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack); wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); wrmsr(MSR_IA32_SYSENTER_EIP, 
(unsigned long) ia32_sysenter_target, 0); -- cgit v1.2.3 From 9854dd74c3f6af8d9d527de86c6074b7ed0495f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Mar 2015 14:42:59 +0100 Subject: x86/asm/entry/64: Simplify 'old_rsp' usage Remove all manipulations of PER_CPU(old_rsp) in C code: - it is not used on SYSRET return anymore, and system entries are atomic, so updating it from the fork and context switch paths is pointless. - Tweak a few related comments as well: we no longer have a "partial stack frame" on entry, ever. Based on (split out of) patch from Denys Vlasenko. Originally-from: Denys Vlasenko Tested-by: Borislav Petkov Acked-by: Borislav Petkov Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1426599779-8010-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 5 ----- arch/x86/kernel/process_64.c | 2 -- 2 files changed, 7 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5abd9a535a24..3ac5092ec113 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -905,11 +905,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) extern unsigned long KSTK_ESP(struct task_struct *task); -/* - * User space RSP while inside the SYSCALL fast path - */ -DECLARE_PER_CPU(unsigned long, old_rsp); - #endif /* CONFIG_X86_64 */ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e8c124a1f885..59696d76a4e3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -238,7 +238,6 @@ start_thread_common(struct pt_regs *regs, unsigned 
long new_ip, current->thread.usersp = new_sp; regs->ip = new_ip; regs->sp = new_sp; - this_cpu_write(old_rsp, new_sp); regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; @@ -399,7 +398,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Switch the PDA and FPU contexts. */ prev->usersp = this_cpu_read(old_rsp); - this_cpu_write(old_rsp, next->usersp); this_cpu_write(current_task, next_p); /* -- cgit v1.2.3 From ac9af4983e77765a642b5a21086bc1fdc55418c4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Mar 2015 14:42:59 +0100 Subject: x86/asm/entry/64: Remove thread_struct::usersp Nothing uses thread_struct::usersp anymore, so remove it. Originally-from: Denys Vlasenko Tested-by: Borislav Petkov Acked-by: Borislav Petkov Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 1 - arch/x86/kernel/process_64.c | 3 --- 2 files changed, 4 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3ac5092ec113..572099710ba2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -493,7 +493,6 @@ struct thread_struct { #ifdef CONFIG_X86_32 unsigned long sysenter_cs; #else - unsigned long usersp; /* Copy from PDA */ unsigned short es; unsigned short ds; unsigned short fsindex; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 59696d76a4e3..14df2be4711f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; childregs = task_pt_regs(p); p->thread.sp = (unsigned long) childregs; - p->thread.usersp = me->thread.usersp; 
set_tsk_thread_flag(p, TIF_FORK); p->thread.io_bitmap_ptr = NULL; @@ -235,7 +234,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); - current->thread.usersp = new_sp; regs->ip = new_ip; regs->sp = new_sp; regs->cs = _cs; @@ -397,7 +395,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. */ - prev->usersp = this_cpu_read(old_rsp); this_cpu_write(current_task, next_p); /* -- cgit v1.2.3 From cf9328cc9989e028fdc64d8c0a7b1b043dc96735 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Apr 2015 12:41:45 -0700 Subject: x86/asm/entry/32: Stop caching MSR_IA32_SYSENTER_ESP in tss.sp1 We write a stack pointer to MSR_IA32_SYSENTER_ESP exactly once, and we unnecessarily cache the value in tss.sp1. We never read the cached value. Remove all of the caching. It serves no purpose. Suggested-by: Denys Vlasenko Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/05a0163eb33ef5208363f0015496855da7cebadd.1428002830.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 22 +++++++++++----------- arch/x86/kernel/cpu/common.c | 9 +++++---- 2 files changed, 16 insertions(+), 15 deletions(-) (limited to 'arch/x86/include/asm/processor.h') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 572099710ba2..d2203b5d9538 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -209,21 +209,21 @@ struct x86_hw_tss { unsigned short back_link, __blh; unsigned long sp0; unsigned short ss0, __ss0h; + unsigned long sp1; /* - * We don't use ring 1, so sp1 and ss1 are convenient scratch - * spaces in the same cacheline as sp0. We use them to cache - * some MSR values to avoid unnecessary wrmsr instructions. 
+ * We don't use ring 1, so ss1 is a convenient scratch space in + * the same cacheline as sp0. We use ss1 to cache the value in + * MSR_IA32_SYSENTER_CS. When we context switch + * MSR_IA32_SYSENTER_CS, we first check if the new value being + * written matches ss1, and, if it's not, then we wrmsr the new + * value and update ss1. * - * We use SYSENTER_ESP to find sp0 and for the NMI emergency - * stack, but we need to context switch it because we do - * horrible things to the kernel stack in vm86 mode. - * - * We use SYSENTER_CS to disable sysenter in vm86 mode to avoid - * corrupting the stack if we went through the sysenter path - * from vm86 mode. + * The only reason we context switch MSR_IA32_SYSENTER_CS is + * that we set it to zero in vm86 tasks to avoid corrupting the + * stack if we were to go through the sysenter path from vm86 + * mode. */ - unsigned long sp1; /* MSR_IA32_SYSENTER_ESP */ unsigned short ss1; /* MSR_IA32_SYSENTER_CS */ unsigned short __ss1h; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 71e4adcb15f1..a383d53bf0ed 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -976,15 +976,16 @@ void enable_sep_cpu(void) goto out; /* - * The struct::SS1 and tss_struct::SP1 fields are not used by the hardware, - * we cache the SYSENTER CS and ESP values there for easy access: + * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- + * see the big comment in struct x86_hw_tss's definition. */ tss->x86_tss.ss1 = __KERNEL_CS; wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); - tss->x86_tss.sp1 = (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack); - wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); + wrmsr(MSR_IA32_SYSENTER_ESP, + (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), + 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0); -- cgit v1.2.3